From 4c7fefd4e3a8eea5a70f3f475615af98eb8b419b Mon Sep 17 00:00:00 2001 From: huangyuxin Date: Thu, 16 Sep 2021 12:16:59 +0000 Subject: [PATCH] add transformed v1.8 model --- .../transv1.8to2.x/data/aishell/aishell.py | 123 +++ .../data/librispeech/librispeech.py | 159 ++++ .../data/noise/chime3_background.py | 139 ++++ .../transv1.8to2.x/data/voxforge/run_data.sh | 16 + .../transv1.8to2.x/data/voxforge/voxforge.py | 234 ++++++ .../transv1.8to2.x/data_utils/__init__.py | 13 + examples/transv1.8to2.x/data_utils/audio.py | 695 +++++++++++++++++ .../data_utils/augmentor/__init__.py | 13 + .../data_utils/augmentor/augmentation.py | 134 ++++ .../data_utils/augmentor/base.py | 43 ++ .../data_utils/augmentor/impulse_response.py | 43 ++ .../data_utils/augmentor/noise_perturb.py | 58 ++ .../online_bayesian_normalization.py | 57 ++ .../data_utils/augmentor/resample.py | 42 + .../data_utils/augmentor/shift_perturb.py | 43 ++ .../data_utils/augmentor/speed_perturb.py | 56 ++ .../data_utils/augmentor/volume_perturb.py | 49 ++ examples/transv1.8to2.x/data_utils/data.py | 380 +++++++++ .../data_utils/featurizer/__init__.py | 13 + .../data_utils/featurizer/audio_featurizer.py | 194 +++++ .../featurizer/speech_featurizer.py | 107 +++ .../data_utils/featurizer/text_featurizer.py | 76 ++ .../transv1.8to2.x/data_utils/normalizer.py | 97 +++ examples/transv1.8to2.x/data_utils/speech.py | 153 ++++ examples/transv1.8to2.x/data_utils/utility.py | 98 +++ .../transv1.8to2.x/deepspeech/__init__.py | 370 +++++++++ .../deepspeech/decoders/README.MD | 3 + .../deepspeech/decoders/__init__.py | 13 + .../decoders/decoders_deprecated.py | 248 ++++++ .../deepspeech/decoders/scorer_deprecated.py | 78 ++ .../deepspeech/decoders/swig_wrapper.py | 134 ++++ .../deepspeech/frontend/__init__.py | 13 + .../deepspeech/frontend/audio.py | 721 ++++++++++++++++++ .../deepspeech/frontend/augmentor/__init__.py | 13 + .../frontend/augmentor/augmentation.py | 218 ++++++ .../deepspeech/frontend/augmentor/base.py | 59 ++ .../frontend/augmentor/impulse_response.py | 50 ++ .../frontend/augmentor/noise_perturb.py | 64 ++ .../online_bayesian_normalization.py | 63 ++ .../deepspeech/frontend/augmentor/resample.py | 48 ++ .../frontend/augmentor/shift_perturb.py | 49 ++ .../frontend/augmentor/spec_augment.py | 256 +++++++ .../frontend/augmentor/speed_perturb.py | 106 +++ .../frontend/augmentor/volume_perturb.py | 55 ++ .../frontend/featurizer/__init__.py | 16 + .../frontend/featurizer/audio_featurizer.py | 363 +++++++++ .../frontend/featurizer/speech_featurizer.py | 153 ++++ .../frontend/featurizer/text_featurizer.py | 202 +++++ .../deepspeech/frontend/normalizer.py | 199 +++++ .../deepspeech/frontend/speech.py | 217 ++++++ .../deepspeech/frontend/utility.py | 289 +++++++ .../transv1.8to2.x/deepspeech/io/__init__.py | 13 + .../transv1.8to2.x/deepspeech/io/batchfy.py | 469 ++++++++++++ .../transv1.8to2.x/deepspeech/io/collator.py | 321 ++++++++ .../deepspeech/io/collator_st.py | 631 +++++++++++++++ .../transv1.8to2.x/deepspeech/io/converter.py | 81 ++ .../deepspeech/io/dataloader.py | 170 +++++ .../transv1.8to2.x/deepspeech/io/dataset.py | 149 ++++ .../transv1.8to2.x/deepspeech/io/reader.py | 410 ++++++++++ .../transv1.8to2.x/deepspeech/io/sampler.py | 251 ++++++ .../transv1.8to2.x/deepspeech/io/utility.py | 87 +++ .../deepspeech/models/__init__.py | 13 + .../deepspeech/models/ds2/__init__.py | 17 + .../deepspeech/models/ds2/conv.py | 165 ++++ .../deepspeech/models/ds2/deepspeech2.py | 313 ++++++++ .../deepspeech/models/ds2/rnn.py | 334 ++++++++ 
.../deepspeech/modules/__init__.py | 13 + .../deepspeech/modules/activation.py | 145 ++++ .../transv1.8to2.x/deepspeech/modules/cmvn.py | 51 ++ .../transv1.8to2.x/deepspeech/modules/crf.py | 370 +++++++++ .../transv1.8to2.x/deepspeech/modules/ctc.py | 274 +++++++ .../deepspeech/modules/decoder.py | 182 +++++ .../deepspeech/modules/decoder_layer.py | 151 ++++ .../deepspeech/modules/encoder.py | 453 +++++++++++ .../transv1.8to2.x/deepspeech/modules/loss.py | 144 ++++ .../transv1.8to2.x/deepspeech/modules/mask.py | 260 +++++++ .../deepspeech/utils/__init__.py | 13 + .../deepspeech/utils/bleu_score.py | 54 ++ .../deepspeech/utils/checkpoint.py | 298 ++++++++ .../deepspeech/utils/ctc_utils.py | 134 ++++ .../deepspeech/utils/dynamic_import.py | 67 ++ .../deepspeech/utils/error_rate.py | 206 +++++ .../deepspeech/utils/layer_tools.py | 88 +++ .../transv1.8to2.x/deepspeech/utils/log.py | 182 +++++ .../deepspeech/utils/mp_tools.py | 30 + .../deepspeech/utils/socket_server.py | 112 +++ .../deepspeech/utils/tensor_utils.py | 180 +++++ .../deepspeech/utils/text_grid.py | 127 +++ .../deepspeech/utils/utility.py | 110 +++ .../example/aishell/run_data.sh | 42 + .../example/aishell/run_infer_golden.sh | 55 ++ .../example/aishell/run_test_golden.sh | 54 ++ .../example/baidu_en8k/run_data.sh | 45 ++ .../example/baidu_en8k/run_infer_golden.sh | 55 ++ .../example/baidu_en8k/run_test_golden.sh | 55 ++ .../example/librispeech/run_data.sh | 45 ++ .../example/librispeech/run_infer_golden.sh | 55 ++ .../example/librispeech/run_test_golden.sh | 55 ++ examples/transv1.8to2.x/infer2x.py | 163 ++++ .../transv1.8to2.x/model_utils/__init__.py | 13 + .../transv1.8to2.x/model_utils/model_check.py | 49 ++ .../models/aishell/download_model.sh | 19 + .../models/baidu_en8k/download_model.sh | 19 + .../models/librispeech/download_model.sh | 19 + .../models/lm/download_lm_ch.sh | 18 + .../models/lm/download_lm_en.sh | 18 + examples/transv1.8to2.x/test2x.py | 169 ++++ examples/transv1.8to2.x/tools/_init_paths.py | 31 + examples/transv1.8to2.x/tools/build_vocab.py | 70 ++ .../transv1.8to2.x/tools/compute_mean_std.py | 64 ++ examples/transv1.8to2.x/utils/__init__.py | 13 + examples/transv1.8to2.x/utils/error_rate.py | 204 +++++ .../utils/tests/test_error_rate.py | 124 +++ examples/transv1.8to2.x/utils/utility.py | 56 ++ examples/transv1.8to2.x/utils/utility.sh | 23 + 115 files changed, 15636 insertions(+) create mode 100644 examples/transv1.8to2.x/data/aishell/aishell.py create mode 100644 examples/transv1.8to2.x/data/librispeech/librispeech.py create mode 100644 examples/transv1.8to2.x/data/noise/chime3_background.py create mode 100644 examples/transv1.8to2.x/data/voxforge/run_data.sh create mode 100644 examples/transv1.8to2.x/data/voxforge/voxforge.py create mode 100644 examples/transv1.8to2.x/data_utils/__init__.py create mode 100644 examples/transv1.8to2.x/data_utils/audio.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/__init__.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/augmentation.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/base.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/impulse_response.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/noise_perturb.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/online_bayesian_normalization.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/resample.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/shift_perturb.py create mode 
100644 examples/transv1.8to2.x/data_utils/augmentor/speed_perturb.py create mode 100644 examples/transv1.8to2.x/data_utils/augmentor/volume_perturb.py create mode 100644 examples/transv1.8to2.x/data_utils/data.py create mode 100644 examples/transv1.8to2.x/data_utils/featurizer/__init__.py create mode 100644 examples/transv1.8to2.x/data_utils/featurizer/audio_featurizer.py create mode 100644 examples/transv1.8to2.x/data_utils/featurizer/speech_featurizer.py create mode 100644 examples/transv1.8to2.x/data_utils/featurizer/text_featurizer.py create mode 100644 examples/transv1.8to2.x/data_utils/normalizer.py create mode 100644 examples/transv1.8to2.x/data_utils/speech.py create mode 100644 examples/transv1.8to2.x/data_utils/utility.py create mode 100644 examples/transv1.8to2.x/deepspeech/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/decoders/README.MD create mode 100644 examples/transv1.8to2.x/deepspeech/decoders/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/decoders/decoders_deprecated.py create mode 100644 examples/transv1.8to2.x/deepspeech/decoders/scorer_deprecated.py create mode 100644 examples/transv1.8to2.x/deepspeech/decoders/swig_wrapper.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/audio.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/augmentation.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/base.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/impulse_response.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/noise_perturb.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/online_bayesian_normalization.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/resample.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/spec_augment.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/speed_perturb.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/augmentor/volume_perturb.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/featurizer/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/featurizer/audio_featurizer.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/featurizer/speech_featurizer.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/featurizer/text_featurizer.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/normalizer.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/speech.py create mode 100644 examples/transv1.8to2.x/deepspeech/frontend/utility.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/batchfy.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/collator.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/collator_st.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/converter.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/dataloader.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/dataset.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/reader.py create mode 100644 examples/transv1.8to2.x/deepspeech/io/sampler.py 
create mode 100644 examples/transv1.8to2.x/deepspeech/io/utility.py create mode 100644 examples/transv1.8to2.x/deepspeech/models/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/models/ds2/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/models/ds2/conv.py create mode 100644 examples/transv1.8to2.x/deepspeech/models/ds2/deepspeech2.py create mode 100644 examples/transv1.8to2.x/deepspeech/models/ds2/rnn.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/activation.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/cmvn.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/crf.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/ctc.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/decoder.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/decoder_layer.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/encoder.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/loss.py create mode 100644 examples/transv1.8to2.x/deepspeech/modules/mask.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/__init__.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/bleu_score.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/checkpoint.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/ctc_utils.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/dynamic_import.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/error_rate.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/layer_tools.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/log.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/mp_tools.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/socket_server.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/tensor_utils.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/text_grid.py create mode 100644 examples/transv1.8to2.x/deepspeech/utils/utility.py create mode 100644 examples/transv1.8to2.x/example/aishell/run_data.sh create mode 100644 examples/transv1.8to2.x/example/aishell/run_infer_golden.sh create mode 100644 examples/transv1.8to2.x/example/aishell/run_test_golden.sh create mode 100644 examples/transv1.8to2.x/example/baidu_en8k/run_data.sh create mode 100644 examples/transv1.8to2.x/example/baidu_en8k/run_infer_golden.sh create mode 100644 examples/transv1.8to2.x/example/baidu_en8k/run_test_golden.sh create mode 100644 examples/transv1.8to2.x/example/librispeech/run_data.sh create mode 100644 examples/transv1.8to2.x/example/librispeech/run_infer_golden.sh create mode 100644 examples/transv1.8to2.x/example/librispeech/run_test_golden.sh create mode 100644 examples/transv1.8to2.x/infer2x.py create mode 100644 examples/transv1.8to2.x/model_utils/__init__.py create mode 100644 examples/transv1.8to2.x/model_utils/model_check.py create mode 100644 examples/transv1.8to2.x/models/aishell/download_model.sh create mode 100644 examples/transv1.8to2.x/models/baidu_en8k/download_model.sh create mode 100644 examples/transv1.8to2.x/models/librispeech/download_model.sh create mode 100644 examples/transv1.8to2.x/models/lm/download_lm_ch.sh create mode 100644 examples/transv1.8to2.x/models/lm/download_lm_en.sh create mode 100644 examples/transv1.8to2.x/test2x.py create mode 100644 examples/transv1.8to2.x/tools/_init_paths.py create mode 100644 
examples/transv1.8to2.x/tools/build_vocab.py
 create mode 100644 examples/transv1.8to2.x/tools/compute_mean_std.py
 create mode 100644 examples/transv1.8to2.x/utils/__init__.py
 create mode 100644 examples/transv1.8to2.x/utils/error_rate.py
 create mode 100644 examples/transv1.8to2.x/utils/tests/test_error_rate.py
 create mode 100644 examples/transv1.8to2.x/utils/utility.py
 create mode 100644 examples/transv1.8to2.x/utils/utility.sh

diff --git a/examples/transv1.8to2.x/data/aishell/aishell.py b/examples/transv1.8to2.x/data/aishell/aishell.py
new file mode 100644
index 00000000..348b0460
--- /dev/null
+++ b/examples/transv1.8to2.x/data/aishell/aishell.py
@@ -0,0 +1,123 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare Aishell mandarin dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import json
+import os
+
+import soundfile
+from data_utils.utility import download
+from data_utils.utility import unpack
+
+DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
+
+URL_ROOT = 'http://www.openslr.org/resources/33'
+URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
+DATA_URL = URL_ROOT + '/data_aishell.tgz'
+MD5_DATA = '2f494334227864a8a8fec932999db9d8'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/Aishell",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+    print("Creating manifest %s ..." % manifest_path_prefix)
+    json_lines = []
+    transcript_path = os.path.join(data_dir, 'transcript',
+                                   'aishell_transcript_v0.8.txt')
+    transcript_dict = {}
+    for line in codecs.open(transcript_path, 'r', 'utf-8'):
+        line = line.strip()
+        if line == '':
+            continue
+        audio_id, text = line.split(' ', 1)
+        # remove whitespace
+        text = ''.join(text.split())
+        transcript_dict[audio_id] = text
+
+    data_types = ['train', 'dev', 'test']
+    for type in data_types:
+        del json_lines[:]
+        audio_dir = os.path.join(data_dir, 'wav', type)
+        for subfolder, _, filelist in sorted(os.walk(audio_dir)):
+            for fname in filelist:
+                audio_path = os.path.join(subfolder, fname)
+                audio_id = fname[:-4]
+                # skip audio files that have no transcription
+                if audio_id not in transcript_dict:
+                    continue
+                audio_data, samplerate = soundfile.read(audio_path)
+                duration = float(len(audio_data) / samplerate)
+                text = transcript_dict[audio_id]
+                json_lines.append(
+                    json.dumps(
+                        {
+                            'audio_filepath': audio_path,
+                            'duration': duration,
+                            'text': text
+                        },
+                        ensure_ascii=False))
+        manifest_path = manifest_path_prefix + '.'
+ type + with codecs.open(manifest_path, 'w', 'utf-8') as fout: + for line in json_lines: + fout.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create manifest file.""" + data_dir = os.path.join(target_dir, 'data_aishell') + if not os.path.exists(data_dir): + filepath = download(url, md5sum, target_dir) + unpack(filepath, target_dir) + # unpack all audio tar files + audio_dir = os.path.join(data_dir, 'wav') + for subfolder, _, filelist in sorted(os.walk(audio_dir)): + for ftar in filelist: + unpack(os.path.join(subfolder, ftar), subfolder, True) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + create_manifest(data_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=DATA_URL, + md5sum=MD5_DATA, + target_dir=args.target_dir, + manifest_path=args.manifest_prefix) + + +if __name__ == '__main__': + main() diff --git a/examples/transv1.8to2.x/data/librispeech/librispeech.py b/examples/transv1.8to2.x/data/librispeech/librispeech.py new file mode 100644 index 00000000..2db37a32 --- /dev/null +++ b/examples/transv1.8to2.x/data/librispeech/librispeech.py @@ -0,0 +1,159 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare Librispeech ASR datasets. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import codecs +import distutils.util +import io +import json +import os + +import soundfile +from data_utils.utility import download +from data_utils.utility import unpack + +URL_ROOT = "http://www.openslr.org/resources/12" +URL_ROOT = "https://openslr.magicdatatech.com/resources/12" +URL_TEST_CLEAN = URL_ROOT + "/test-clean.tar.gz" +URL_TEST_OTHER = URL_ROOT + "/test-other.tar.gz" +URL_DEV_CLEAN = URL_ROOT + "/dev-clean.tar.gz" +URL_DEV_OTHER = URL_ROOT + "/dev-other.tar.gz" +URL_TRAIN_CLEAN_100 = URL_ROOT + "/train-clean-100.tar.gz" +URL_TRAIN_CLEAN_360 = URL_ROOT + "/train-clean-360.tar.gz" +URL_TRAIN_OTHER_500 = URL_ROOT + "/train-other-500.tar.gz" + +MD5_TEST_CLEAN = "32fa31d27d2e1cad72775fee3f4849a9" +MD5_TEST_OTHER = "fb5a50374b501bb3bac4815ee91d3135" +MD5_DEV_CLEAN = "42e2234ba48799c1f50f24a7926300a1" +MD5_DEV_OTHER = "c8d0bcc9cca99d4f8b62fcc847357931" +MD5_TRAIN_CLEAN_100 = "2a93770f6d5c6c964bc36631d331a522" +MD5_TRAIN_CLEAN_360 = "c0e676e450a7ff2f54aeade5171606fa" +MD5_TRAIN_OTHER_500 = "d1a0fd59409feb2c614ce4d30c387708" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default='~/.cache/paddle/dataset/speech/libri', + type=str, + help="Directory to save the dataset. 
(default: %(default)s)") +parser.add_argument( + "--manifest_prefix", + default="manifest", + type=str, + help="Filepath prefix for output manifests. (default: %(default)s)") +parser.add_argument( + "--full_download", + default="True", + type=distutils.util.strtobool, + help="Download all datasets for Librispeech." + " If False, only download a minimal requirement (test-clean, dev-clean" + " train-clean-100). (default: %(default)s)") +args = parser.parse_args() + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + text_filelist = [ + filename for filename in filelist if filename.endswith('trans.txt') + ] + if len(text_filelist) > 0: + text_filepath = os.path.join(subfolder, text_filelist[0]) + for line in io.open(text_filepath, encoding="utf8"): + segments = line.strip().split() + text = ' '.join(segments[1:]).lower() + audio_filepath = os.path.join(subfolder, segments[0] + '.flac') + audio_data, samplerate = soundfile.read(audio_filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': audio_filepath, + 'duration': duration, + 'text': text + })) + with codecs.open(manifest_path, 'w', 'utf-8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_dataset(url, md5sum, target_dir, manifest_path): + """Download, unpack and create summmary manifest file. + """ + if not os.path.exists(os.path.join(target_dir, "LibriSpeech")): + # download + filepath = download(url, md5sum, target_dir) + # unpack + unpack(filepath, target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." 
% + target_dir) + # create manifest json file + create_manifest(target_dir, manifest_path) + + +def main(): + if args.target_dir.startswith('~'): + args.target_dir = os.path.expanduser(args.target_dir) + + prepare_dataset( + url=URL_TEST_CLEAN, + md5sum=MD5_TEST_CLEAN, + target_dir=os.path.join(args.target_dir, "test-clean"), + manifest_path=args.manifest_prefix + ".test-clean") + prepare_dataset( + url=URL_DEV_CLEAN, + md5sum=MD5_DEV_CLEAN, + target_dir=os.path.join(args.target_dir, "dev-clean"), + manifest_path=args.manifest_prefix + ".dev-clean") + if args.full_download: + prepare_dataset( + url=URL_TRAIN_CLEAN_100, + md5sum=MD5_TRAIN_CLEAN_100, + target_dir=os.path.join(args.target_dir, "train-clean-100"), + manifest_path=args.manifest_prefix + ".train-clean-100") + prepare_dataset( + url=URL_TEST_OTHER, + md5sum=MD5_TEST_OTHER, + target_dir=os.path.join(args.target_dir, "test-other"), + manifest_path=args.manifest_prefix + ".test-other") + prepare_dataset( + url=URL_DEV_OTHER, + md5sum=MD5_DEV_OTHER, + target_dir=os.path.join(args.target_dir, "dev-other"), + manifest_path=args.manifest_prefix + ".dev-other") + prepare_dataset( + url=URL_TRAIN_CLEAN_360, + md5sum=MD5_TRAIN_CLEAN_360, + target_dir=os.path.join(args.target_dir, "train-clean-360"), + manifest_path=args.manifest_prefix + ".train-clean-360") + prepare_dataset( + url=URL_TRAIN_OTHER_500, + md5sum=MD5_TRAIN_OTHER_500, + target_dir=os.path.join(args.target_dir, "train-other-500"), + manifest_path=args.manifest_prefix + ".train-other-500") + + +if __name__ == '__main__': + main() diff --git a/examples/transv1.8to2.x/data/noise/chime3_background.py b/examples/transv1.8to2.x/data/noise/chime3_background.py new file mode 100644 index 00000000..78187b1f --- /dev/null +++ b/examples/transv1.8to2.x/data/noise/chime3_background.py @@ -0,0 +1,139 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Prepare CHiME3 background data. + +Download, unpack and create manifest files. +Manifest file is a json-format file with each line containing the +meta data (i.e. audio filepath, transcript and audio duration) +of each audio file in the data set. +""" +import argparse +import io +import json +import os +import zipfile + +import soundfile +import wget +from paddle.v2.dataset.common import md5file + +DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech') + +URL = "https://d4s.myairbridge.com/packagev2/AG0Y3DNBE5IWRRTV/?dlid=W19XG7T0NNHB027139H0EQ" +MD5 = "c3ff512618d7a67d4f85566ea1bc39ec" + +parser = argparse.ArgumentParser(description=__doc__) +parser.add_argument( + "--target_dir", + default=DATA_HOME + "/chime3_background", + type=str, + help="Directory to save the dataset. (default: %(default)s)") +parser.add_argument( + "--manifest_filepath", + default="manifest.chime3.background", + type=str, + help="Filepath for output manifests. 
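Every manifest produced by these preparation scripts is line-delimited JSON with the three fields written above. As a minimal, hypothetical sketch (the manifest filename is a placeholder), downstream code can read one back like this:

import json

def read_manifest(manifest_path):
    # Yield one dict per manifest line: audio_filepath, duration, text.
    with open(manifest_path, encoding='utf8') as f:
        for line in f:
            if line.strip():
                yield json.loads(line)

# e.g. keep only utterances between 1.0 and 20.0 seconds
entries = [e for e in read_manifest('manifest.test-clean')
           if 1.0 <= e['duration'] <= 20.0]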
(default: %(default)s)") +args = parser.parse_args() + + +def download(url, md5sum, target_dir, filename=None): + """Download file from url to target_dir, and check md5sum.""" + if filename is None: + filename = url.split("/")[-1] + if not os.path.exists(target_dir): + os.makedirs(target_dir) + filepath = os.path.join(target_dir, filename) + if not (os.path.exists(filepath) and md5file(filepath) == md5sum): + print("Downloading %s ..." % url) + wget.download(url, target_dir) + print("\nMD5 Chesksum %s ..." % filepath) + if not md5file(filepath) == md5sum: + raise RuntimeError("MD5 checksum failed.") + else: + print("File exists, skip downloading. (%s)" % filepath) + return filepath + + +def unpack(filepath, target_dir): + """Unpack the file to the target_dir.""" + print("Unpacking %s ..." % filepath) + if filepath.endswith('.zip'): + zip = zipfile.ZipFile(filepath, 'r') + zip.extractall(target_dir) + zip.close() + elif filepath.endswith('.tar') or filepath.endswith('.tar.gz'): + tar = zipfile.open(filepath) + tar.extractall(target_dir) + tar.close() + else: + raise ValueError("File format is not supported for unpacking.") + + +def create_manifest(data_dir, manifest_path): + """Create a manifest json file summarizing the data set, with each line + containing the meta data (i.e. audio filepath, transcription text, audio + duration) of each audio file within the data set. + """ + print("Creating manifest %s ..." % manifest_path) + json_lines = [] + for subfolder, _, filelist in sorted(os.walk(data_dir)): + for filename in filelist: + if filename.endswith('.wav'): + filepath = os.path.join(data_dir, subfolder, filename) + audio_data, samplerate = soundfile.read(filepath) + duration = float(len(audio_data)) / samplerate + json_lines.append( + json.dumps({ + 'audio_filepath': filepath, + 'duration': duration, + 'text': '' + })) + with io.open(manifest_path, mode='w', encoding='utf8') as out_file: + for line in json_lines: + out_file.write(line + '\n') + + +def prepare_chime3(url, md5sum, target_dir, manifest_path): + """Download, unpack and create summmary manifest file.""" + if not os.path.exists(os.path.join(target_dir, "CHiME3")): + # download + filepath = download(url, md5sum, target_dir, + "myairbridge-AG0Y3DNBE5IWRRTV.zip") + # unpack + unpack(filepath, target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_bus.zip'), target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_caf.zip'), target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_ped.zip'), target_dir) + unpack( + os.path.join(target_dir, 'CHiME3_background_str.zip'), target_dir) + else: + print("Skip downloading and unpacking. Data already exists in %s." % + target_dir) + # create manifest json file + create_manifest(target_dir, manifest_path) + + +def main(): + prepare_chime3( + url=URL, + md5sum=MD5, + target_dir=args.target_dir, + manifest_path=args.manifest_filepath) + + +if __name__ == '__main__': + main() diff --git a/examples/transv1.8to2.x/data/voxforge/run_data.sh b/examples/transv1.8to2.x/data/voxforge/run_data.sh new file mode 100644 index 00000000..0276744a --- /dev/null +++ b/examples/transv1.8to2.x/data/voxforge/run_data.sh @@ -0,0 +1,16 @@ +#! /usr/bin/env bash + +# download data, generate manifests +PYTHONPATH=../../:$PYTHONPATH python voxforge.py \ +--manifest_prefix='./manifest' \ +--target_dir='./dataset/VoxForge' \ +--is_merge_dialect=True \ +--dialects 'american' 'british' 'australian' 'european' 'irish' 'canadian' 'indian' + +if [ $? 
+    echo "Prepare VoxForge failed. Terminated."
+    exit 1
+fi
+
+echo "VoxForge Data preparation done."
+exit 0
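The voxforge.py script below rebuilds each audio path from the entries in a PROMPTS file and, along the way, normalizes the date embedded in the utterance identifier. A standalone sketch of just that normalization step, with a made-up identifier:

import datetime

u_part = 'anonymous-10.5.12-avx'  # hypothetical speaker-date-suffix triple
speaker, date, sfx = u_part.split('-')
formatted = datetime.datetime.strptime(date, '%y.%m.%d').strftime('%Y%m%d')
print('-'.join([speaker, formatted, sfx]))  # -> anonymous-20100512-avx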
diff --git a/examples/transv1.8to2.x/data/voxforge/voxforge.py b/examples/transv1.8to2.x/data/voxforge/voxforge.py
new file mode 100644
index 00000000..7d90b3c6
--- /dev/null
+++ b/examples/transv1.8to2.x/data/voxforge/voxforge.py
@@ -0,0 +1,234 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare VoxForge dataset
+
+Download, unpack and create manifest files.
+Manifest file is a json-format file with each line containing the
+meta data (i.e. audio filepath, transcript and audio duration)
+of each audio file in the data set.
+"""
+import argparse
+import codecs
+import datetime
+import distutils.util
+import json
+import os
+import shutil
+import subprocess
+
+import soundfile
+from data_utils.utility import download_multi
+from data_utils.utility import getfile_insensitive
+from data_utils.utility import unpack
+
+DATA_HOME = './dataset'
+
+DATA_URL = 'http://www.repository.voxforge1.org/downloads/SpeechCorpus/Trunk/' \
+           'Audio/Main/16kHz_16bit'
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--target_dir",
+    default=DATA_HOME + "/VoxForge",
+    type=str,
+    help="Directory to save the dataset. (default: %(default)s)")
+parser.add_argument(
+    "--dialects",
+    default=[
+        'american', 'british', 'australian', 'european', 'irish', 'canadian',
+        'indian'
+    ],
+    nargs='+',
+    type=str,
+    help="Dialect types. (default: %(default)s)")
+parser.add_argument(
+    "--is_merge_dialect",
+    default=True,
+    # type=bool would treat any non-empty string (even "False") as True
+    type=distutils.util.strtobool,
+    help="If set True, manifests of american dialect and canadian dialect will "
+    "be merged to american-canadian dialect; manifests of british "
+    "dialect, irish dialect and australian dialect will be merged to "
+    "commonwealth dialect. (default: %(default)s)")
+parser.add_argument(
+    "--manifest_prefix",
+    default="manifest",
+    type=str,
+    help="Filepath prefix for output manifests. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def download_and_unpack(target_dir, url):
+    wget_args = '-q -l 1 -N -nd -c -e robots=off -A tgz -r -np'
+    tgz_dir = os.path.join(target_dir, 'tgz')
+    exit_code = download_multi(url, tgz_dir, wget_args)
+    if exit_code != 0:
+        print('Download tgz audio files failed with exit code %d.' % exit_code)
+    else:
+        print('Download done, start unpacking ...')
+        audio_dir = os.path.join(target_dir, 'audio')
+        for root, dirs, files in os.walk(tgz_dir):
+            for file in files:
+                print(file)
+                if file.endswith('.tgz'):
+                    unpack(os.path.join(root, file), audio_dir)
+
+
+def select_dialects(target_dir, dialect_list):
+    """Classify audio files by dialect."""
+    dialect_root_dir = os.path.join(target_dir, 'dialect')
+    if os.path.exists(dialect_root_dir):
+        shutil.rmtree(dialect_root_dir)
+    os.mkdir(dialect_root_dir)
+    audio_dir = os.path.abspath(os.path.join(target_dir, 'audio'))
+    for dialect in dialect_list:
+        # filter files by dialect
+        command = 'find %s -iwholename "*etc/readme*" -exec egrep -iHl \
+            "pronunciation dialect.*%s" {} \;' % (audio_dir, dialect)
+        p = subprocess.Popen(
+            command, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=True)
+        output, err = p.communicate()
+        # Popen returns bytes; decode before treating the output as paths
+        output = output.decode()
+        dialect_dir = os.path.join(dialect_root_dir, dialect)
+        if os.path.exists(dialect_dir):
+            shutil.rmtree(dialect_dir)
+        os.mkdir(dialect_dir)
+        for path in output.splitlines():
+            src_dir = os.path.dirname(os.path.dirname(path))
+            link = os.path.basename(os.path.normpath(src_dir))
+            os.symlink(src_dir, os.path.join(dialect_dir, link))
+
+
+def generate_manifest(data_dir, manifest_path):
+    json_lines = []
+
+    for path in os.listdir(data_dir):
+        audio_link = os.path.join(data_dir, path)
+        assert os.path.islink(
+            audio_link), '%s should be symbolic link.' % audio_link
+        actual_audio_dir = os.path.abspath(os.readlink(audio_link))
+
+        audio_type = ''
+        if os.path.isdir(os.path.join(actual_audio_dir, 'wav')):
+            audio_type = 'wav'
+        elif os.path.isdir(os.path.join(actual_audio_dir, 'flac')):
+            audio_type = 'flac'
+        else:
+            print('Unknown audio type, skipped processing %s.' %
+                  actual_audio_dir)
+            continue
+
+        etc_dir = os.path.join(actual_audio_dir, 'etc')
+        prompts_file = os.path.join(etc_dir, 'PROMPTS')
+        if not os.path.isfile(prompts_file):
+            print('PROMPTS file missing, skip processing %s.' %
+                  actual_audio_dir)
+            continue
+
+        readme_file = getfile_insensitive(os.path.join(etc_dir, 'README'))
+        if readme_file is None:
+            print('README file missing, skip processing %s.' % actual_audio_dir)
+            continue
+
+        # file() is a Python 2 builtin; open() works under Python 3
+        for line in open(prompts_file):
+            u, trans = line.strip().split(None, 1)
+            u_parts = u.split('/')
+
+            # try to format the date time
+            try:
+                speaker, date, sfx = u_parts[-3].split('-')
+                obj = datetime.datetime.strptime(date, '%y.%m.%d')
+                formatted = obj.strftime('%Y%m%d')
+                u_parts[-3] = '-'.join([speaker, formatted, sfx])
+            except Exception as e:
+                pass
+
+            if len(u_parts) < 2:
+                u_parts = [audio_type] + u_parts
+            u_parts[-2] = audio_type
+            u_parts[-1] += '.' + audio_type
+            u = os.path.join(actual_audio_dir, '/'.join(u_parts[-2:]))
+
+            if not os.path.isfile(u):
+                print('Audio file missing, skip processing %s.' % u)
+                continue
+
+            if os.stat(u).st_size == 0:
+                print('Empty audio file, skip processing %s.' % u)
+                continue
+
+            trans = trans.strip().replace('-', ' ')
+            if not trans.isupper() or \
+                    not trans.strip().replace(' ', '').replace("'", "").isalpha():
+                print("Transcript not normalized properly, skip processing %s."
+                      % u)
+                continue
+
+            audio_data, samplerate = soundfile.read(u)
+            duration = float(len(audio_data)) / samplerate
+            json_lines.append(
+                json.dumps({
+                    'audio_filepath': u,
+                    'duration': duration,
+                    'text': trans.lower()
+                }))
+
+    with codecs.open(manifest_path, 'w', 'utf-8') as fout:
+        for line in json_lines:
+            fout.write(line + '\n')
+
+
+def merge_manifests(manifest_files, save_path):
+    lines = []
+    for manifest_file in manifest_files:
+        line = codecs.open(manifest_file, 'r', 'utf-8').readlines()
+        lines += line
+
+    with codecs.open(save_path, 'w', 'utf-8') as fout:
+        for line in lines:
+            fout.write(line)
+
+
+def prepare_dataset(url, dialects, target_dir, manifest_prefix, is_merge):
+    download_and_unpack(target_dir, url)
+    select_dialects(target_dir, dialects)
+    american_canadian_manifests = []
+    commonwealth_manifests = []
+    for dialect in dialects:
+        dialect_dir = os.path.join(target_dir, 'dialect', dialect)
+        manifest_fpath = manifest_prefix + '.' + dialect
+        if dialect == 'american' or dialect == 'canadian':
+            american_canadian_manifests.append(manifest_fpath)
+        if dialect == 'australian' \
+                or dialect == 'british' \
+                or dialect == 'irish':
+            commonwealth_manifests.append(manifest_fpath)
+        generate_manifest(dialect_dir, manifest_fpath)
+
+    if is_merge:
+        if len(american_canadian_manifests) > 0:
+            manifest_fpath = manifest_prefix + '.american-canadian'
+            merge_manifests(american_canadian_manifests, manifest_fpath)
+        if len(commonwealth_manifests) > 0:
+            manifest_fpath = manifest_prefix + '.commonwealth'
+            merge_manifests(commonwealth_manifests, manifest_fpath)
+
+
+def main():
+    if args.target_dir.startswith('~'):
+        args.target_dir = os.path.expanduser(args.target_dir)
+
+    prepare_dataset(DATA_URL, args.dialects, args.target_dir,
+                    args.manifest_prefix, args.is_merge_dialect)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/transv1.8to2.x/data_utils/__init__.py b/examples/transv1.8to2.x/data_utils/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/transv1.8to2.x/data_utils/audio.py b/examples/transv1.8to2.x/data_utils/audio.py
new file mode 100644
index 00000000..f4b5ac07
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/audio.py
@@ -0,0 +1,695 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the audio segment class."""
+import copy
+import io
+import random
+import re
+import struct
+
+import numpy as np
+import resampy
+import soundfile
+from scipy import signal
+
+
+class AudioSegment(object):
+    """Monaural audio segment abstraction.
+
+    :param samples: Audio samples [num_samples x num_channels].
+    :type samples: ndarray.float32
+    :param sample_rate: Audio sample rate.
+    :type sample_rate: int
+    :raises TypeError: If the sample data type is not float or int.
+    """
+
+    def __init__(self, samples, sample_rate):
+        """Create audio segment from samples.
+
+        Samples are converted to float32 internally, with ints scaled to
+        [-1, 1].
+        """
+        self._samples = self._convert_samples_to_float32(samples)
+        self._sample_rate = sample_rate
+        if self._samples.ndim >= 2:
+            self._samples = np.mean(self._samples, 1)
+
+    def __eq__(self, other):
+        """Return whether two objects are equal."""
+        if type(other) is not type(self):
+            return False
+        if self._sample_rate != other._sample_rate:
+            return False
+        if self._samples.shape != other._samples.shape:
+            return False
+        if np.any(self.samples != other._samples):
+            return False
+        return True
+
+    def __ne__(self, other):
+        """Return whether two objects are unequal."""
+        return not self.__eq__(other)
+
+    def __str__(self):
+        """Return human-readable representation of segment."""
+        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
+                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
+                                self.duration, self.rms_db))
+
+    @classmethod
+    def from_file(cls, file):
+        """Create audio segment from audio file.
+
+        :param filepath: Filepath or file object to audio file.
+        :type filepath: str|file
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        """
+        if isinstance(file, str) and re.findall(r".seqbin_\d+$", file):
+            return cls.from_sequence_file(file)
+        else:
+            samples, sample_rate = soundfile.read(file, dtype='float32')
+            return cls(samples, sample_rate)
+
+    @classmethod
+    def slice_from_file(cls, file, start=None, end=None):
+        """Loads a small section of an audio without having to load
+        the entire file into memory, which can be incredibly wasteful.
+
+        :param file: Input audio filepath or file object.
+        :type file: str|file
+        :param start: Start time in seconds. If start is negative, it wraps
+                      around from the end. If not provided, this function
+                      reads from the very beginning.
+        :type start: float
+        :param end: End time in seconds. If end is negative, it wraps around
+                    from the end. If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :return: AudioSegment instance of the specified slice of the input
+                 audio file.
+        :rtype: AudioSegment
+        :raise ValueError: If start or end is incorrectly set, e.g. out of
+                           bounds in time.
+        """
+        sndfile = soundfile.SoundFile(file)
+        sample_rate = sndfile.samplerate
+        duration = float(len(sndfile)) / sample_rate
+        start = 0. if start is None else start
+        end = duration if end is None else end
+        if start < 0.0:
+            start += duration
+        if end < 0.0:
+            end += duration
+        if start < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start)
+        if end < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end)
+        if start > end:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the slice end position (%f s)." % (start, end))
+        if end > duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end, duration))
+        start_frame = int(start * sample_rate)
+        end_frame = int(end * sample_rate)
+        sndfile.seek(start_frame)
+        data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
+        return cls(data, sample_rate)
+
+    @classmethod
+    def from_sequence_file(cls, filepath):
+        """Create audio segment from sequence file. Sequence file is a binary
+        file containing a collection of multiple audio files, with several
+        header bytes in the head indicating the offsets of each audio byte data
+        chunk.
+
+        The format is:
+
+            4 bytes (int, version),
+            4 bytes (int, num of utterance),
+            4 bytes (int, bytes per header),
+            [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio),
+            audio_bytes_data_of_1st_utterance,
+            audio_bytes_data_of_2nd_utterance,
+            ......
+
+        Sequence file name must end with ".seqbin". And the filename of the 5th
+        utterance's audio file in sequence file "xxx.seqbin" must be
+        "xxx.seqbin_5", with "5" indicating the utterance index within this
+        sequence file (starting from 1).
+
+        :param filepath: Filepath of sequence file.
+        :type filepath: str
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        """
+        # parse filepath
+        matches = re.match(r"(.+\.seqbin)_(\d+)", filepath)
+        if matches is None:
+            raise IOError("File type of %s is not supported" % filepath)
+        filename = matches.group(1)
+        fileno = int(matches.group(2))
+
+        # read headers (binary mode takes no encoding argument)
+        f = open(filename, mode='rb')
+        version = f.read(4)
+        num_utterances = struct.unpack("i", f.read(4))[0]
+        bytes_per_header = struct.unpack("i", f.read(4))[0]
+        header_bytes = f.read(bytes_per_header * (num_utterances + 1))
+        header = [
+            struct.unpack("i", header_bytes[bytes_per_header * i:
+                                            bytes_per_header * (i + 1)])[0]
+            for i in range(num_utterances + 1)
+        ]
+
+        # read audio bytes
+        f.seek(header[fileno - 1])
+        audio_bytes = f.read(header[fileno] - header[fileno - 1])
+        f.close()
+
+        # create audio segment
+        try:
+            return cls.from_bytes(audio_bytes)
+        except Exception as e:
+            samples = np.frombuffer(audio_bytes, dtype='int16')
+            return cls(samples=samples, sample_rate=8000)
+
+    @classmethod
+    def from_bytes(cls, bytes):
+        """Create audio segment from a byte string containing audio samples.
+
+        :param bytes: Byte string containing audio samples.
+        :type bytes: str
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        """
+        samples, sample_rate = soundfile.read(
+            io.BytesIO(bytes), dtype='float32')
+        return cls(samples, sample_rate)
+
+    @classmethod
+    def concatenate(cls, *segments):
+        """Concatenate an arbitrary number of audio segments together.
+
+        :param *segments: Input audio segments to be concatenated.
+        :type *segments: tuple of AudioSegment
+        :return: Audio segment instance as concatenating results.
+        :rtype: AudioSegment
+        :raises ValueError: If the number of segments is zero, or if the
+                            sample_rate of any segments does not match.
+        :raises TypeError: If any segment is not AudioSegment instance.
+        """
+        # Perform basic sanity-checks.
+        if len(segments) == 0:
+            raise ValueError("No audio segments are given to concatenate.")
+        sample_rate = segments[0]._sample_rate
+        for seg in segments:
+            if sample_rate != seg._sample_rate:
+                raise ValueError("Can't concatenate segments with "
+                                 "different sample rates")
+            if type(seg) is not cls:
+                raise TypeError("Only audio segments of the same type "
+                                "can be concatenated.")
+        samples = np.concatenate([seg.samples for seg in segments])
+        return cls(samples, sample_rate)
+
+    @classmethod
+    def make_silence(cls, duration, sample_rate):
+        """Creates a silent audio segment of the given duration and sample rate.
+
+        :param duration: Length of silence in seconds.
+        :type duration: float
+        :param sample_rate: Sample rate.
+        :type sample_rate: float
+        :return: Silent AudioSegment instance of the given duration.
+        :rtype: AudioSegment
+        """
+        samples = np.zeros(int(duration * sample_rate))
+        return cls(samples, sample_rate)
+
+    def to_wav_file(self, filepath, dtype='float32'):
+        """Save audio segment to disk as wav file.
+
+        :param filepath: WAV filepath or file object to save the
+                         audio segment.
+        :type filepath: str|file
+        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
+                      'float32', 'float64'. Default is 'float32'.
+        :type dtype: str
+        :raises TypeError: If dtype is not supported.
+        """
+        samples = self._convert_samples_from_float32(self._samples, dtype)
+        subtype_map = {
+            'int16': 'PCM_16',
+            'int32': 'PCM_32',
+            'float32': 'FLOAT',
+            'float64': 'DOUBLE'
+        }
+        soundfile.write(
+            filepath,
+            samples,
+            self._sample_rate,
+            format='WAV',
+            subtype=subtype_map[dtype])
+
+    def superimpose(self, other):
+        """Add samples from another segment to those of this segment
+        (sample-wise addition, not segment concatenation).
+
+        Note that this is an in-place transformation.
+
+        :param other: Segment containing samples to be added in.
+        :type other: AudioSegments
+        :raise TypeError: If type of two segments don't match.
+        :raise ValueError: If the sample rates of the two segments are not
+                           equal, or if the lengths of segments don't match.
+        """
+        if not isinstance(other, type(self)):
+            raise TypeError("Cannot add segments of different types: %s "
+                            "and %s." % (type(self), type(other)))
+        if self._sample_rate != other._sample_rate:
+            raise ValueError("Sample rates must match to add segments.")
+        if len(self._samples) != len(other._samples):
+            raise ValueError("Segment lengths must match to add segments.")
+        self._samples += other._samples
+
+    def to_bytes(self, dtype='float32'):
+        """Create a byte string containing the audio content.
+
+        :param dtype: Data type for export samples. Options: 'int16', 'int32',
+                      'float32', 'float64'. Default is 'float32'.
+        :type dtype: str
+        :return: Byte string containing audio content.
+        :rtype: str
+        """
+        samples = self._convert_samples_from_float32(self._samples, dtype)
+        return samples.tobytes()
+
+    def gain_db(self, gain):
+        """Apply gain in decibels to samples.
+
+        Note that this is an in-place transformation.
+
+        :param gain: Gain in decibels to apply to samples.
+        :type gain: float|1darray
+        """
+        self._samples *= 10.**(gain / 20.)
+
+    def change_speed(self, speed_rate):
+        """Change the audio speed by linear interpolation.
+
+        Note that this is an in-place transformation.
+
+        :param speed_rate: Rate of speed change:
+                           speed_rate > 1.0, speed up the audio;
+                           speed_rate = 1.0, unchanged;
+                           speed_rate < 1.0, slow down the audio;
+                           speed_rate <= 0.0, not allowed, raise ValueError.
+        :type speed_rate: float
+        :raises ValueError: If speed_rate <= 0.0.
+ """ + if speed_rate <= 0: + raise ValueError("speed_rate should be greater than zero.") + old_length = self._samples.shape[0] + new_length = int(old_length / speed_rate) + old_indices = np.arange(old_length) + new_indices = np.linspace(start=0, stop=old_length, num=new_length) + self._samples = np.interp(new_indices, old_indices, self._samples) + + def normalize(self, target_db=-20, max_gain_db=300.0): + """Normalize audio to be of the desired RMS value in decibels. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels. This value should be + less than 0.0 as 0.0 is full-scale audio. + :type target_db: float + :param max_gain_db: Max amount of gain in dB that can be applied for + normalization. This is to prevent nans when + attempting to normalize a signal consisting of + all zeros. + :type max_gain_db: float + :raises ValueError: If the required gain to normalize the segment to + the target_db value exceeds max_gain_db. + """ + gain = target_db - self.rms_db + if gain > max_gain_db: + raise ValueError( + "Unable to normalize segment to %f dB because the " + "the probable gain have exceeds max_gain_db (%f dB)" % + (target_db, max_gain_db)) + self.gain_db(min(max_gain_db, target_db - self.rms_db)) + + def normalize_online_bayesian(self, + target_db, + prior_db, + prior_samples, + startup_delay=0.0): + """Normalize audio using a production-compatible online/causal + algorithm. This uses an exponential likelihood and gamma prior to + make online estimates of the RMS even when there are very few samples. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels. + :type target_bd: float + :param prior_db: Prior RMS estimate in decibels. + :type prior_db: float + :param prior_samples: Prior strength in number of samples. + :type prior_samples: float + :param startup_delay: Default 0.0s. If provided, this function will + accrue statistics for the first startup_delay + seconds before applying online normalization. + :type startup_delay: float + """ + # Estimate total RMS online. + startup_sample_idx = min(self.num_samples - 1, + int(self.sample_rate * startup_delay)) + prior_mean_squared = 10.**(prior_db / 10.) + prior_sum_of_squares = prior_mean_squared * prior_samples + cumsum_of_squares = np.cumsum(self.samples**2) + sample_count = np.arange(self.num_samples) + 1 + if startup_sample_idx > 0: + cumsum_of_squares[:startup_sample_idx] = \ + cumsum_of_squares[startup_sample_idx] + sample_count[:startup_sample_idx] = \ + sample_count[startup_sample_idx] + mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) / + (sample_count + prior_samples)) + rms_estimate_db = 10 * np.log10(mean_squared_estimate) + # Compute required time-varying gain. + gain_db = target_db - rms_estimate_db + self.gain_db(gain_db) + + def resample(self, target_sample_rate, filter='kaiser_best'): + """Resample the audio to a target sample rate. + + Note that this is an in-place transformation. + + :param target_sample_rate: Target sample rate. + :type target_sample_rate: int + :param filter: The resampling filter to use one of {'kaiser_best', + 'kaiser_fast'}. + :type filter: str + """ + self._samples = resampy.resample( + self.samples, self.sample_rate, target_sample_rate, filter=filter) + self._sample_rate = target_sample_rate + + def pad_silence(self, duration, sides='both'): + """Pad this audio sample with a period of silence. + + Note that this is an in-place transformation. 
+
+        :param duration: Length of silence in seconds to pad.
+        :type duration: float
+        :param sides: Position for padding:
+                      'beginning' - adds silence in the beginning;
+                      'end' - adds silence in the end;
+                      'both' - adds silence in both the beginning and the end.
+        :type sides: str
+        :raises ValueError: If sides is not supported.
+        """
+        if duration == 0.0:
+            return self
+        cls = type(self)
+        silence = self.make_silence(duration, self._sample_rate)
+        if sides == "beginning":
+            padded = cls.concatenate(silence, self)
+        elif sides == "end":
+            padded = cls.concatenate(self, silence)
+        elif sides == "both":
+            padded = cls.concatenate(silence, self, silence)
+        else:
+            raise ValueError("Unknown value for the sides %s" % sides)
+        self._samples = padded._samples
+
+    def shift(self, shift_ms):
+        """Shift the audio in time. If `shift_ms` is positive, shift with time
+        advance; if negative, shift with time delay. Silence is padded to
+        keep the duration unchanged.
+
+        Note that this is an in-place transformation.
+
+        :param shift_ms: Shift time in milliseconds. If positive, shift with
+                         time advance; if negative, shift with time delay.
+        :type shift_ms: float
+        :raises ValueError: If shift_ms is longer than audio duration.
+        """
+        if abs(shift_ms) / 1000.0 > self.duration:
+            raise ValueError("Absolute value of shift_ms should be smaller "
+                             "than audio duration.")
+        shift_samples = int(shift_ms * self._sample_rate / 1000)
+        if shift_samples > 0:
+            # time advance
+            self._samples[:-shift_samples] = self._samples[shift_samples:]
+            self._samples[-shift_samples:] = 0
+        elif shift_samples < 0:
+            # time delay
+            self._samples[-shift_samples:] = self._samples[:shift_samples]
+            self._samples[:-shift_samples] = 0
+
+    def subsegment(self, start_sec=None, end_sec=None):
+        """Cut the AudioSegment between given boundaries.
+
+        Note that this is an in-place transformation.
+
+        :param start_sec: Beginning of subsegment in seconds.
+        :type start_sec: float
+        :param end_sec: End of subsegment in seconds.
+        :type end_sec: float
+        :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
+                           of bounds in time.
+        """
+        start_sec = 0.0 if start_sec is None else start_sec
+        end_sec = self.duration if end_sec is None else end_sec
+        if start_sec < 0.0:
+            start_sec = self.duration + start_sec
+        if end_sec < 0.0:
+            end_sec = self.duration + end_sec
+        if start_sec < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start_sec)
+        if end_sec < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end_sec)
+        if start_sec > end_sec:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the end position (%f s)." % (start_sec, end_sec))
+        if end_sec > self.duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end_sec, self.duration))
+        start_sample = int(round(start_sec * self._sample_rate))
+        end_sample = int(round(end_sec * self._sample_rate))
+        self._samples = self._samples[start_sample:end_sample]
+
+    def random_subsegment(self, subsegment_length, rng=None):
+        """Cut the specified length of the audiosegment randomly.
+
+        Note that this is an in-place transformation.
+
+        :param subsegment_length: Subsegment length in seconds.
+        :type subsegment_length: float
+        :param rng: Random number generator state.
+        :type rng: random.Random
+        :raises ValueError: If the length of subsegment is greater than
+                            the original segment.
+ """ + rng = random.Random() if rng is None else rng + if subsegment_length > self.duration: + raise ValueError("Length of subsegment must not be greater " + "than original segment.") + start_time = rng.uniform(0.0, self.duration - subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) + + def convolve(self, impulse_segment, allow_resample=False): + """Convolve this audio segment with the given impulse segment. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + :raises ValueError: If the sample rate is not match between two + audio segments when resample is not allowed. + """ + if allow_resample and self.sample_rate != impulse_segment.sample_rate: + impulse_segment.resample(self.sample_rate) + if self.sample_rate != impulse_segment.sample_rate: + raise ValueError("Impulse segment's sample rate (%d Hz) is not " + "equal to base signal sample rate (%d Hz)." % + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") + self._samples = samples + + def convolve_and_normalize(self, impulse_segment, allow_resample=False): + """Convolve and normalize the resulting audio segment so that it + has the same average power as the input signal. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + """ + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) + + def add_noise(self, + noise, + snr_dB, + allow_downsampling=False, + max_gain_db=300.0, + rng=None): + """Add the given noise segment at a specific signal-to-noise ratio. + If the noise segment is longer than this segment, a random subsegment + of matching length is sampled from it and used instead. + + Note that this is an in-place transformation. + + :param noise: Noise signal to add. + :type noise: AudioSegment + :param snr_dB: Signal-to-Noise Ratio, in decibels. + :type snr_dB: float + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. + :type max_gain_db: float + :param rng: Random number generator state. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments when downsampling is not allowed, or + if the duration of noise segments is shorter than + original audio segments. + """ + rng = random.Random() if rng is None else rng + if allow_downsampling and noise.sample_rate > self.sample_rate: + noise = noise.resample(self.sample_rate) + if noise.sample_rate != self.sample_rate: + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." 
% (noise.sample_rate, + self.sample_rate)) + if noise.duration < self.duration: + raise ValueError("Noise signal (%f sec) must be at least as long as" + " base signal (%f sec)." % + (noise.duration, self.duration)) + noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db) + noise_new = copy.deepcopy(noise) + noise_new.random_subsegment(self.duration, rng=rng) + noise_new.gain_db(noise_gain_db) + self.superimpose(noise_new) + + @property + def samples(self): + """Return audio samples. + + :return: Audio samples. + :rtype: ndarray + """ + return self._samples.copy() + + @property + def sample_rate(self): + """Return audio sample rate. + + :return: Audio sample rate. + :rtype: int + """ + return self._sample_rate + + @property + def num_samples(self): + """Return number of samples. + + :return: Number of samples. + :rtype: int + """ + return self._samples.shape[0] + + @property + def duration(self): + """Return audio duration. + + :return: Audio duration in seconds. + :rtype: float + """ + return self._samples.shape[0] / float(self._sample_rate) + + @property + def rms_db(self): + """Return root mean square energy of the audio in decibels. + + :return: Root mean square energy in decibels. + :rtype: float + """ + # square root => multiply by 10 instead of 20 for dBs + mean_square = np.mean(self._samples**2) + return 10 * np.log10(mean_square) + + def _convert_samples_to_float32(self, samples): + """Convert sample type to float32. + + Audio sample type is usually integer or float-point. + Integers will be scaled to [-1, 1] in float32. + """ + float32_samples = samples.astype('float32') + if samples.dtype in np.sctypes['int']: + bits = np.iinfo(samples.dtype).bits + float32_samples *= (1. / 2**(bits - 1)) + elif samples.dtype in np.sctypes['float']: + pass + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return float32_samples + + def _convert_samples_from_float32(self, samples, dtype): + """Convert sample type from float32 to dtype. + + Audio sample type is usually integer or float-point. For integer + type, float32 will be rescaled from [-1, 1] to the maximum range + supported by the integer type. + + This is for writing a audio file. + """ + dtype = np.dtype(dtype) + output_samples = samples.copy() + if dtype in np.sctypes['int']: + bits = np.iinfo(dtype).bits + output_samples *= (2**(bits - 1) / 1.) + min_val = np.iinfo(dtype).min + max_val = np.iinfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + elif samples.dtype in np.sctypes['float']: + min_val = np.finfo(dtype).min + max_val = np.finfo(dtype).max + output_samples[output_samples > max_val] = max_val + output_samples[output_samples < min_val] = min_val + else: + raise TypeError("Unsupported sample type: %s." % samples.dtype) + return output_samples.astype(dtype) diff --git a/examples/transv1.8to2.x/data_utils/augmentor/__init__.py b/examples/transv1.8to2.x/data_utils/augmentor/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/augmentor/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
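As a quick sanity check on the AudioSegment manipulation API above (pad_silence, shift, subsegment and the duration/rms_db properties), here is a minimal usage sketch. It assumes the AudioSegment constructor defined earlier in audio.py takes (samples, sample_rate) with float32 samples in [-1, 1]; the tone and durations are illustrative only:

    import numpy as np
    from data_utils.audio import AudioSegment

    sr = 16000
    t = np.arange(sr) / float(sr)  # one second of timeline
    seg = AudioSegment(0.1 * np.sin(2 * np.pi * 440 * t).astype('float32'), sr)

    seg.pad_silence(0.5, sides="both")  # in place: 1.0 s -> 2.0 s
    seg.shift(250)                      # positive -> time advance, tail zeroed
    seg.subsegment(0.25, 1.75)          # keep the middle 1.5 s
    print(seg.duration, seg.rms_db)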
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/augmentation.py b/examples/transv1.8to2.x/data_utils/augmentor/augmentation.py
new file mode 100644
index 00000000..e43063cb
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/augmentation.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the data augmentation pipeline."""
+import json
+import random
+
+from data_utils.augmentor.impulse_response import ImpulseResponseAugmentor
+from data_utils.augmentor.noise_perturb import NoisePerturbAugmentor
+from data_utils.augmentor.online_bayesian_normalization import \
+    OnlineBayesianNormalizationAugmentor
+from data_utils.augmentor.resample import ResampleAugmentor
+from data_utils.augmentor.shift_perturb import ShiftPerturbAugmentor
+from data_utils.augmentor.speed_perturb import SpeedPerturbAugmentor
+from data_utils.augmentor.volume_perturb import VolumePerturbAugmentor
+
+
+class AugmentationPipeline(object):
+    """Build a pre-processing pipeline with various augmentation models. Such a
+    data augmentation pipeline is often leveraged to augment the training
+    samples to make the model invariant to certain types of perturbations in
+    the real world, improving the model's generalization ability.
+
+    The pipeline is built according to the augmentation configuration in a
+    json string, e.g.
+
+    .. code-block::
+
+        [ {
+                "type": "noise",
+                "params": {"min_snr_dB": 10,
+                           "max_snr_dB": 20,
+                           "noise_manifest_path": "datasets/manifest.noise"},
+                "prob": 0.0
+            },
+            {
+                "type": "speed",
+                "params": {"min_speed_rate": 0.9,
+                           "max_speed_rate": 1.1},
+                "prob": 1.0
+            },
+            {
+                "type": "shift",
+                "params": {"min_shift_ms": -5,
+                           "max_shift_ms": 5},
+                "prob": 1.0
+            },
+            {
+                "type": "volume",
+                "params": {"min_gain_dBFS": -10,
+                           "max_gain_dBFS": 10},
+                "prob": 0.0
+            },
+            {
+                "type": "bayesian_normal",
+                "params": {"target_db": -20,
+                           "prior_db": -20,
+                           "prior_samples": 100},
+                "prob": 0.0
+            }
+        ]
+
+    This example configuration inserts five augmentation models into the
+    pipeline, of which only SpeedPerturbAugmentor and ShiftPerturbAugmentor
+    actually take effect: "prob" indicates the probability of the
+    corresponding augmentor being applied, and an augmentor whose "prob" is
+    zero never takes effect.
+
+    :param augmentation_config: Augmentation configuration in json string.
+    :type augmentation_config: str
+    :param random_seed: Random seed.
+    :type random_seed: int
+    :raises ValueError: If the augmentation json config is in an incorrect format.
+ """ + + def __init__(self, augmentation_config, random_seed=0): + self._rng = random.Random(random_seed) + self._augmentors, self._rates = self._parse_pipeline_from( + augmentation_config) + + def transform_audio(self, audio_segment): + """Run the pre-processing pipeline for data augmentation. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to process. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + for augmentor, rate in zip(self._augmentors, self._rates): + if self._rng.uniform(0., 1.) < rate: + augmentor.transform_audio(audio_segment) + + def _parse_pipeline_from(self, config_json): + """Parse the config json to build a augmentation pipelien.""" + try: + configs = json.loads(config_json) + augmentors = [ + self._get_augmentor(config["type"], config["params"]) + for config in configs + ] + rates = [config["prob"] for config in configs] + except Exception as e: + raise ValueError("Failed to parse the augmentation config json: " + "%s" % str(e)) + return augmentors, rates + + def _get_augmentor(self, augmentor_type, params): + """Return an augmentation model by the type name, and pass in params.""" + if augmentor_type == "volume": + return VolumePerturbAugmentor(self._rng, **params) + elif augmentor_type == "shift": + return ShiftPerturbAugmentor(self._rng, **params) + elif augmentor_type == "speed": + return SpeedPerturbAugmentor(self._rng, **params) + elif augmentor_type == "resample": + return ResampleAugmentor(self._rng, **params) + elif augmentor_type == "bayesian_normal": + return OnlineBayesianNormalizationAugmentor(self._rng, **params) + elif augmentor_type == "noise": + return NoisePerturbAugmentor(self._rng, **params) + elif augmentor_type == "impulse": + return ImpulseResponseAugmentor(self._rng, **params) + else: + raise ValueError("Unknown augmentor type [%s]." % augmentor_type) diff --git a/examples/transv1.8to2.x/data_utils/augmentor/base.py b/examples/transv1.8to2.x/data_utils/augmentor/base.py new file mode 100644 index 00000000..3f65d324 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/augmentor/base.py @@ -0,0 +1,43 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the abstract base class for augmentation models.""" +from abc import ABCMeta +from abc import abstractmethod + + +class AugmentorBase(object): + """Abstract base class for augmentation model (augmentor) class. + All augmentor classes should inherit from this class, and implement the + following abstract methods. + """ + + __metaclass__ = ABCMeta + + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def transform_audio(self, audio_segment): + """Adds various effects to the input audio segment. Such effects + will augment the training data to make the model invariant to certain + types of perturbations in the real world, improving model's + generalization ability. + + Note that this is an in-place transformation. 
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        pass
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/impulse_response.py b/examples/transv1.8to2.x/data_utils/augmentor/impulse_response.py
new file mode 100644
index 00000000..d3928bf4
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/impulse_response.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the impulse response augmentation model."""
+from data_utils.audio import AudioSegment
+from data_utils.augmentor.base import AugmentorBase
+from data_utils.utility import read_manifest
+
+
+class ImpulseResponseAugmentor(AugmentorBase):
+    """Augmentation model for adding impulse response effect.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param impulse_manifest_path: Manifest path for impulse audio data.
+    :type impulse_manifest_path: str
+    """
+
+    def __init__(self, rng, impulse_manifest_path):
+        self._rng = rng
+        self._impulse_manifest = read_manifest(impulse_manifest_path)
+
+    def transform_audio(self, audio_segment):
+        """Add impulse response effect.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        impulse_json = self._rng.sample(self._impulse_manifest, 1)[0]
+        impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath'])
+        audio_segment.convolve(impulse_segment, allow_resample=True)
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/noise_perturb.py b/examples/transv1.8to2.x/data_utils/augmentor/noise_perturb.py
new file mode 100644
index 00000000..2b352e92
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/noise_perturb.py
@@ -0,0 +1,58 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the noise perturb augmentation model."""
+from data_utils.audio import AudioSegment
+from data_utils.augmentor.base import AugmentorBase
+from data_utils.utility import read_manifest
+
+
+class NoisePerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding background noise.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_snr_dB: Minimal signal-to-noise ratio, in decibels.
+    :type min_snr_dB: float
+    :param max_snr_dB: Maximal signal-to-noise ratio, in decibels.
+    :type max_snr_dB: float
+    :param noise_manifest_path: Manifest path for noise audio data.
+    :type noise_manifest_path: str
+    """
+
+    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
+        self._min_snr_dB = min_snr_dB
+        self._max_snr_dB = max_snr_dB
+        self._rng = rng
+        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
+
+    def transform_audio(self, audio_segment):
+        """Add background noise audio.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        noise_json = self._rng.sample(self._noise_manifest, 1)[0]
+        if noise_json['duration'] < audio_segment.duration:
+            raise RuntimeError("The duration of sampled noise audio is smaller "
+                               "than the audio segment to add effects to.")
+        diff_duration = noise_json['duration'] - audio_segment.duration
+        start = self._rng.uniform(0, diff_duration)
+        end = start + audio_segment.duration
+        noise_segment = AudioSegment.slice_from_file(
+            noise_json['audio_filepath'], start=start, end=end)
+        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
+        audio_segment.add_noise(
+            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/online_bayesian_normalization.py b/examples/transv1.8to2.x/data_utils/augmentor/online_bayesian_normalization.py
new file mode 100644
index 00000000..7514dd78
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/online_bayesian_normalization.py
@@ -0,0 +1,57 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the online Bayesian normalization augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+
+
+class OnlineBayesianNormalizationAugmentor(AugmentorBase):
+    """Augmentation model for adding online Bayesian normalization.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param target_db: Target RMS value in decibels.
+    :type target_db: float
+    :param prior_db: Prior RMS estimate in decibels.
+    :type prior_db: float
+    :param prior_samples: Prior strength in number of samples.
+    :type prior_samples: int
+    :param startup_delay: Default 0.0s. If provided, this function will
+                          accrue statistics for the first startup_delay
+                          seconds before applying online normalization.
+    :type startup_delay: float
+    """
+
+    def __init__(self,
+                 rng,
+                 target_db,
+                 prior_db,
+                 prior_samples,
+                 startup_delay=0.0):
+        self._target_db = target_db
+        self._prior_db = prior_db
+        self._prior_samples = prior_samples
+        self._rng = rng
+        self._startup_delay = startup_delay
+
+    def transform_audio(self, audio_segment):
+        """Normalizes the input audio using the online Bayesian approach.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        audio_segment.normalize_online_bayesian(self._target_db, self._prior_db,
                                                 self._prior_samples,
                                                 self._startup_delay)
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/resample.py b/examples/transv1.8to2.x/data_utils/augmentor/resample.py
new file mode 100644
index 00000000..e1b801ca
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/resample.py
@@ -0,0 +1,42 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the resample augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+
+
+class ResampleAugmentor(AugmentorBase):
+    """Augmentation model for resampling.
+
+    See more info here:
+    https://ccrma.stanford.edu/~jos/resample/index.html
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param new_sample_rate: New sample rate in Hz.
+    :type new_sample_rate: int
+    """
+
+    def __init__(self, rng, new_sample_rate):
+        self._new_sample_rate = new_sample_rate
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Resamples the input audio to a target sample rate.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        audio_segment.resample(self._new_sample_rate)
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/shift_perturb.py b/examples/transv1.8to2.x/data_utils/augmentor/shift_perturb.py
new file mode 100644
index 00000000..cdbc8afe
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/shift_perturb.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the shift perturb augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+
+
+class ShiftPerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding random shift perturbation.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_shift_ms: Minimal shift in milliseconds.
+    :type min_shift_ms: float
+    :param max_shift_ms: Maximal shift in milliseconds.
+    :type max_shift_ms: float
+    """
+
+    def __init__(self, rng, min_shift_ms, max_shift_ms):
+        self._min_shift_ms = min_shift_ms
+        self._max_shift_ms = max_shift_ms
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Shift audio.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
+        audio_segment.shift(shift_ms)
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/speed_perturb.py b/examples/transv1.8to2.x/data_utils/augmentor/speed_perturb.py
new file mode 100644
index 00000000..46ca1f05
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/speed_perturb.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the speed perturbation augmentation model."""
+from data_utils.augmentor.base import AugmentorBase
+
+
+class SpeedPerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding speed perturbation.
+
+    See reference paper here:
+    http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_speed_rate: Lower bound of the new speed rate to sample; should
+                           not be smaller than 0.9.
+    :type min_speed_rate: float
+    :param max_speed_rate: Upper bound of the new speed rate to sample; should
+                           not be larger than 1.1.
+    :type max_speed_rate: float
+    """
+
+    def __init__(self, rng, min_speed_rate, max_speed_rate):
+        if min_speed_rate < 0.9:
+            raise ValueError(
+                "Sampling speed below 0.9 can cause unnatural effects")
+        if max_speed_rate > 1.1:
+            raise ValueError(
+                "Sampling speed above 1.1 can cause unnatural effects")
+        self._min_speed_rate = min_speed_rate
+        self._max_speed_rate = max_speed_rate
+        self._rng = rng
+
+    def transform_audio(self, audio_segment):
+        """Sample a new speed rate from the given range and change the speed
+        of the given audio clip.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        sampled_speed = self._rng.uniform(self._min_speed_rate,
+                                          self._max_speed_rate)
+        audio_segment.change_speed(sampled_speed)
diff --git a/examples/transv1.8to2.x/data_utils/augmentor/volume_perturb.py b/examples/transv1.8to2.x/data_utils/augmentor/volume_perturb.py
new file mode 100644
index 00000000..9e5c5aa3
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/augmentor/volume_perturb.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
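Before moving on to volume perturbation: the augmentors above are normally not constructed by hand but driven through AugmentationPipeline. A minimal sketch of that wiring, using the documented config format ("sample.wav" is a hypothetical input path):

    import json
    from data_utils.audio import AudioSegment
    from data_utils.augmentor.augmentation import AugmentationPipeline

    config = json.dumps([
        {"type": "speed",
         "params": {"min_speed_rate": 0.95, "max_speed_rate": 1.05},
         "prob": 1.0},
        {"type": "shift",
         "params": {"min_shift_ms": -5, "max_shift_ms": 5},
         "prob": 1.0},
    ])
    pipeline = AugmentationPipeline(augmentation_config=config, random_seed=0)

    seg = AudioSegment.from_file("sample.wav")
    pipeline.transform_audio(seg)  # each augmentor fires with its own "prob"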
+"""Contains the volume perturb augmentation model.""" +from data_utils.augmentor.base import AugmentorBase + + +class VolumePerturbAugmentor(AugmentorBase): + """Augmentation model for adding random volume perturbation. + + This is used for multi-loudness training of PCEN. See + + https://arxiv.org/pdf/1607.05666v1.pdf + + for more details. + + :param rng: Random generator object. + :type rng: random.Random + :param min_gain_dBFS: Minimal gain in dBFS. + :type min_gain_dBFS: float + :param max_gain_dBFS: Maximal gain in dBFS. + :type max_gain_dBFS: float + """ + + def __init__(self, rng, min_gain_dBFS, max_gain_dBFS): + self._min_gain_dBFS = min_gain_dBFS + self._max_gain_dBFS = max_gain_dBFS + self._rng = rng + + def transform_audio(self, audio_segment): + """Change audio loadness. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS) + audio_segment.gain_db(gain) diff --git a/examples/transv1.8to2.x/data_utils/data.py b/examples/transv1.8to2.x/data_utils/data.py new file mode 100644 index 00000000..df296f70 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/data.py @@ -0,0 +1,380 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains data generator for orgnaizing various audio data preprocessing +pipeline and offering data reader interface of PaddlePaddle requirements. +""" +import random +import tarfile +from threading import local + +import numpy as np +import paddle.fluid as fluid +from data_utils.augmentor.augmentation import AugmentationPipeline +from data_utils.featurizer.speech_featurizer import SpeechFeaturizer +from data_utils.normalizer import FeatureNormalizer +from data_utils.speech import SpeechSegment +from data_utils.utility import read_manifest + + +class DataGenerator(object): + """ + DataGenerator provides basic audio data preprocessing pipeline, and offers + data reader interfaces of PaddlePaddle requirements. + + :param vocab_filepath: Vocabulary filepath for indexing tokenized + transcripts. + :type vocab_filepath: str + :param mean_std_filepath: File containing the pre-computed mean and stddev. + :type mean_std_filepath: None|str + :param augmentation_config: Augmentation configuration in json string. + Details see AugmentationPipeline.__doc__. + :type augmentation_config: str + :param max_duration: Audio with duration (in seconds) greater than + this will be discarded. + :type max_duration: float + :param min_duration: Audio with duration (in seconds) smaller than + this will be discarded. + :type min_duration: float + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. 
+ :type window_ms: float + :param max_freq: Used when specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned. + :types max_freq: None|float + :param specgram_type: Specgram feature type. Options: 'linear'. + :type specgram_type: str + :param use_dB_normalization: Whether to normalize the audio to -20 dB + before extracting the features. + :type use_dB_normalization: bool + :param random_seed: Random seed. + :type random_seed: int + :param keep_transcription_text: If set to True, transcription text will + be passed forward directly without + converting to index sequence. + :type keep_transcription_text: bool + :param place: The place to run the program. + :type place: CPUPlace or CUDAPlace + :param is_training: If set to True, generate text data for training, + otherwise, generate text data for infer. + :type is_training: bool + """ + + def __init__(self, + vocab_filepath, + mean_std_filepath, + augmentation_config='{}', + max_duration=float('inf'), + min_duration=0.0, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + specgram_type='linear', + use_dB_normalization=True, + random_seed=0, + keep_transcription_text=False, + place=fluid.CPUPlace(), + is_training=True): + self._max_duration = max_duration + self._min_duration = min_duration + self._normalizer = FeatureNormalizer(mean_std_filepath) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=augmentation_config, random_seed=random_seed) + self._speech_featurizer = SpeechFeaturizer( + vocab_filepath=vocab_filepath, + specgram_type=specgram_type, + stride_ms=stride_ms, + window_ms=window_ms, + max_freq=max_freq, + use_dB_normalization=use_dB_normalization) + self._rng = random.Random(random_seed) + self._keep_transcription_text = keep_transcription_text + self._epoch = 0 + self._is_training = is_training + # for caching tar files info + self._local_data = local() + self._local_data.tar2info = {} + self._local_data.tar2object = {} + self._place = place + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), transcript) + else: + speech_segment = SpeechSegment.from_file(audio_file, transcript) + self._augmentation_pipeline.transform_audio(speech_segment) + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + specgram = self._normalizer.apply(specgram) + return specgram, transcript_part + + def batch_reader_creator(self, + manifest_path, + batch_size, + padding_to=-1, + flatten=False, + sortagrad=False, + shuffle_method="batch_shuffle"): + """ + Batch data reader creator for audio data. Return a callable generator + function to produce batches of data. + + Audio features within one batch will be padded with zeros to have the + same shape, or a user-defined shape. + + :param manifest_path: Filepath of manifest for audio files. + :type manifest_path: str + :param batch_size: Number of instances in a batch. 
+ :type batch_size: int + :param padding_to: If set -1, the maximun shape in the batch + will be used as the target shape for padding. + Otherwise, `padding_to` will be the target shape. + :type padding_to: int + :param flatten: If set True, audio features will be flatten to 1darray. + :type flatten: bool + :param sortagrad: If set True, sort the instances by audio duration + in the first epoch for speed up training. + :type sortagrad: bool + :param shuffle_method: Shuffle method. Options: + '' or None: no shuffle. + 'instance_shuffle': instance-wise shuffle. + 'batch_shuffle': similarly-sized instances are + put into batches, and then + batch-wise shuffle the batches. + For more details, please see + ``_batch_shuffle.__doc__``. + 'batch_shuffle_clipped': 'batch_shuffle' with + head shift and tail + clipping. For more + details, please see + ``_batch_shuffle``. + If sortagrad is True, shuffle is disabled + for the first epoch. + :type shuffle_method: None|str + :return: Batch reader function, producing batches of data when called. + :rtype: callable + """ + + def batch_reader(): + # read manifest + manifest = read_manifest( + manifest_path=manifest_path, + max_duration=self._max_duration, + min_duration=self._min_duration) + # sort (by duration) or batch-wise shuffle the manifest + if self._epoch == 0 and sortagrad: + manifest.sort(key=lambda x: x["duration"]) + + else: + if shuffle_method == "batch_shuffle": + manifest = self._batch_shuffle( + manifest, batch_size, clipped=False) + elif shuffle_method == "batch_shuffle_clipped": + manifest = self._batch_shuffle( + manifest, batch_size, clipped=True) + elif shuffle_method == "instance_shuffle": + self._rng.shuffle(manifest) + elif shuffle_method is None: + pass + else: + raise ValueError("Unknown shuffle method %s." % + shuffle_method) + # prepare batches + batch = [] + instance_reader = self._instance_reader_creator(manifest) + + for instance in instance_reader(): + batch.append(instance) + if len(batch) == batch_size: + yield self._padding_batch(batch, padding_to, flatten) + batch = [] + if len(batch) >= 1: + yield self._padding_batch(batch, padding_to, flatten) + self._epoch += 1 + + return batch_reader + + @property + def feeding(self): + """Returns data reader's feeding dict. + + :return: Data feeding dict. + :rtype: dict + """ + feeding_dict = {"audio_spectrogram": 0, "transcript_text": 1} + return feeding_dict + + @property + def vocab_size(self): + """Return the vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ + return self._speech_featurizer.vocab_list + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. 
+ """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def _instance_reader_creator(self, manifest): + """ + Instance reader creator. Create a callable function to produce + instances of data. + + Instance: a tuple of ndarray of audio spectrogram and a list of + token indices for transcript. + """ + + def reader(): + for instance in manifest: + inst = self.process_utterance(instance["audio_filepath"], + instance["text"]) + yield inst + + return reader + + def _padding_batch(self, batch, padding_to=-1, flatten=False): + """ + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one bach. + + If ``padding_to`` is -1, the maximun shape in the batch will be used + as the target shape for padding. Otherwise, `padding_to` will be the + target shape (only refers to the second axis). + + If `flatten` is True, features will be flatten to 1darray. + """ + new_batch = [] + # get target shape + max_length = max([audio.shape[1] for audio, text in batch]) + if padding_to != -1: + if padding_to < max_length: + raise ValueError("If padding_to is not -1, it should be larger " + "than any instance's shape in the batch") + max_length = padding_to + # padding + padded_audios = [] + texts, text_lens = [], [] + audio_lens = [] + masks = [] + for audio, text in batch: + padded_audio = np.zeros([audio.shape[0], max_length]) + padded_audio[:, :audio.shape[1]] = audio + if flatten: + padded_audio = padded_audio.flatten() + padded_audios.append(padded_audio) + if self._is_training: + texts += text + else: + texts.append(text) + text_lens.append(len(text)) + audio_lens.append(audio.shape[1]) + mask_shape0 = (audio.shape[0] - 1) // 2 + 1 + mask_shape1 = (audio.shape[1] - 1) // 3 + 1 + mask_max_len = (max_length - 1) // 3 + 1 + mask_ones = np.ones((mask_shape0, mask_shape1)) + mask_zeros = np.zeros((mask_shape0, mask_max_len - mask_shape1)) + mask = np.repeat( + np.reshape( + np.concatenate((mask_ones, mask_zeros), axis=1), + (1, mask_shape0, mask_max_len)), + 32, + axis=0) + masks.append(mask) + padded_audios = np.array(padded_audios).astype('float32') + if self._is_training: + texts = np.expand_dims(np.array(texts).astype('int32'), axis=-1) + texts = fluid.create_lod_tensor( + texts, recursive_seq_lens=[text_lens], place=self._place) + audio_lens = np.array(audio_lens).astype('int64').reshape([-1, 1]) + masks = np.array(masks).astype('float32') + return padded_audios, texts, audio_lens, masks + + def _batch_shuffle(self, manifest, batch_size, clipped=False): + """Put similarly-sized instances into minibatches for better efficiency + and make a batch-wise shuffle. + + 1. Sort the audio clips by duration. + 2. Generate a random number `k`, k in [0, batch_size). + 3. Randomly shift `k` instances in order to create different batches + for different epochs. Create minibatches. + 4. Shuffle the minibatches. + + :param manifest: Manifest contents. List of dict. + :type manifest: list + :param batch_size: Batch size. This size is also used for generate + a random number for batch shuffle. 
+ :type batch_size: int + :param clipped: Whether to clip the heading (small shift) and trailing + (incomplete batch) instances. + :type clipped: bool + :return: Batch shuffled mainifest. + :rtype: list + """ + manifest.sort(key=lambda x: x["duration"]) + shift_len = self._rng.randint(0, batch_size - 1) + batch_manifest = list(zip(* [iter(manifest[shift_len:])] * batch_size)) + self._rng.shuffle(batch_manifest) + batch_manifest = [item for batch in batch_manifest for item in batch] + if not clipped: + res_len = len(manifest) - shift_len - len(batch_manifest) + batch_manifest.extend(manifest[-res_len:]) + batch_manifest.extend(manifest[0:shift_len]) + return batch_manifest diff --git a/examples/transv1.8to2.x/data_utils/featurizer/__init__.py b/examples/transv1.8to2.x/data_utils/featurizer/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/featurizer/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/transv1.8to2.x/data_utils/featurizer/audio_featurizer.py b/examples/transv1.8to2.x/data_utils/featurizer/audio_featurizer.py new file mode 100644 index 00000000..9d45f8b1 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/featurizer/audio_featurizer.py @@ -0,0 +1,194 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the audio featurizer class.""" +import numpy as np +from python_speech_features import delta +from python_speech_features import mfcc + + +class AudioFeaturizer(object): + """Audio featurizer, for extracting features from audio contents of + AudioSegment or SpeechSegment. + + Currently, it supports feature types of linear spectrogram and mfcc. + + :param specgram_type: Specgram feature type. Options: 'linear'. + :type specgram_type: str + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. + :type window_ms: float + :param max_freq: When specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned; when specgram_type is 'mfcc', max_feq is the + highest band edge of mel filters. + :types max_freq: None|float + :param target_sample_rate: Audio are resampled (if upsampling or + downsampling is allowed) to this before + extracting spectrogram features. 
+ :type target_sample_rate: float + :param use_dB_normalization: Whether to normalize the audio to a certain + decibels before extracting the features. + :type use_dB_normalization: bool + :param target_dB: Target audio decibels for normalization. + :type target_dB: float + """ + + def __init__(self, + specgram_type='linear', + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20): + self._specgram_type = specgram_type + self._stride_ms = stride_ms + self._window_ms = window_ms + self._max_freq = max_freq + self._target_sample_rate = target_sample_rate + self._use_dB_normalization = use_dB_normalization + self._target_dB = target_dB + + def featurize(self, + audio_segment, + allow_downsampling=True, + allow_upsampling=True): + """Extract audio features from AudioSegment or SpeechSegment. + + :param audio_segment: Audio/speech segment to extract features from. + :type audio_segment: AudioSegment|SpeechSegment + :param allow_downsampling: Whether to allow audio downsampling before + featurizing. + :type allow_downsampling: bool + :param allow_upsampling: Whether to allow audio upsampling before + featurizing. + :type allow_upsampling: bool + :return: Spectrogram audio feature in 2darray. + :rtype: ndarray + :raises ValueError: If audio sample rate is not supported. + """ + # upsampling or downsampling + if ((audio_segment.sample_rate > self._target_sample_rate and + allow_downsampling) or + (audio_segment.sample_rate < self._target_sample_rate and + allow_upsampling)): + audio_segment.resample(self._target_sample_rate) + if audio_segment.sample_rate != self._target_sample_rate: + raise ValueError("Audio sample rate is not supported. " + "Turn allow_downsampling or allow up_sampling on.") + # decibel normalization + if self._use_dB_normalization: + audio_segment.normalize(target_db=self._target_dB) + # extract spectrogram + return self._compute_specgram(audio_segment.samples, + audio_segment.sample_rate) + + def _compute_specgram(self, samples, sample_rate): + """Extract various audio features.""" + if self._specgram_type == 'linear': + return self._compute_linear_specgram( + samples, sample_rate, self._stride_ms, self._window_ms, + self._max_freq) + elif self._specgram_type == 'mfcc': + return self._compute_mfcc(samples, sample_rate, self._stride_ms, + self._window_ms, self._max_freq) + else: + raise ValueError("Unknown specgram_type %s. " + "Supported values: linear." 
% self._specgram_type) + + def _compute_linear_specgram(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """Compute the linear spectrogram from FFT energy.""" + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + stride_size = int(0.001 * sample_rate * stride_ms) + window_size = int(0.001 * sample_rate * window_ms) + specgram, freqs = self._specgram_real( + samples, + window_size=window_size, + stride_size=stride_size, + sample_rate=sample_rate) + ind = np.where(freqs <= max_freq)[0][-1] + 1 + return np.log(specgram[:ind, :] + eps) + + def _specgram_real(self, samples, window_size, stride_size, sample_rate): + """Compute the spectrogram for samples from a real signal.""" + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + fft = np.fft.rfft(windows * weighting, axis=0) + fft = np.absolute(fft) + fft = fft**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs + + def _compute_mfcc(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None): + """Compute mfcc from samples.""" + if max_freq is None: + max_freq = sample_rate / 2 + if max_freq > sample_rate / 2: + raise ValueError("max_freq must not be greater than half of " + "sample rate.") + if stride_ms > window_ms: + raise ValueError("Stride size must not be greater than " + "window size.") + # compute the 13 cepstral coefficients, and the first one is replaced + # by log(frame energy) + mfcc_feat = mfcc( + signal=samples, + samplerate=sample_rate, + winlen=0.001 * window_ms, + winstep=0.001 * stride_ms, + highfreq=max_freq) + # Deltas + d_mfcc_feat = delta(mfcc_feat, 2) + # Deltas-Deltas + dd_mfcc_feat = delta(d_mfcc_feat, 2) + # transpose + mfcc_feat = np.transpose(mfcc_feat) + d_mfcc_feat = np.transpose(d_mfcc_feat) + dd_mfcc_feat = np.transpose(dd_mfcc_feat) + # concat above three features + concat_mfcc_feat = np.concatenate( + (mfcc_feat, d_mfcc_feat, dd_mfcc_feat)) + return concat_mfcc_feat diff --git a/examples/transv1.8to2.x/data_utils/featurizer/speech_featurizer.py b/examples/transv1.8to2.x/data_utils/featurizer/speech_featurizer.py new file mode 100644 index 00000000..232251dd --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/featurizer/speech_featurizer.py @@ -0,0 +1,107 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
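To make the featurizer above concrete, a short hedged sketch of extracting a linear spectrogram ("sample.wav" again stands in for a real 16 kHz mono file):

    from data_utils.audio import AudioSegment
    from data_utils.featurizer.audio_featurizer import AudioFeaturizer

    featurizer = AudioFeaturizer(specgram_type='linear',
                                 stride_ms=10.0, window_ms=20.0)
    seg = AudioSegment.from_file("sample.wav")
    # featurize() resamples to 16 kHz and normalizes to -20 dB by default,
    # then returns log FFT energies: 161 frequency bins for a 320-sample
    # window (20 ms at 16 kHz), one column per 10 ms stride.
    spec = featurizer.featurize(seg)
    print(spec.shape)  # (161, num_frames)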
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the speech featurizer class.""" +from data_utils.featurizer.audio_featurizer import AudioFeaturizer +from data_utils.featurizer.text_featurizer import TextFeaturizer + + +class SpeechFeaturizer(object): + """Speech featurizer, for extracting features from both audio and transcript + contents of SpeechSegment. + + Currently, for audio parts, it supports feature types of linear + spectrogram and mfcc; for transcript parts, it only supports char-level + tokenizing and conversion into a list of token indices. Note that the + token indexing order follows the given vocabulary file. + + :param vocab_filepath: Filepath to load vocabulary for token indices + conversion. + :type specgram_type: str + :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'. + :type specgram_type: str + :param stride_ms: Striding size (in milliseconds) for generating frames. + :type stride_ms: float + :param window_ms: Window size (in milliseconds) for generating frames. + :type window_ms: float + :param max_freq: When specgram_type is 'linear', only FFT bins + corresponding to frequencies between [0, max_freq] are + returned; when specgram_type is 'mfcc', max_freq is the + highest band edge of mel filters. + :types max_freq: None|float + :param target_sample_rate: Speech are resampled (if upsampling or + downsampling is allowed) to this before + extracting spectrogram features. + :type target_sample_rate: float + :param use_dB_normalization: Whether to normalize the audio to a certain + decibels before extracting the features. + :type use_dB_normalization: bool + :param target_dB: Target audio decibels for normalization. + :type target_dB: float + """ + + def __init__(self, + vocab_filepath, + specgram_type='linear', + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + target_sample_rate=16000, + use_dB_normalization=True, + target_dB=-20): + self._audio_featurizer = AudioFeaturizer( + specgram_type=specgram_type, + stride_ms=stride_ms, + window_ms=window_ms, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB) + self._text_featurizer = TextFeaturizer(vocab_filepath) + + def featurize(self, speech_segment, keep_transcription_text): + """Extract features for speech segment. + + 1. For audio parts, extract the audio features. + 2. For transcript parts, keep the original text or convert text string + to a list of token indices in char-level. + + :param audio_segment: Speech segment to extract features from. + :type audio_segment: SpeechSegment + :return: A tuple of 1) spectrogram audio feature in 2darray, 2) list of + char-level token indices. + :rtype: tuple + """ + audio_feature = self._audio_featurizer.featurize(speech_segment) + if keep_transcription_text: + return audio_feature, speech_segment.transcript + text_ids = self._text_featurizer.featurize(speech_segment.transcript) + return audio_feature, text_ids + + @property + def vocab_size(self): + """Return the vocabulary size. + + :return: Vocabulary size. 
+ :rtype: int + """ + return self._text_featurizer.vocab_size + + @property + def vocab_list(self): + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ + return self._text_featurizer.vocab_list diff --git a/examples/transv1.8to2.x/data_utils/featurizer/text_featurizer.py b/examples/transv1.8to2.x/data_utils/featurizer/text_featurizer.py new file mode 100644 index 00000000..a56073f9 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/featurizer/text_featurizer.py @@ -0,0 +1,76 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the text featurizer class.""" +import codecs + + +class TextFeaturizer(object): + """Text featurizer, for processing or extracting features from text. + + Currently, it only supports char-level tokenizing and conversion into + a list of token indices. Note that the token indexing order follows the + given vocabulary file. + + :param vocab_filepath: Filepath to load vocabulary for token indices + conversion. + :type specgram_type: str + """ + + def __init__(self, vocab_filepath): + self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file( + vocab_filepath) + + def featurize(self, text): + """Convert text string to a list of token indices in char-level.Note + that the token indexing order follows the given vocabulary file. + + :param text: Text to process. + :type text: str + :return: List of char-level token indices. + :rtype: list + """ + tokens = self._char_tokenize(text) + return [self._vocab_dict[token] for token in tokens] + + @property + def vocab_size(self): + """Return the vocabulary size. + + :return: Vocabulary size. + :rtype: int + """ + return len(self._vocab_list) + + @property + def vocab_list(self): + """Return the vocabulary in list. + + :return: Vocabulary in list. + :rtype: list + """ + return self._vocab_list + + def _char_tokenize(self, text): + """Character tokenizer.""" + return list(text.strip()) + + def _load_vocabulary_from_file(self, vocab_filepath): + """Load vocabulary from file.""" + vocab_lines = [] + with codecs.open(vocab_filepath, 'r', 'utf-8') as file: + vocab_lines.extend(file.readlines()) + vocab_list = [line[:-1] for line in vocab_lines] + vocab_dict = dict( + [(token, id) for (id, token) in enumerate(vocab_list)]) + return vocab_dict, vocab_list diff --git a/examples/transv1.8to2.x/data_utils/normalizer.py b/examples/transv1.8to2.x/data_utils/normalizer.py new file mode 100644 index 00000000..c1d94528 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/normalizer.py @@ -0,0 +1,97 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
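A minimal sketch of the char-level tokenization above, assuming a hypothetical vocab.txt with one token per line (here 'a', 'b', 'c', ...), each line newline-terminated since the loader strips the last character of every line:

    from data_utils.featurizer.text_featurizer import TextFeaturizer

    featurizer = TextFeaturizer('vocab.txt')
    ids = featurizer.featurize("abc")  # token indices follow vocab-file order
    print(ids)                         # [0, 1, 2] for the vocabulary above
    print(featurizer.vocab_size)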
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains feature normalizers."""
+import random
+
+import numpy as np
+from data_utils.audio import AudioSegment
+from data_utils.utility import read_manifest
+
+
+class FeatureNormalizer(object):
+    """Feature normalizer. Normalize features to be of zero mean and unit
+    stddev.
+
+    If mean_std_filepath is provided (not None), the normalizer will directly
+    initialize from the file. Otherwise, both manifest_path and featurize_func
+    should be given for on-the-fly mean and stddev computing.
+
+    :param mean_std_filepath: File containing the pre-computed mean and stddev.
+    :type mean_std_filepath: None|str
+    :param manifest_path: Manifest of instances for computing mean and stddev.
+    :type manifest_path: None|str
+    :param featurize_func: Function to extract features. It should be callable
+                           with ``featurize_func(audio_segment)``.
+    :type featurize_func: None|callable
+    :param num_samples: Number of random samples for computing mean and stddev.
+    :type num_samples: int
+    :param random_seed: Random seed for sampling instances.
+    :type random_seed: int
+    :raises ValueError: If both mean_std_filepath and manifest_path
+                        (or both mean_std_filepath and featurize_func) are None.
+    """
+
+    def __init__(self,
+                 mean_std_filepath,
+                 manifest_path=None,
+                 featurize_func=None,
+                 num_samples=500,
+                 random_seed=0):
+        if not mean_std_filepath:
+            if not (manifest_path and featurize_func):
+                raise ValueError("If mean_std_filepath is None, manifest_path "
+                                 "and featurize_func should not be None.")
+            self._rng = random.Random(random_seed)
+            self._compute_mean_std(manifest_path, featurize_func, num_samples)
+        else:
+            self._read_mean_std_from_file(mean_std_filepath)
+
+    def apply(self, features, eps=1e-14):
+        """Normalize features to be of zero mean and unit stddev.
+
+        :param features: Input features to be normalized.
+        :type features: ndarray
+        :param eps: Added to stddev to provide numerical stability.
+        :type eps: float
+        :return: Normalized features.
+        :rtype: ndarray
+        """
+        return (features - self._mean) / (self._std + eps)
+
+    def write_to_file(self, filepath):
+        """Write the mean and stddev to the file.
+
+        :param filepath: File to write mean and stddev.
+ :type filepath: str + """ + np.savez(filepath, mean=self._mean, std=self._std) + + def _read_mean_std_from_file(self, filepath): + """Load mean and std from file.""" + npzfile = np.load(filepath) + self._mean = npzfile["mean"] + self._std = npzfile["std"] + + def _compute_mean_std(self, manifest_path, featurize_func, num_samples): + """Compute mean and std from randomly sampled instances.""" + manifest = read_manifest(manifest_path) + sampled_manifest = self._rng.sample(manifest, num_samples) + features = [] + for instance in sampled_manifest: + features.append( + featurize_func( + AudioSegment.from_file(instance["audio_filepath"]))) + features = np.hstack(features) + self._mean = np.mean(features, axis=1).reshape([-1, 1]) + self._std = np.std(features, axis=1).reshape([-1, 1]) diff --git a/examples/transv1.8to2.x/data_utils/speech.py b/examples/transv1.8to2.x/data_utils/speech.py new file mode 100644 index 00000000..8daf58b0 --- /dev/null +++ b/examples/transv1.8to2.x/data_utils/speech.py @@ -0,0 +1,153 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the speech segment class.""" +import numpy as np +from data_utils.audio import AudioSegment + + +class SpeechSegment(AudioSegment): + """Speech segment abstraction, a subclass of AudioSegment, + with an additional transcript. + + :param samples: Audio samples [num_samples x num_channels]. + :type samples: ndarray.float32 + :param sample_rate: Audio sample rate. + :type sample_rate: int + :param transcript: Transcript text for the speech. + :type transript: str + :raises TypeError: If the sample data type is not float or int. + """ + + def __init__(self, samples, sample_rate, transcript): + AudioSegment.__init__(self, samples, sample_rate) + self._transcript = transcript + + def __eq__(self, other): + """Return whether two objects are equal. + """ + if not AudioSegment.__eq__(self, other): + return False + if self._transcript != other._transcript: + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + @classmethod + def from_file(cls, filepath, transcript): + """Create speech segment from audio file and corresponding transcript. + + :param filepath: Filepath or file object to audio file. + :type filepath: str|file + :param transcript: Transcript text for the speech. + :type transript: str + :return: Speech segment instance. + :rtype: SpeechSegment + """ + audio = AudioSegment.from_file(filepath) + return cls(audio.samples, audio.sample_rate, transcript) + + @classmethod + def from_bytes(cls, bytes, transcript): + """Create speech segment from a byte string and corresponding + transcript. + + :param bytes: Byte string containing audio samples. + :type bytes: str + :param transcript: Transcript text for the speech. + :type transript: str + :return: Speech segment instance. 
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.from_bytes(bytes)
+        return cls(audio.samples, audio.sample_rate, transcript)
+
+    @classmethod
+    def concatenate(cls, *segments):
+        """Concatenate an arbitrary number of speech segments together; both
+        audio and transcript will be concatenated.
+
+        :param *segments: Input speech segments to be concatenated.
+        :type *segments: tuple of SpeechSegment
+        :return: Speech segment instance.
+        :rtype: SpeechSegment
+        :raises ValueError: If the number of segments is zero, or if the
+                            sample_rate of any two segments does not match.
+        :raises TypeError: If any segment is not a SpeechSegment instance.
+        """
+        if len(segments) == 0:
+            raise ValueError("No speech segments are given to concatenate.")
+        sample_rate = segments[0]._sample_rate
+        transcripts = ""
+        for seg in segments:
+            if sample_rate != seg._sample_rate:
+                raise ValueError("Can't concatenate segments with "
+                                 "different sample rates")
+            if type(seg) is not cls:
+                raise TypeError("Only speech segments of the same type "
+                                "instance can be concatenated.")
+            transcripts += seg._transcript
+        samples = np.concatenate([seg.samples for seg in segments])
+        return cls(samples, sample_rate, transcripts)
+
+    @classmethod
+    def slice_from_file(cls, filepath, transcript, start=None, end=None):
+        """Loads a small section of a speech file without having to load
+        the entire file into memory, which can be incredibly wasteful.
+
+        :param filepath: Filepath or file object to audio file.
+        :type filepath: str|file
+        :param start: Start time in seconds. If start is negative, it wraps
+                      around from the end. If not provided, this function
+                      reads from the very beginning.
+        :type start: float
+        :param end: End time in seconds. If end is negative, it wraps around
+                    from the end. If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :param transcript: Transcript text for the speech. If not provided,
+                           the default is an empty string.
+        :type transcript: str
+        :return: SpeechSegment instance of the specified slice of the input
+                 speech file.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.slice_from_file(filepath, start, end)
+        return cls(audio.samples, audio.sample_rate, transcript)
+
+    @classmethod
+    def make_silence(cls, duration, sample_rate):
+        """Creates a silent speech segment of the given duration and
+        sample rate; the transcript will be an empty string.
+
+        :param duration: Length of silence in seconds.
+        :type duration: float
+        :param sample_rate: Sample rate.
+        :type sample_rate: float
+        :return: Silence of the given duration.
+        :rtype: SpeechSegment
+        """
+        audio = AudioSegment.make_silence(duration, sample_rate)
+        return cls(audio.samples, audio.sample_rate, "")
+
+    @property
+    def transcript(self):
+        """Return the transcript text.
+
+        :return: Transcript text for the speech.
+        :rtype: str
+        """
+        return self._transcript
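+
+# Usage sketch (editor's example, not part of the original module; the file
+# path and transcript below are hypothetical):
+#
+#     seg = SpeechSegment.from_file('utt_001.wav', transcript='hello world')
+#     padded = SpeechSegment.concatenate(
+#         SpeechSegment.make_silence(0.5, seg.sample_rate), seg)
+#     print(padded.transcript)  # 'hello world'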
diff --git a/examples/transv1.8to2.x/data_utils/utility.py b/examples/transv1.8to2.x/data_utils/utility.py
new file mode 100644
index 00000000..61e78dcb
--- /dev/null
+++ b/examples/transv1.8to2.x/data_utils/utility.py
@@ -0,0 +1,98 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains data helper functions."""
+import codecs
+import json
+import os
+import tarfile
+
+from paddle.dataset.common import md5file
+
+
+def read_manifest(manifest_path, max_duration=float('inf'), min_duration=0.0):
+    """Load and parse manifest file.
+
+    Instances with durations outside [min_duration, max_duration] will be
+    filtered out.
+
+    :param manifest_path: Manifest file to load and parse.
+    :type manifest_path: str
+    :param max_duration: Maximal duration in seconds for instance filter.
+    :type max_duration: float
+    :param min_duration: Minimal duration in seconds for instance filter.
+    :type min_duration: float
+    :return: Manifest parsing results. List of dict.
+    :rtype: list
+    :raises IOError: If failed to parse the manifest.
+    """
+    manifest = []
+    for json_line in codecs.open(manifest_path, 'r', 'utf-8'):
+        try:
+            json_data = json.loads(json_line)
+        except Exception as e:
+            raise IOError("Error reading manifest: %s" % str(e))
+        if (json_data["duration"] <= max_duration and
+                json_data["duration"] >= min_duration):
+            manifest.append(json_data)
+    return manifest
+
+
+def getfile_insensitive(path):
+    """Get the actual file path when given a case-insensitive filename."""
+    directory, filename = os.path.split(path)
+    directory, filename = (directory or '.'), filename.lower()
+    for f in os.listdir(directory):
+        newpath = os.path.join(directory, f)
+        if os.path.isfile(newpath) and f.lower() == filename:
+            return newpath
+
+
+def download_multi(url, target_dir, extra_args):
+    """Download multiple files from url to target_dir."""
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    print("Downloading %s ..." % url)
+    ret_code = os.system("wget -c " + url + ' ' + extra_args + " -P " +
+                         target_dir)
+    return ret_code
+
+
+def download(url, md5sum, target_dir):
+    """Download file from url to target_dir, and check md5sum."""
+    if not os.path.exists(target_dir):
+        os.makedirs(target_dir)
+    filepath = os.path.join(target_dir, url.split("/")[-1])
+    if not (os.path.exists(filepath) and md5file(filepath) == md5sum):
+        print("Downloading %s ..." % url)
+        os.system("wget -c " + url + " -P " + target_dir)
+        print("\nMD5 Checksum %s ..." % filepath)
+        if not md5file(filepath) == md5sum:
+            raise RuntimeError("MD5 checksum failed.")
+    else:
+        print("File exists, skip downloading. (%s)" % filepath)
+    return filepath
+
+
+def unpack(filepath, target_dir, rm_tar=False):
+    """Unpack the file to the target_dir."""
+    print("Unpacking %s ..." % filepath)
+    tar = tarfile.open(filepath)
+    tar.extractall(target_dir)
+    tar.close()
+    if rm_tar is True:
+        os.remove(filepath)
+
+
+class XmapEndSignal():
+    pass
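+
+# Usage sketch (editor's example, not part of the original module; the URL,
+# checksum and paths below are hypothetical):
+#
+#     manifest = read_manifest('data/manifest.train', max_duration=20.0)
+#     filepath = download('https://example.com/data.tar.gz',
+#                         md5sum='<expected-md5>', target_dir='dataset')
+#     unpack(filepath, 'dataset', rm_tar=True)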
diff --git a/examples/transv1.8to2.x/deepspeech/__init__.py b/examples/transv1.8to2.x/deepspeech/__init__.py
new file mode 100644
index 00000000..d85a3dde
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/__init__.py
@@ -0,0 +1,370 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Any
+from typing import List
+from typing import Tuple
+from typing import Union
+
+import paddle
+from paddle import nn
+from paddle.fluid import core
+from paddle.nn import functional as F
+
+from deepspeech.utils.log import Log
+
+#TODO(Hui Zhang): remove fluid import
+logger = Log(__name__).getlog()
+
+########### hack logging #############
+logger.warn = logger.warning
+
+########### hack paddle #############
+paddle.half = 'float16'
+paddle.float = 'float32'
+paddle.double = 'float64'
+paddle.short = 'int16'
+paddle.int = 'int32'
+paddle.long = 'int64'
+paddle.uint16 = 'uint16'
+paddle.cdouble = 'complex128'
+
+
+def convert_dtype_to_string(tensor_dtype):
+    """
+    Convert a framework tensor dtype to the corresponding paddle dtype.
+    Args:
+        tensor_dtype(core.VarDesc.VarType): the framework tensor dtype.
+    Returns:
+        The corresponding paddle dtype.
+    """
+    dtype = tensor_dtype
+    if dtype == core.VarDesc.VarType.FP32:
+        return paddle.float32
+    elif dtype == core.VarDesc.VarType.FP64:
+        return paddle.float64
+    elif dtype == core.VarDesc.VarType.FP16:
+        return paddle.float16
+    elif dtype == core.VarDesc.VarType.INT32:
+        return paddle.int32
+    elif dtype == core.VarDesc.VarType.INT16:
+        return paddle.int16
+    elif dtype == core.VarDesc.VarType.INT64:
+        return paddle.int64
+    elif dtype == core.VarDesc.VarType.BOOL:
+        return paddle.bool
+    elif dtype == core.VarDesc.VarType.BF16:
+        # since there is still no support for bfloat16 in NumPy,
+        # uint16 is used for casting bfloat16
+        return paddle.uint16
+    elif dtype == core.VarDesc.VarType.UINT8:
+        return paddle.uint8
+    elif dtype == core.VarDesc.VarType.INT8:
+        return paddle.int8
+    elif dtype == core.VarDesc.VarType.COMPLEX64:
+        return paddle.complex64
+    elif dtype == core.VarDesc.VarType.COMPLEX128:
+        return paddle.complex128
+    else:
+        raise ValueError("Not supported tensor dtype %s" % dtype)
+
+
+if not hasattr(paddle, 'softmax'):
+    logger.warn("register user softmax to paddle, remove this when fixed!")
+    setattr(paddle, 'softmax', paddle.nn.functional.softmax)
+
+if not hasattr(paddle, 'log_softmax'):
+    logger.warn("register user log_softmax to paddle, remove this when fixed!")
+    setattr(paddle, 'log_softmax', paddle.nn.functional.log_softmax)
+
+if not hasattr(paddle, 'sigmoid'):
+    logger.warn("register user sigmoid to paddle, remove this when fixed!")
+    setattr(paddle, 'sigmoid', paddle.nn.functional.sigmoid)
+
+if not hasattr(paddle, 'log_sigmoid'):
+    logger.warn("register user log_sigmoid to paddle, remove this when fixed!")
+    setattr(paddle, 'log_sigmoid', paddle.nn.functional.log_sigmoid)
+
+if not hasattr(paddle, 'relu'):
+    logger.warn("register user relu to paddle, remove this when fixed!")
+    setattr(paddle, 'relu', paddle.nn.functional.relu)
+
+
+def cat(xs, dim=0):
+    return paddle.concat(xs, axis=dim)
+
+
+if not hasattr(paddle, 'cat'):
+    logger.warn(
+        "override cat of paddle if exists or register, remove this when fixed!")
+    paddle.cat = cat
+
+
+########### hack paddle.Tensor #############
+def item(x: paddle.Tensor):
+    return x.numpy().item()
+
+
+if not hasattr(paddle.Tensor, 'item'):
+    logger.warn(
"override item of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.item = item + + +def func_long(x: paddle.Tensor): + return paddle.cast(x, paddle.long) + + +if not hasattr(paddle.Tensor, 'long'): + logger.warn( + "override long of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.long = func_long + +if not hasattr(paddle.Tensor, 'numel'): + logger.warn( + "override numel of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.numel = paddle.numel + + +def new_full(x: paddle.Tensor, + size: Union[List[int], Tuple[int], paddle.Tensor], + fill_value: Union[float, int, bool, paddle.Tensor], + dtype=None): + return paddle.full(size, fill_value, dtype=x.dtype) + + +if not hasattr(paddle.Tensor, 'new_full'): + logger.warn( + "override new_full of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.new_full = new_full + + +def eq(xs: paddle.Tensor, ys: Union[paddle.Tensor, float]) -> paddle.Tensor: + if convert_dtype_to_string(xs.dtype) == paddle.bool: + xs = xs.astype(paddle.int) + return xs.equal( + paddle.to_tensor( + ys, dtype=convert_dtype_to_string(xs.dtype), place=xs.place)) + + +if not hasattr(paddle.Tensor, 'eq'): + logger.warn( + "override eq of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.eq = eq + +if not hasattr(paddle, 'eq'): + logger.warn( + "override eq of paddle if exists or register, remove this when fixed!") + paddle.eq = eq + + +def contiguous(xs: paddle.Tensor) -> paddle.Tensor: + return xs + + +if not hasattr(paddle.Tensor, 'contiguous'): + logger.warn( + "override contiguous of paddle.Tensor if exists or register, remove this when fixed!" + ) + paddle.Tensor.contiguous = contiguous + + +def size(xs: paddle.Tensor, *args: int) -> paddle.Tensor: + nargs = len(args) + assert (nargs <= 1) + s = paddle.shape(xs) + if nargs == 1: + return s[args[0]] + else: + return s + + +#`to_static` do not process `size` property, maybe some `paddle` api dependent on it. +logger.warn( + "override size of paddle.Tensor " + "(`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!" 
+)
+paddle.Tensor.size = size
+
+
+def view(xs: paddle.Tensor, *args: int) -> paddle.Tensor:
+    return xs.reshape(args)
+
+
+if not hasattr(paddle.Tensor, 'view'):
+    logger.warn("register user view to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.view = view
+
+
+def view_as(xs: paddle.Tensor, ys: paddle.Tensor) -> paddle.Tensor:
+    return xs.reshape(ys.size())
+
+
+if not hasattr(paddle.Tensor, 'view_as'):
+    logger.warn(
+        "register user view_as to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.view_as = view_as
+
+
+def is_broadcastable(shp1, shp2):
+    for a, b in zip(shp1[::-1], shp2[::-1]):
+        if a == 1 or b == 1 or a == b:
+            pass
+        else:
+            return False
+    return True
+
+
+def masked_fill(xs: paddle.Tensor,
+                mask: paddle.Tensor,
+                value: Union[float, int]):
+    assert is_broadcastable(xs.shape, mask.shape) is True
+    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
+    mask = mask.broadcast_to(bshape)
+    trues = paddle.ones_like(xs) * value
+    xs = paddle.where(mask, trues, xs)
+    return xs
+
+
+if not hasattr(paddle.Tensor, 'masked_fill'):
+    logger.warn(
+        "register user masked_fill to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.masked_fill = masked_fill
+
+
+def masked_fill_(xs: paddle.Tensor,
+                 mask: paddle.Tensor,
+                 value: Union[float, int]) -> paddle.Tensor:
+    assert is_broadcastable(xs.shape, mask.shape) is True
+    bshape = paddle.broadcast_shape(xs.shape, mask.shape)
+    mask = mask.broadcast_to(bshape)
+    trues = paddle.ones_like(xs) * value
+    ret = paddle.where(mask, trues, xs)
+    paddle.assign(ret.detach(), output=xs)
+    return xs
+
+
+if not hasattr(paddle.Tensor, 'masked_fill_'):
+    logger.warn(
+        "register user masked_fill_ to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.masked_fill_ = masked_fill_
+
+
+def fill_(xs: paddle.Tensor, value: Union[float, int]) -> paddle.Tensor:
+    val = paddle.full_like(xs, value)
+    paddle.assign(val.detach(), output=xs)
+    return xs
+
+
+if not hasattr(paddle.Tensor, 'fill_'):
+    logger.warn("register user fill_ to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.fill_ = fill_
+
+
+def repeat(xs: paddle.Tensor, *size: Any) -> paddle.Tensor:
+    return paddle.tile(xs, size)
+
+
+if not hasattr(paddle.Tensor, 'repeat'):
+    logger.warn(
+        "register user repeat to paddle.Tensor, remove this when fixed!")
+    paddle.Tensor.repeat = repeat
+
+if not hasattr(paddle.Tensor, 'softmax'):
+    logger.warn(
+        "register user softmax to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'softmax', paddle.nn.functional.softmax)
+
+if not hasattr(paddle.Tensor, 'sigmoid'):
+    logger.warn(
+        "register user sigmoid to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'sigmoid', paddle.nn.functional.sigmoid)
+
+if not hasattr(paddle.Tensor, 'relu'):
+    logger.warn("register user relu to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'relu', paddle.nn.functional.relu)
+
+
+def type_as(x: paddle.Tensor, other: paddle.Tensor) -> paddle.Tensor:
+    return x.astype(other.dtype)
+
+
+if not hasattr(paddle.Tensor, 'type_as'):
+    logger.warn(
+        "register user type_as to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'type_as', type_as)
+
+
+def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
+    assert len(args) == 1
+    if isinstance(args[0], str):  # dtype
+        return x.astype(args[0])
+    elif isinstance(args[0], paddle.Tensor):  # Tensor
+        return x.astype(args[0].dtype)
+    else:  # Device
+        return x
+
+
+if not hasattr(paddle.Tensor, 'to'):
+    logger.warn("register user to to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'to', to)
+
+
+def func_float(x: paddle.Tensor) -> paddle.Tensor:
+    return x.astype(paddle.float)
+
+
+if not hasattr(paddle.Tensor, 'float'):
+    logger.warn("register user float to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'float', func_float)
+
+
+def func_int(x: paddle.Tensor) -> paddle.Tensor:
+    return x.astype(paddle.int)
+
+
+if not hasattr(paddle.Tensor, 'int'):
+    logger.warn("register user int to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'int', func_int)
+
+
+def tolist(x: paddle.Tensor) -> List[Any]:
+    return x.numpy().tolist()
+
+
+if not hasattr(paddle.Tensor, 'tolist'):
+    logger.warn(
+        "register user tolist to paddle.Tensor, remove this when fixed!")
+    setattr(paddle.Tensor, 'tolist', tolist)
+
+
+########### hack paddle.nn #############
+class GLU(nn.Layer):
+    """Gated Linear Units (GLU) Layer"""
+
+    def __init__(self, dim: int=-1):
+        super().__init__()
+        self.dim = dim
+
+    def forward(self, xs):
+        return F.glu(xs, axis=self.dim)
+
+
+if not hasattr(paddle.nn, 'GLU'):
+    logger.warn("register user GLU to paddle.nn, remove this when fixed!")
+    setattr(paddle.nn, 'GLU', GLU)
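+
+# Usage sketch (editor's example): after the patches above, several
+# PyTorch-style tensor methods become available on paddle.Tensor, e.g.
+#
+#     x = paddle.zeros([2, 3])
+#     mask = paddle.ones([3, 2], 'bool')
+#     y = x.view(3, 2).masked_fill(mask, 1.0)
+#     print(y.tolist())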
+"""Contains various CTC decoders.""" +import multiprocessing +from itertools import groupby +from math import log + +import numpy as np + + +def ctc_greedy_decoder(probs_seq, vocabulary): + """CTC greedy (best path) decoder. + + Path consisting of the most probable tokens are further post-processed to + remove consecutive repetitions and all blanks. + + :param probs_seq: 2-D list of probabilities over the vocabulary for each + character. Each element is a list of float probabilities + for one character. + :type probs_seq: list + :param vocabulary: Vocabulary list. + :type vocabulary: list + :return: Decoding result string. + :rtype: baseline + """ + # dimension verification + for probs in probs_seq: + if not len(probs) == len(vocabulary) + 1: + raise ValueError("probs_seq dimension mismatchedd with vocabulary") + # argmax to get the best index for each time step + max_index_list = list(np.array(probs_seq).argmax(axis=1)) + # remove consecutive duplicate indexes + index_list = [index_group[0] for index_group in groupby(max_index_list)] + # remove blank indexes + blank_index = len(vocabulary) + index_list = [index for index in index_list if index != blank_index] + # convert index list to string + return ''.join([vocabulary[index] for index in index_list]) + + +def ctc_beam_search_decoder(probs_seq, + beam_size, + vocabulary, + cutoff_prob=1.0, + cutoff_top_n=40, + ext_scoring_func=None, + nproc=False): + """CTC Beam search decoder. + + It utilizes beam search to approximately select top best decoding + labels and returning results in the descending order. + The implementation is based on Prefix Beam Search + (https://arxiv.org/abs/1408.2873), and the unclear part is + redesigned. Two important modifications: 1) in the iterative computation + of probabilities, the assignment operation is changed to accumulation for + one prefix may comes from different paths; 2) the if condition "if l^+ not + in A_prev then" after probabilities' computation is deprecated for it is + hard to understand and seems unnecessary. + + :param probs_seq: 2-D list of probability distributions over each time + step, with each element being a list of normalized + probabilities over vocabulary and blank. + :type probs_seq: 2-D list + :param beam_size: Width for beam search. + :type beam_size: int + :param vocabulary: Vocabulary list. + :type vocabulary: list + :param cutoff_prob: Cutoff probability in pruning, + default 1.0, no pruning. + :type cutoff_prob: float + :param ext_scoring_func: External scoring function for + partially decoded sentence, e.g. word count + or language model. + :type external_scoring_func: callable + :param nproc: Whether the decoder used in multiprocesses. + :type nproc: bool + :return: List of tuples of log probability and sentence as decoding + results, in descending order of the probability. + :rtype: list + """ + # dimension check + for prob_list in probs_seq: + if not len(prob_list) == len(vocabulary) + 1: + raise ValueError("The shape of prob_seq does not match with the " + "shape of the vocabulary.") + + # blank_id assign + blank_id = len(vocabulary) + + # If the decoder called in the multiprocesses, then use the global scorer + # instantiated in ctc_beam_search_decoder_batch(). 
+    if nproc is True:
+        global ext_nproc_scorer
+        ext_scoring_func = ext_nproc_scorer
+
+    # initialize
+    # prefix_set_prev: the set containing selected prefixes
+    # probs_b_prev: prefixes' probability ending with blank in previous step
+    # probs_nb_prev: prefixes' probability ending with non-blank in previous step
+    prefix_set_prev = {'\t': 1.0}
+    probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0}
+
+    # extend prefix in loop
+    for time_step in range(len(probs_seq)):
+        # prefix_set_next: the set containing candidate prefixes
+        # probs_b_cur: prefixes' probability ending with blank in current step
+        # probs_nb_cur: prefixes' probability ending with non-blank in current step
+        prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {}
+
+        prob_idx = list(enumerate(probs_seq[time_step]))
+        cutoff_len = len(prob_idx)
+        # If pruning is enabled
+        if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
+            prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
+            cutoff_len, cum_prob = 0, 0.0
+            for i in range(len(prob_idx)):
+                cum_prob += prob_idx[i][1]
+                cutoff_len += 1
+                if cum_prob >= cutoff_prob:
+                    break
+            cutoff_len = min(cutoff_len, cutoff_top_n)
+            prob_idx = prob_idx[0:cutoff_len]
+
+        for l in prefix_set_prev:
+            if l not in prefix_set_next:
+                probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0
+
+            # extend prefix by traversing prob_idx
+            for index in range(cutoff_len):
+                c, prob_c = prob_idx[index][0], prob_idx[index][1]
+
+                if c == blank_id:
+                    probs_b_cur[l] += prob_c * (
+                        probs_b_prev[l] + probs_nb_prev[l])
+                else:
+                    last_char = l[-1]
+                    new_char = vocabulary[c]
+                    l_plus = l + new_char
+                    if l_plus not in prefix_set_next:
+                        probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0
+
+                    if new_char == last_char:
+                        probs_nb_cur[l_plus] += prob_c * probs_b_prev[l]
+                        probs_nb_cur[l] += prob_c * probs_nb_prev[l]
+                    elif new_char == ' ':
+                        if (ext_scoring_func is None) or (len(l) == 1):
+                            score = 1.0
+                        else:
+                            prefix = l[1:]
+                            score = ext_scoring_func(prefix)
+                        probs_nb_cur[l_plus] += score * prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    else:
+                        probs_nb_cur[l_plus] += prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    # add l_plus into prefix_set_next
+                    prefix_set_next[l_plus] = probs_nb_cur[
+                        l_plus] + probs_b_cur[l_plus]
+            # add l into prefix_set_next
+            prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l]
+        # update probs
+        probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur
+
+        # store top beam_size prefixes
+        prefix_set_prev = sorted(
+            prefix_set_next.items(), key=lambda asd: asd[1], reverse=True)
+        if beam_size < len(prefix_set_prev):
+            prefix_set_prev = prefix_set_prev[:beam_size]
+        prefix_set_prev = dict(prefix_set_prev)
+
+    beam_result = []
+    for seq, prob in prefix_set_prev.items():
+        if prob > 0.0 and len(seq) > 1:
+            result = seq[1:]
+            # score last word by external scorer
+            if (ext_scoring_func is not None) and (result[-1] != ' '):
+                prob = prob * ext_scoring_func(result)
+            log_prob = log(prob)
+            beam_result.append((log_prob, result))
+        else:
+            beam_result.append((float('-inf'), ''))
+
+    # output top beam_size decoding results
+    beam_result = sorted(beam_result, key=lambda asd: asd[0], reverse=True)
+    return beam_result
+
+
+def ctc_beam_search_decoder_batch(probs_split,
+                                  beam_size,
+                                  vocabulary,
+                                  num_processes,
+                                  cutoff_prob=1.0,
+                                  cutoff_top_n=40,
+                                  ext_scoring_func=None):
+    """CTC beam search decoder using multiple processes.
+
+    :param probs_split: 3-D list with each element as an instance of 2-D list
+                        of probabilities used by ctc_beam_search_decoder().
+    :type probs_split: 3-D list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type ext_scoring_func: callable
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    if not num_processes > 0:
+        raise ValueError("Number of processes must be positive!")
+
+    # use a global variable to pass the external scorer to the beam search
+    # decoder workers
+    global ext_nproc_scorer
+    ext_nproc_scorer = ext_scoring_func
+    nproc = True
+
+    pool = multiprocessing.Pool(processes=num_processes)
+    results = []
+    for i, probs_list in enumerate(probs_split):
+        args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n,
+                None, nproc)
+        results.append(pool.apply_async(ctc_beam_search_decoder, args))
+
+    pool.close()
+    pool.join()
+    beam_search_results = [result.get() for result in results]
+    return beam_search_results
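+
+# Usage sketch (editor's example; `probs_seq` is a hypothetical T x (V+1)
+# matrix of per-timestep probabilities over the vocabulary plus blank):
+#
+#     best = ctc_greedy_decoder(probs_seq, vocabulary)
+#     beams = ctc_beam_search_decoder(probs_seq, beam_size=20,
+#                                     vocabulary=vocabulary)
+#     top_log_prob, top_sentence = beams[0]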
diff --git a/examples/transv1.8to2.x/deepspeech/decoders/scorer_deprecated.py b/examples/transv1.8to2.x/deepspeech/decoders/scorer_deprecated.py
new file mode 100644
index 00000000..d81fb2e3
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/decoders/scorer_deprecated.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""External Scorer for Beam Search Decoder."""
+import os
+
+import kenlm
+import numpy as np
+
+
+class Scorer(object):
+    """External scorer to evaluate a prefix or whole sentence in
+    beam search decoding, including the score from an n-gram language
+    model and a word count.
+
+    :param alpha: Parameter associated with language model. Don't use
+                  language model when alpha = 0.
+    :type alpha: float
+    :param beta: Parameter associated with word count. Don't use word
+                 count when beta = 0.
+    :type beta: float
+    :param model_path: Path to load language model.
+    :type model_path: str
+    """
+
+    def __init__(self, alpha, beta, model_path):
+        self._alpha = alpha
+        self._beta = beta
+        if not os.path.isfile(model_path):
+            raise IOError("Invalid language model path: %s" % model_path)
+        self._language_model = kenlm.LanguageModel(model_path)
+
+    # n-gram language model scoring
+    def _language_model_score(self, sentence):
+        # log10 prob of last word
+        log_cond_prob = list(
+            self._language_model.full_scores(sentence, eos=False))[-1][0]
+        return np.power(10, log_cond_prob)
+
+    # word insertion term
+    def _word_count(self, sentence):
+        words = sentence.strip().split(' ')
+        return len(words)
+
+    # reset alpha and beta
+    def reset_params(self, alpha, beta):
+        self._alpha = alpha
+        self._beta = beta
+
+    # execute evaluation
+    def __call__(self, sentence, log=False):
+        """Evaluation function that gathers all the different scores
+        and returns the final one.
+
+        :param sentence: The input sentence for evaluation.
+        :type sentence: str
+        :param log: Whether to return the score in log representation.
+        :type log: bool
+        :return: Evaluation score, in the decimal or log domain.
+        :rtype: float
+        """
+        lm = self._language_model_score(sentence)
+        word_cnt = self._word_count(sentence)
+        if log is False:
+            score = np.power(lm, self._alpha) * np.power(word_cnt, self._beta)
+        else:
+            score = self._alpha * np.log(lm) + self._beta * np.log(word_cnt)
+        return score
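+
+# Usage sketch (editor's example; the model path is hypothetical):
+#
+#     scorer = Scorer(alpha=2.5, beta=0.3, model_path='lm/language_model.klm')
+#     log_score = scorer('hello world', log=True)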
diff --git a/examples/transv1.8to2.x/deepspeech/decoders/swig_wrapper.py b/examples/transv1.8to2.x/deepspeech/decoders/swig_wrapper.py
new file mode 100644
index 00000000..d883d430
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/decoders/swig_wrapper.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wrapper for various CTC decoders in SWIG."""
+import swig_decoders
+
+
+class Scorer(swig_decoders.Scorer):
+    """Wrapper for Scorer.
+
+    :param alpha: Parameter associated with language model. Don't use
+                  language model when alpha = 0.
+    :type alpha: float
+    :param beta: Parameter associated with word count. Don't use word
+                 count when beta = 0.
+    :type beta: float
+    :param model_path: Path to load language model.
+    :type model_path: str
+    """
+
+    def __init__(self, alpha, beta, model_path, vocabulary):
+        swig_decoders.Scorer.__init__(self, alpha, beta, model_path, vocabulary)
+
+
+def ctc_greedy_decoder(probs_seq, vocabulary, blank_id):
+    """Wrapper for the CTC best path decoder in SWIG.
+
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param blank_id: ID of the blank symbol.
+    :type blank_id: int
+    :return: Decoding result string.
+    :rtype: str
+    """
+    result = swig_decoders.ctc_greedy_decoder(probs_seq.tolist(), vocabulary,
+                                              blank_id)
+    return result
+
+
+def ctc_beam_search_decoder(probs_seq,
+                            vocabulary,
+                            beam_size,
+                            cutoff_prob=1.0,
+                            cutoff_top_n=40,
+                            ext_scoring_func=None,
+                            blank_id=0):
+    """Wrapper for the CTC Beam Search Decoder.
+
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type ext_scoring_func: callable
+    :param blank_id: ID of the blank symbol.
+    :type blank_id: int
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    beam_results = swig_decoders.ctc_beam_search_decoder(
+        probs_seq.tolist(), vocabulary, beam_size, cutoff_prob, cutoff_top_n,
+        ext_scoring_func, blank_id)
+    beam_results = [(res[0], res[1].decode('utf-8')) for res in beam_results]
+    return beam_results
+
+
+def ctc_beam_search_decoder_batch(probs_split,
+                                  vocabulary,
+                                  beam_size,
+                                  num_processes,
+                                  cutoff_prob=1.0,
+                                  cutoff_top_n=40,
+                                  ext_scoring_func=None,
+                                  blank_id=0):
+    """Wrapper for the batched CTC beam search decoder.
+
+    :param probs_split: 3-D list with each element as an instance of 2-D list
+                        of probabilities used by ctc_beam_search_decoder().
+    :type probs_split: 3-D list
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in vocabulary pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                         characters with highest probs in vocabulary will be
+                         used in beam search, default 40.
+    :type cutoff_top_n: int
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type ext_scoring_func: callable
+    :param blank_id: ID of the blank symbol.
+    :type blank_id: int
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    probs_split = [probs_seq.tolist() for probs_seq in probs_split]
+
+    batch_beam_results = swig_decoders.ctc_beam_search_decoder_batch(
+        probs_split, vocabulary, beam_size, num_processes, cutoff_prob,
+        cutoff_top_n, ext_scoring_func, blank_id)
+    batch_beam_results = [[(res[0], res[1]) for res in beam_results]
+                          for beam_results in batch_beam_results]
+    return batch_beam_results
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/__init__.py b/examples/transv1.8to2.x/deepspeech/frontend/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/audio.py b/examples/transv1.8to2.x/deepspeech/frontend/audio.py
new file mode 100644
index 00000000..ffdcd4b3
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/audio.py
@@ -0,0 +1,721 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the audio segment class."""
+import copy
+import io
+import random
+import re
+import struct
+
+import numpy as np
+import resampy
+import soundfile
+import soxbindings as sox
+from scipy import signal
+
+
+class AudioSegment(object):
+    """Monaural audio segment abstraction.
+
+    :param samples: Audio samples [num_samples x num_channels].
+    :type samples: ndarray.float32
+    :param sample_rate: Audio sample rate.
+    :type sample_rate: int
+    :raises TypeError: If the sample data type is not float or int.
+    """
+
+    def __init__(self, samples, sample_rate):
+        """Create audio segment from samples.
+
+        Samples are converted to float32 internally, with integer types
+        scaled to [-1, 1].
+        """
+        self._samples = self._convert_samples_to_float32(samples)
+        self._sample_rate = sample_rate
+        if self._samples.ndim >= 2:
+            self._samples = np.mean(self._samples, 1)
+
+    def __eq__(self, other):
+        """Return whether two objects are equal."""
+        if type(other) is not type(self):
+            return False
+        if self._sample_rate != other._sample_rate:
+            return False
+        if self._samples.shape != other._samples.shape:
+            return False
+        if np.any(self.samples != other._samples):
+            return False
+        return True
+
+    def __ne__(self, other):
+        """Return whether two objects are unequal."""
+        return not self.__eq__(other)
+
+    def __str__(self):
+        """Return a human-readable representation of the segment."""
+        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
+                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
+                                self.duration, self.rms_db))
+
+    @classmethod
+    def from_file(cls, file):
+        """Create audio segment from audio file.
+
+        :param file: Filepath or file object to audio file.
+        :type file: str|file
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        """
+        if isinstance(file, str) and re.findall(r".seqbin_\d+$", file):
+            return cls.from_sequence_file(file)
+        else:
+            samples, sample_rate = soundfile.read(file, dtype='float32')
+            return cls(samples, sample_rate)
+
+    @classmethod
+    def slice_from_file(cls, file, start=None, end=None):
+        """Loads a small section of an audio file without having to load
+        the entire file into memory, which can be incredibly wasteful.
+
+        :param file: Input audio filepath or file object.
+        :type file: str|file
+        :param start: Start time in seconds. If start is negative, it wraps
+                      around from the end. If not provided, this function
+                      reads from the very beginning.
+        :type start: float
+        :param end: End time in seconds. If end is negative, it wraps around
+                    from the end. If not provided, the default behavior is
+                    to read to the end of the file.
+        :type end: float
+        :return: AudioSegment instance of the specified slice of the input
+                 audio file.
+        :rtype: AudioSegment
+        :raise ValueError: If start or end is incorrectly set, e.g. out of
+                           bounds in time.
+        """
+        sndfile = soundfile.SoundFile(file)
+        sample_rate = sndfile.samplerate
+        duration = float(len(sndfile)) / sample_rate
+        start = 0. if start is None else start
+        end = duration if end is None else end
+        if start < 0.0:
+            start += duration
+        if end < 0.0:
+            end += duration
+        if start < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start)
+        if end < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end)
+        if start > end:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the slice end position (%f s)." % (start, end))
+        if end > duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end, duration))
+        start_frame = int(start * sample_rate)
+        end_frame = int(end * sample_rate)
+        sndfile.seek(start_frame)
+        data = sndfile.read(frames=end_frame - start_frame, dtype='float32')
+        return cls(data, sample_rate)
+
+    @classmethod
+    def from_sequence_file(cls, filepath):
+        """Create audio segment from sequence file. A sequence file is a
+        binary file containing a collection of multiple audio files, with
+        several header bytes in the head indicating the offsets of each
+        audio byte data chunk.
+
+        The format is:
+
+            4 bytes (int, version),
+            4 bytes (int, num of utterance),
+            4 bytes (int, bytes per header),
+            [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio),
+            audio_bytes_data_of_1st_utterance,
+            audio_bytes_data_of_2nd_utterance,
+            ......
+
+        Sequence file name must end with ".seqbin". And the filename of the 5th
+        utterance's audio file in sequence file "xxx.seqbin" must be
+        "xxx.seqbin_5", with "5" indicating the utterance index within this
+        sequence file (starting from 1).
+
+        :param filepath: Filepath of sequence file.
+        :type filepath: str
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        """
+        # parse filepath
+        matches = re.match(r"(.+\.seqbin)_(\d+)", filepath)
+        if matches is None:
+            raise IOError("File type of %s is not supported" % filepath)
+        filename = matches.group(1)
+        fileno = int(matches.group(2))
+
+        # read headers (binary mode takes no encoding argument)
+        f = io.open(filename, mode='rb')
+        version = f.read(4)
+        num_utterances = struct.unpack("i", f.read(4))[0]
+        bytes_per_header = struct.unpack("i", f.read(4))[0]
+        header_bytes = f.read(bytes_per_header * (num_utterances + 1))
+        header = [
+            struct.unpack("i", header_bytes[bytes_per_header * i:
+                                            bytes_per_header * (i + 1)])[0]
+            for i in range(num_utterances + 1)
+        ]
+
+        # read audio bytes
+        f.seek(header[fileno - 1])
+        audio_bytes = f.read(header[fileno] - header[fileno - 1])
+        f.close()
+
+        # create audio segment
+        try:
+            return cls.from_bytes(audio_bytes)
+        except Exception as e:
+            samples = np.frombuffer(audio_bytes, dtype='int16')
+            return cls(samples=samples, sample_rate=8000)
+
+    @classmethod
+    def from_bytes(cls, bytes):
+        """Create audio segment from a byte string containing audio samples.
+
+        :param bytes: Byte string containing audio samples.
+        :type bytes: str
+        :return: Audio segment instance.
+        :rtype: AudioSegment
+        """
+        samples, sample_rate = soundfile.read(
+            io.BytesIO(bytes), dtype='float32')
+        return cls(samples, sample_rate)
+
+    @classmethod
+    def concatenate(cls, *segments):
+        """Concatenate an arbitrary number of audio segments together.
+
+        :param *segments: Input audio segments to be concatenated.
+        :type *segments: tuple of AudioSegment
+        :return: Audio segment instance as concatenating results.
+        :rtype: AudioSegment
+        :raises ValueError: If the number of segments is zero, or if the
+                            sample_rate of any segments does not match.
+        :raises TypeError: If any segment is not AudioSegment instance.
+        """
+        # Perform basic sanity-checks.
+        if len(segments) == 0:
+            raise ValueError("No audio segments are given to concatenate.")
+        sample_rate = segments[0]._sample_rate
+        for seg in segments:
+            if sample_rate != seg._sample_rate:
+                raise ValueError("Can't concatenate segments with "
+                                 "different sample rates")
+            if type(seg) is not cls:
+                raise TypeError("Only audio segments of the same type "
+                                "can be concatenated.")
+        samples = np.concatenate([seg.samples for seg in segments])
+        return cls(samples, sample_rate)
+
+    @classmethod
+    def make_silence(cls, duration, sample_rate):
+        """Creates a silent audio segment of the given duration and sample rate.
+
+        :param duration: Length of silence in seconds.
+        :type duration: float
+        :param sample_rate: Sample rate.
+        :type sample_rate: float
+        :return: Silent AudioSegment instance of the given duration.
+        :rtype: AudioSegment
+        """
+        samples = np.zeros(int(duration * sample_rate))
+        return cls(samples, sample_rate)
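+
+    # Editor's note: a quick sketch of the constructors above (paths are
+    # hypothetical; concatenation assumes equal sample rates):
+    #
+    #     a = AudioSegment.from_file('a.wav')
+    #     b = AudioSegment.slice_from_file('b.wav', start=1.0, end=2.5)
+    #     c = AudioSegment.concatenate(
+    #         a, AudioSegment.make_silence(0.3, a.sample_rate), b)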
+ """ + samples = self._convert_samples_from_float32(self._samples, dtype) + subtype_map = { + 'int16': 'PCM_16', + 'int32': 'PCM_32', + 'float32': 'FLOAT', + 'float64': 'DOUBLE' + } + soundfile.write( + filepath, + samples, + self._sample_rate, + format='WAV', + subtype=subtype_map[dtype]) + + def superimpose(self, other): + """Add samples from another segment to those of this segment + (sample-wise addition, not segment concatenation). + + Note that this is an in-place transformation. + + :param other: Segment containing samples to be added in. + :type other: AudioSegments + :raise TypeError: If type of two segments don't match. + :raise ValueError: If the sample rates of the two segments are not + equal, or if the lengths of segments don't match. + """ + if isinstance(other, type(self)): + raise TypeError("Cannot add segments of different types: %s " + "and %s." % (type(self), type(other))) + if self._sample_rate != other._sample_rate: + raise ValueError("Sample rates must match to add segments.") + if len(self._samples) != len(other._samples): + raise ValueError("Segment lengths must match to add segments.") + self._samples += other._samples + + def to_bytes(self, dtype='float32'): + """Create a byte string containing the audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :return: Byte string containing audio content. + :rtype: str + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples.tostring() + + def to(self, dtype='int16'): + """Create a `dtype` audio content. + + :param dtype: Data type for export samples. Options: 'int16', 'int32', + 'float32', 'float64'. Default is 'float32'. + :type dtype: str + :return: np.ndarray containing `dtype` audio content. + :rtype: str + """ + samples = self._convert_samples_from_float32(self._samples, dtype) + return samples + + def gain_db(self, gain): + """Apply gain in decibels to samples. + + Note that this is an in-place transformation. + + :param gain: Gain in decibels to apply to samples. + :type gain: float|1darray + """ + self._samples *= 10.**(gain / 20.) + + def change_speed(self, speed_rate): + """Change the audio speed by linear interpolation. + + Note that this is an in-place transformation. + + :param speed_rate: Rate of speed change: + speed_rate > 1.0, speed up the audio; + speed_rate = 1.0, unchanged; + speed_rate < 1.0, slow down the audio; + speed_rate <= 0.0, not allowed, raise ValueError. + :type speed_rate: float + :raises ValueError: If speed_rate <= 0.0. + """ + if speed_rate == 1.0: + return + if speed_rate <= 0: + raise ValueError("speed_rate should be greater than zero.") + + # numpy + # old_length = self._samples.shape[0] + # new_length = int(old_length / speed_rate) + # old_indices = np.arange(old_length) + # new_indices = np.linspace(start=0, stop=old_length, num=new_length) + # self._samples = np.interp(new_indices, old_indices, self._samples) + + # sox, slow + tfm = sox.Transformer() + tfm.set_globals(multithread=False) + tfm.speed(speed_rate) + self._samples = tfm.build_array( + input_array=self._samples, + sample_rate_in=self._sample_rate).squeeze(-1).astype( + np.float32).copy() + + def normalize(self, target_db=-20, max_gain_db=300.0): + """Normalize audio to be of the desired RMS value in decibels. + + Note that this is an in-place transformation. + + :param target_db: Target RMS value in decibels. This value should be + less than 0.0 as 0.0 is full-scale audio. 
+        :type target_db: float
+        :param max_gain_db: Max amount of gain in dB that can be applied for
+                            normalization. This is to prevent nans when
+                            attempting to normalize a signal consisting of
+                            all zeros.
+        :type max_gain_db: float
+        :raises ValueError: If the required gain to normalize the segment to
+                            the target_db value exceeds max_gain_db.
+        """
+        gain = target_db - self.rms_db
+        if gain > max_gain_db:
+            raise ValueError(
+                "Unable to normalize segment to %f dB because the "
+                "required gain exceeds max_gain_db (%f dB)" %
+                (target_db, max_gain_db))
+        self.gain_db(min(max_gain_db, target_db - self.rms_db))
+
+    def normalize_online_bayesian(self,
+                                  target_db,
+                                  prior_db,
+                                  prior_samples,
+                                  startup_delay=0.0):
+        """Normalize audio using a production-compatible online/causal
+        algorithm. This uses an exponential likelihood and gamma prior to
+        make online estimates of the RMS even when there are very few samples.
+
+        Note that this is an in-place transformation.
+
+        :param target_db: Target RMS value in decibels.
+        :type target_db: float
+        :param prior_db: Prior RMS estimate in decibels.
+        :type prior_db: float
+        :param prior_samples: Prior strength in number of samples.
+        :type prior_samples: float
+        :param startup_delay: Default 0.0s. If provided, this function will
+                              accrue statistics for the first startup_delay
+                              seconds before applying online normalization.
+        :type startup_delay: float
+        """
+        # Estimate total RMS online.
+        startup_sample_idx = min(self.num_samples - 1,
+                                 int(self.sample_rate * startup_delay))
+        prior_mean_squared = 10.**(prior_db / 10.)
+        prior_sum_of_squares = prior_mean_squared * prior_samples
+        cumsum_of_squares = np.cumsum(self.samples**2)
+        sample_count = np.arange(self.num_samples) + 1
+        if startup_sample_idx > 0:
+            cumsum_of_squares[:startup_sample_idx] = \
+                cumsum_of_squares[startup_sample_idx]
+            sample_count[:startup_sample_idx] = \
+                sample_count[startup_sample_idx]
+        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
+                                 (sample_count + prior_samples))
+        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
+        # Compute required time-varying gain.
+        gain_db = target_db - rms_estimate_db
+        self.gain_db(gain_db)
+
+    def resample(self, target_sample_rate, filter='kaiser_best'):
+        """Resample the audio to a target sample rate.
+
+        Note that this is an in-place transformation.
+
+        :param target_sample_rate: Target sample rate.
+        :type target_sample_rate: int
+        :param filter: The resampling filter to use, one of {'kaiser_best',
+                       'kaiser_fast'}.
+        :type filter: str
+        """
+        self._samples = resampy.resample(
+            self.samples, self.sample_rate, target_sample_rate, filter=filter)
+        self._sample_rate = target_sample_rate
+
+    def pad_silence(self, duration, sides='both'):
+        """Pad this audio sample with a period of silence.
+
+        Note that this is an in-place transformation.
+
+        :param duration: Length of silence in seconds to pad.
+        :type duration: float
+        :param sides: Position for padding:
+                      'beginning' - adds silence in the beginning;
+                      'end' - adds silence in the end;
+                      'both' - adds silence in both the beginning and the end.
+        :type sides: str
+        :raises ValueError: If sides is not supported.
+ """ + if duration == 0.0: + return self + cls = type(self) + silence = self.make_silence(duration, self._sample_rate) + if sides == "beginning": + padded = cls.concatenate(silence, self) + elif sides == "end": + padded = cls.concatenate(self, silence) + elif sides == "both": + padded = cls.concatenate(silence, self, silence) + else: + raise ValueError("Unknown value for the sides %s" % sides) + self._samples = padded._samples + + def shift(self, shift_ms): + """Shift the audio in time. If `shift_ms` is positive, shift with time + advance; if negative, shift with time delay. Silence are padded to + keep the duration unchanged. + + Note that this is an in-place transformation. + + :param shift_ms: Shift time in millseconds. If positive, shift with + time advance; if negative; shift with time delay. + :type shift_ms: float + :raises ValueError: If shift_ms is longer than audio duration. + """ + if abs(shift_ms) / 1000.0 > self.duration: + raise ValueError("Absolute value of shift_ms should be smaller " + "than audio duration.") + shift_samples = int(shift_ms * self._sample_rate / 1000) + if shift_samples > 0: + # time advance + self._samples[:-shift_samples] = self._samples[shift_samples:] + self._samples[-shift_samples:] = 0 + elif shift_samples < 0: + # time delay + self._samples[-shift_samples:] = self._samples[:shift_samples] + self._samples[:-shift_samples] = 0 + + def subsegment(self, start_sec=None, end_sec=None): + """Cut the AudioSegment between given boundaries. + + Note that this is an in-place transformation. + + :param start_sec: Beginning of subsegment in seconds. + :type start_sec: float + :param end_sec: End of subsegment in seconds. + :type end_sec: float + :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out + of bounds in time. + """ + start_sec = 0.0 if start_sec is None else start_sec + end_sec = self.duration if end_sec is None else end_sec + if start_sec < 0.0: + start_sec = self.duration + start_sec + if end_sec < 0.0: + end_sec = self.duration + end_sec + if start_sec < 0.0: + raise ValueError("The slice start position (%f s) is out of " + "bounds." % start_sec) + if end_sec < 0.0: + raise ValueError("The slice end position (%f s) is out of bounds." % + end_sec) + if start_sec > end_sec: + raise ValueError("The slice start position (%f s) is later than " + "the end position (%f s)." % (start_sec, end_sec)) + if end_sec > self.duration: + raise ValueError("The slice end position (%f s) is out of bounds " + "(> %f s)" % (end_sec, self.duration)) + start_sample = int(round(start_sec * self._sample_rate)) + end_sample = int(round(end_sec * self._sample_rate)) + self._samples = self._samples[start_sample:end_sample] + + def random_subsegment(self, subsegment_length, rng=None): + """Cut the specified length of the audiosegment randomly. + + Note that this is an in-place transformation. + + :param subsegment_length: Subsegment length in seconds. + :type subsegment_length: float + :param rng: Random number generator state. + :type rng: random.Random + :raises ValueError: If the length of subsegment is greater than + the origineal segemnt. 
+ """ + rng = random.Random() if rng is None else rng + if subsegment_length > self.duration: + raise ValueError("Length of subsegment must not be greater " + "than original segment.") + start_time = rng.uniform(0.0, self.duration - subsegment_length) + self.subsegment(start_time, start_time + subsegment_length) + + def convolve(self, impulse_segment, allow_resample=False): + """Convolve this audio segment with the given impulse segment. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + :raises ValueError: If the sample rate is not match between two + audio segments when resample is not allowed. + """ + if allow_resample and self.sample_rate != impulse_segment.sample_rate: + impulse_segment.resample(self.sample_rate) + if self.sample_rate != impulse_segment.sample_rate: + raise ValueError("Impulse segment's sample rate (%d Hz) is not " + "equal to base signal sample rate (%d Hz)." % + (impulse_segment.sample_rate, self.sample_rate)) + samples = signal.fftconvolve(self.samples, impulse_segment.samples, + "full") + self._samples = samples + + def convolve_and_normalize(self, impulse_segment, allow_resample=False): + """Convolve and normalize the resulting audio segment so that it + has the same average power as the input signal. + + Note that this is an in-place transformation. + + :param impulse_segment: Impulse response segments. + :type impulse_segment: AudioSegment + :param allow_resample: Indicates whether resampling is allowed when + the impulse_segment has a different sample + rate from this signal. + :type allow_resample: bool + """ + target_db = self.rms_db + self.convolve(impulse_segment, allow_resample=allow_resample) + self.normalize(target_db) + + def add_noise(self, + noise, + snr_dB, + allow_downsampling=False, + max_gain_db=300.0, + rng=None): + """Add the given noise segment at a specific signal-to-noise ratio. + If the noise segment is longer than this segment, a random subsegment + of matching length is sampled from it and used instead. + + Note that this is an in-place transformation. + + :param noise: Noise signal to add. + :type noise: AudioSegment + :param snr_dB: Signal-to-Noise Ratio, in decibels. + :type snr_dB: float + :param allow_downsampling: Whether to allow the noise signal to be + downsampled to match the base signal sample + rate. + :type allow_downsampling: bool + :param max_gain_db: Maximum amount of gain to apply to noise signal + before adding it in. This is to prevent attempting + to apply infinite gain to a zero signal. + :type max_gain_db: float + :param rng: Random number generator state. + :type rng: None|random.Random + :raises ValueError: If the sample rate does not match between the two + audio segments when downsampling is not allowed, or + if the duration of noise segments is shorter than + original audio segments. + """ + rng = random.Random() if rng is None else rng + if allow_downsampling and noise.sample_rate > self.sample_rate: + noise = noise.resample(self.sample_rate) + if noise.sample_rate != self.sample_rate: + raise ValueError("Noise sample rate (%d Hz) is not equal to base " + "signal sample rate (%d Hz)." 
+                             % (noise.sample_rate, self.sample_rate))
+        if noise.duration < self.duration:
+            raise ValueError("Noise signal (%f sec) must be at least as long as"
+                             " base signal (%f sec)." %
+                             (noise.duration, self.duration))
+        noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
+        noise_new = copy.deepcopy(noise)
+        noise_new.random_subsegment(self.duration, rng=rng)
+        noise_new.gain_db(noise_gain_db)
+        self.superimpose(noise_new)
+
+    @property
+    def samples(self):
+        """Return audio samples.
+
+        :return: Audio samples.
+        :rtype: ndarray
+        """
+        return self._samples.copy()
+
+    @property
+    def sample_rate(self):
+        """Return audio sample rate.
+
+        :return: Audio sample rate.
+        :rtype: int
+        """
+        return self._sample_rate
+
+    @property
+    def num_samples(self):
+        """Return number of samples.
+
+        :return: Number of samples.
+        :rtype: int
+        """
+        return self._samples.shape[0]
+
+    @property
+    def duration(self):
+        """Return audio duration.
+
+        :return: Audio duration in seconds.
+        :rtype: float
+        """
+        return self._samples.shape[0] / float(self._sample_rate)
+
+    @property
+    def rms_db(self):
+        """Return root mean square energy of the audio in decibels.
+
+        :return: Root mean square energy in decibels.
+        :rtype: float
+        """
+        # mean square is a power quantity, so multiply by 10 instead of 20 for dB
+        mean_square = np.mean(self._samples**2)
+        return 10 * np.log10(mean_square)
+
+    def _convert_samples_to_float32(self, samples):
+        """Convert sample type to float32.
+
+        Audio sample type is usually integer or floating-point.
+        Integers will be scaled to [-1, 1] in float32.
+        """
+        float32_samples = samples.astype('float32')
+        if samples.dtype in np.sctypes['int']:
+            bits = np.iinfo(samples.dtype).bits
+            float32_samples *= (1. / 2**(bits - 1))
+        elif samples.dtype in np.sctypes['float']:
+            pass
+        else:
+            raise TypeError("Unsupported sample type: %s." % samples.dtype)
+        return float32_samples
+
+    def _convert_samples_from_float32(self, samples, dtype):
+        """Convert sample type from float32 to dtype.
+
+        Audio sample type is usually integer or floating-point. For integer
+        types, float32 will be rescaled from [-1, 1] to the maximum range
+        supported by the integer type.
+
+        This is for writing an audio file.
+        """
+        dtype = np.dtype(dtype)
+        output_samples = samples.copy()
+        if dtype in np.sctypes['int']:
+            bits = np.iinfo(dtype).bits
+            output_samples *= (2**(bits - 1) / 1.)
+            min_val = np.iinfo(dtype).min
+            max_val = np.iinfo(dtype).max
+            output_samples[output_samples > max_val] = max_val
+            output_samples[output_samples < min_val] = min_val
+        elif dtype in np.sctypes['float']:
+            # the original checked samples.dtype here; the target dtype is
+            # what decides the clipping range
+            min_val = np.finfo(dtype).min
+            max_val = np.finfo(dtype).max
+            output_samples[output_samples > max_val] = max_val
+            output_samples[output_samples < min_val] = min_val
+        else:
+            raise TypeError("Unsupported sample type: %s." % dtype)
+        return output_samples.astype(dtype)
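For reference, a minimal sketch of the SNR arithmetic that `add_noise` above implements; the wav paths are placeholders, `from_file` and the segment methods are the ones defined in this audio.py, and the two files are assumed to share a sample rate:

```python
import random

from deepspeech.frontend.audio import AudioSegment

speech = AudioSegment.from_file("speech.wav")  # hypothetical path
noise = AudioSegment.from_file("noise.wav")    # must be >= speech.duration

# add_noise gains the noise so that rms(speech) - rms(gained noise) == snr_dB,
# capped at max_gain_db to avoid infinite gain on near-silent noise
snr_dB = 20.0
noise_gain_db = min(speech.rms_db - noise.rms_db - snr_dB, 300.0)
print("noise gain to apply:", noise_gain_db)

speech.add_noise(noise, snr_dB, rng=random.Random(0))  # in-place
```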
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/__init__.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/augmentation.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/augmentation.py
new file mode 100644
index 00000000..17abcf60
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/augmentation.py
@@ -0,0 +1,218 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the data augmentation pipeline."""
+import json
+from collections.abc import Sequence
+from inspect import signature
+
+import numpy as np
+
+from deepspeech.frontend.augmentor.base import AugmentorBase
+from deepspeech.utils.dynamic_import import dynamic_import
+from deepspeech.utils.log import Log
+
+__all__ = ["AugmentationPipeline"]
+
+logger = Log(__name__).getlog()
+
+import_alias = dict(
+    volume="deepspeech.frontend.augmentor.volume_perturb:VolumePerturbAugmentor",
+    shift="deepspeech.frontend.augmentor.shift_perturb:ShiftPerturbAugmentor",
+    speed="deepspeech.frontend.augmentor.speed_perturb:SpeedPerturbAugmentor",
+    resample="deepspeech.frontend.augmentor.resample:ResampleAugmentor",
+    bayesian_normal="deepspeech.frontend.augmentor.online_bayesian_normalization:OnlineBayesianNormalizationAugmentor",
+    noise="deepspeech.frontend.augmentor.noise_perturb:NoisePerturbAugmentor",
+    impulse="deepspeech.frontend.augmentor.impulse_response:ImpulseResponseAugmentor",
+    specaug="deepspeech.frontend.augmentor.spec_augment:SpecAugmentor", )
+
+
+class AugmentationPipeline():
+    """Build a pre-processing pipeline with various augmentation models. Such a
+    data augmentation pipeline is often leveraged to augment the training
+    samples to make the model invariant to certain types of perturbations in
+    the real world, improving the model's generalization ability.
+
+    The pipeline is built according to the augmentation configuration in a
+    JSON string, e.g.
+
+    .. code-block:: json
+
+        [ {
+                "type": "noise",
+                "params": {"min_snr_dB": 10,
+                           "max_snr_dB": 20,
+                           "noise_manifest_path": "datasets/manifest.noise"},
+                "prob": 0.0
+            },
+            {
+                "type": "speed",
+                "params": {"min_speed_rate": 0.9,
+                           "max_speed_rate": 1.1},
+                "prob": 1.0
+            },
+            {
+                "type": "shift",
+                "params": {"min_shift_ms": -5,
+                           "max_shift_ms": 5},
+                "prob": 1.0
+            },
+            {
+                "type": "volume",
+                "params": {"min_gain_dBFS": -10,
+                           "max_gain_dBFS": 10},
+                "prob": 0.0
+            },
+            {
+                "type": "bayesian_normal",
+                "params": {"target_db": -20,
+                           "prior_db": -20,
+                           "prior_samples": 100},
+                "prob": 0.0
+            }
+        ]
+
+    This augmentation configuration inserts five augmentation models into the
+    pipeline. "prob" indicates the probability that the corresponding
+    augmentor takes effect; with the probabilities shown, only the
+    SpeedPerturbAugmentor and the ShiftPerturbAugmentor are ever applied,
+    since an augmentor with "prob" zero never takes effect.
+
+    Params:
+        augmentation_config(str): Augmentation configuration in JSON string.
+        random_seed(int): Random seed.
+        train(bool): whether in train mode.
+
+    Raises:
+        ValueError: If the augmentation JSON config is in an incorrect format.
+    """
+
+    SPEC_TYPES = {'specaug'}
+
+    def __init__(self, augmentation_config: str, random_seed: int=0):
+        self._rng = np.random.RandomState(random_seed)
+        self.conf = {'mode': 'sequential', 'process': []}
+        if augmentation_config:
+            process = json.loads(augmentation_config)
+            self.conf['process'] += process
+
+        self._augmentors, self._rates = self._parse_pipeline_from('all')
+        self._audio_augmentors, self._audio_rates = self._parse_pipeline_from(
+            'audio')
+        self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
+            'feature')
+
+    def __call__(self, xs, uttid_list=None, **kwargs):
+        if not isinstance(xs, Sequence):
+            is_batch = False
+            xs = [xs]
+        else:
+            is_batch = True
+
+        if isinstance(uttid_list, str):
+            uttid_list = [uttid_list for _ in range(len(xs))]
+
+        if self.conf.get("mode", "sequential") == "sequential":
+            for idx, (func, rate) in enumerate(
+                    zip(self._augmentors, self._rates), 0):
+                if self._rng.uniform(0., 1.) >= rate:
+                    continue
+
+                # Derive only the args which the func has
+                try:
+                    param = signature(func).parameters
+                except ValueError:
+                    # Some functions, e.g. built-in functions, fail here
+                    param = {}
+                _kwargs = {k: v for k, v in kwargs.items() if k in param}
+
+                try:
+                    if uttid_list is not None and "uttid" in param:
+                        xs = [
+                            func(x, u, **_kwargs)
+                            for x, u in zip(xs, uttid_list)
+                        ]
+                    else:
+                        xs = [func(x, **_kwargs) for x in xs]
+                except Exception:
+                    logger.fatal("Caught an exception from the {}th func: {}".
+                                 format(idx, func))
+                    raise
+        else:
+            raise NotImplementedError(
+                "Unsupported mode={}".format(self.conf["mode"]))
+
+        if is_batch:
+            return xs
+        else:
+            return xs[0]
+
+    def transform_audio(self, audio_segment):
+        """Run the pre-processing pipeline for data augmentation.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to process.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
+            if self._rng.uniform(0., 1.) < rate:
+                augmentor.transform_audio(audio_segment)
< rate: + spec_segment = augmentor.transform_feature(spec_segment) + return spec_segment + + def _parse_pipeline_from(self, aug_type='all'): + """Parse the config json to build a augmentation pipelien.""" + assert aug_type in ('audio', 'feature', 'all'), aug_type + audio_confs = [] + feature_confs = [] + all_confs = [] + for config in self.conf['process']: + all_confs.append(config) + if config["type"] in self.SPEC_TYPES: + feature_confs.append(config) + else: + audio_confs.append(config) + + if aug_type == 'audio': + aug_confs = audio_confs + elif aug_type == 'feature': + aug_confs = feature_confs + else: + aug_confs = all_confs + + augmentors = [ + self._get_augmentor(config["type"], config["params"]) + for config in aug_confs + ] + rates = [config["prob"] for config in aug_confs] + return augmentors, rates + + def _get_augmentor(self, augmentor_type, params): + """Return an augmentation model by the type name, and pass in params.""" + class_obj = dynamic_import(augmentor_type, import_alias) + assert issubclass(class_obj, AugmentorBase) + try: + obj = class_obj(self._rng, **params) + except Exception: + raise ValueError("Unknown augmentor type [%s]." % augmentor_type) + return obj diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/base.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/base.py new file mode 100644 index 00000000..18d003c0 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/base.py @@ -0,0 +1,59 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the abstract base class for augmentation models.""" +from abc import ABCMeta +from abc import abstractmethod + + +class AugmentorBase(): + """Abstract base class for augmentation model (augmentor) class. + All augmentor classes should inherit from this class, and implement the + following abstract methods. + """ + + __metaclass__ = ABCMeta + + @abstractmethod + def __init__(self): + pass + + @abstractmethod + def __call__(self, xs): + raise NotImplementedError("AugmentorBase: Not impl __call__") + + @abstractmethod + def transform_audio(self, audio_segment): + """Adds various effects to the input audio segment. Such effects + will augment the training data to make the model invariant to certain + types of perturbations in the real world, improving model's + generalization ability. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + raise NotImplementedError("AugmentorBase: Not impl transform_audio") + + @abstractmethod + def transform_feature(self, spec_segment): + """Adds various effects to the input audo feature segment. Such effects + will augment the training data to make the model invariant to certain + types of time_mask or freq_mask in the real world, improving model's + generalization ability. + + Args: + spec_segment (Spectrogram): Spectrogram segment to add effects to. 
+ """ + raise NotImplementedError("AugmentorBase: Not impl transform_feature") diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/impulse_response.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/impulse_response.py new file mode 100644 index 00000000..818251ed --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/impulse_response.py @@ -0,0 +1,50 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the impulse response augmentation model.""" +from deepspeech.frontend.audio import AudioSegment +from deepspeech.frontend.augmentor.base import AugmentorBase +from deepspeech.frontend.utility import read_manifest + + +class ImpulseResponseAugmentor(AugmentorBase): + """Augmentation model for adding impulse response effect. + + :param rng: Random generator object. + :type rng: random.Random + :param impulse_manifest_path: Manifest path for impulse audio data. + :type impulse_manifest_path: str + """ + + def __init__(self, rng, impulse_manifest_path): + self._rng = rng + self._impulse_manifest = read_manifest(impulse_manifest_path) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Add impulse response effect. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegmenet|SpeechSegment + """ + impulse_json = self._rng.choice( + self._impulse_manifest, 1, replace=False)[0] + impulse_segment = AudioSegment.from_file(impulse_json['audio_filepath']) + audio_segment.convolve(impulse_segment, allow_resample=True) diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/noise_perturb.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/noise_perturb.py new file mode 100644 index 00000000..790b0c39 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/noise_perturb.py @@ -0,0 +1,64 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the noise perturb augmentation model.""" +from deepspeech.frontend.audio import AudioSegment +from deepspeech.frontend.augmentor.base import AugmentorBase +from deepspeech.frontend.utility import read_manifest + + +class NoisePerturbAugmentor(AugmentorBase): + """Augmentation model for adding background noise. + + :param rng: Random generator object. 
+    :type rng: np.random.RandomState
+    :param min_snr_dB: Minimal signal-to-noise ratio, in decibels.
+    :type min_snr_dB: float
+    :param max_snr_dB: Maximal signal-to-noise ratio, in decibels.
+    :type max_snr_dB: float
+    :param noise_manifest_path: Manifest path for noise audio data.
+    :type noise_manifest_path: str
+    """
+
+    def __init__(self, rng, min_snr_dB, max_snr_dB, noise_manifest_path):
+        self._min_snr_dB = min_snr_dB
+        self._max_snr_dB = max_snr_dB
+        self._rng = rng
+        self._noise_manifest = read_manifest(manifest_path=noise_manifest_path)
+
+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
+    def transform_audio(self, audio_segment):
+        """Add background noise audio.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        noise_json = self._rng.choice(self._noise_manifest, 1, replace=False)[0]
+        if noise_json['duration'] < audio_segment.duration:
+            raise RuntimeError("The duration of the sampled noise audio is "
+                               "shorter than that of the audio segment to add "
+                               "effects to.")
+        diff_duration = noise_json['duration'] - audio_segment.duration
+        start = self._rng.uniform(0, diff_duration)
+        end = start + audio_segment.duration
+        noise_segment = AudioSegment.slice_from_file(
+            noise_json['audio_filepath'], start=start, end=end)
+        snr_dB = self._rng.uniform(self._min_snr_dB, self._max_snr_dB)
+        audio_segment.add_noise(
+            noise_segment, snr_dB, allow_downsampling=True, rng=self._rng)
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/online_bayesian_normalization.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/online_bayesian_normalization.py
new file mode 100644
index 00000000..0f9d3ef6
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/online_bayesian_normalization.py
@@ -0,0 +1,63 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the online Bayesian normalization augmentation model."""
+from deepspeech.frontend.augmentor.base import AugmentorBase
+
+
+class OnlineBayesianNormalizationAugmentor(AugmentorBase):
+    """Augmentation model for adding online Bayesian normalization.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param target_db: Target RMS value in decibels.
+    :type target_db: float
+    :param prior_db: Prior RMS estimate in decibels.
+    :type prior_db: float
+    :param prior_samples: Prior strength in number of samples.
+    :type prior_samples: int
+    :param startup_delay: Default 0.0s. If provided, this function will
+                          accrue statistics for the first startup_delay
+                          seconds before applying online normalization.
+    :type startup_delay: float
+ """ + + def __init__(self, + rng, + target_db, + prior_db, + prior_samples, + startup_delay=0.0): + self._target_db = target_db + self._prior_db = prior_db + self._prior_samples = prior_samples + self._rng = rng + self._startup_delay = startup_delay + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Normalizes the input audio using the online Bayesian approach. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegment|SpeechSegment + """ + audio_segment.normalize_online_bayesian(self._target_db, self._prior_db, + self._prior_samples, + self._startup_delay) diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/resample.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/resample.py new file mode 100644 index 00000000..509fe003 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/resample.py @@ -0,0 +1,48 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Contain the resample augmentation model.""" +from deepspeech.frontend.augmentor.base import AugmentorBase + + +class ResampleAugmentor(AugmentorBase): + """Augmentation model for resampling. + + See more info here: + https://ccrma.stanford.edu/~jos/resample/index.html + + :param rng: Random generator object. + :type rng: random.Random + :param new_sample_rate: New sample rate in Hz. + :type new_sample_rate: int + """ + + def __init__(self, rng, new_sample_rate): + self._new_sample_rate = new_sample_rate + self._rng = rng + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Resamples the input audio to a target sample rate. + + Note that this is an in-place transformation. + + :param audio: Audio segment to add effects to. + :type audio: AudioSegment|SpeechSegment + """ + audio_segment.resample(self._new_sample_rate) diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py new file mode 100644 index 00000000..8b7439fe --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py @@ -0,0 +1,49 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py
new file mode 100644
index 00000000..8b7439fe
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/shift_perturb.py
@@ -0,0 +1,49 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the shift perturb augmentation model."""
+from deepspeech.frontend.augmentor.base import AugmentorBase
+
+
+class ShiftPerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding random shift perturbation.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_shift_ms: Minimal shift in milliseconds.
+    :type min_shift_ms: float
+    :param max_shift_ms: Maximal shift in milliseconds.
+    :type max_shift_ms: float
+    """
+
+    def __init__(self, rng, min_shift_ms, max_shift_ms):
+        self._min_shift_ms = min_shift_ms
+        self._max_shift_ms = max_shift_ms
+        self._rng = rng
+
+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
+    def transform_audio(self, audio_segment):
+        """Shift audio.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
+        audio_segment.shift(shift_ms)
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/spec_augment.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/spec_augment.py
new file mode 100644
index 00000000..26c94d41
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/spec_augment.py
@@ -0,0 +1,256 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the SpecAugment augmentation model."""
+import random
+
+import numpy as np
+from PIL import Image
+from PIL.Image import BICUBIC
+
+from deepspeech.frontend.augmentor.base import AugmentorBase
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+
+class SpecAugmentor(AugmentorBase):
+    """Augmentation model for time warping, frequency masking and time masking.
+
+    SpecAugment: A Simple Data Augmentation Method for Automatic Speech Recognition
+    https://arxiv.org/abs/1904.08779
+
+    SpecAugment on Large Scale Datasets
+    https://arxiv.org/abs/1912.05533
+    """
+
+    def __init__(self,
+                 rng,
+                 F,
+                 T,
+                 n_freq_masks,
+                 n_time_masks,
+                 p=1.0,
+                 W=40,
+                 adaptive_number_ratio=0,
+                 adaptive_size_ratio=0,
+                 max_n_time_masks=20,
+                 replace_with_zero=True,
+                 warp_mode='PIL'):
+        """SpecAugment class.
+        Args:
+            rng (np.random.RandomState): random generator object.
+            F (int): parameter for frequency masking
+            T (int): parameter for time masking
+            n_freq_masks (int): number of frequency masks
+            n_time_masks (int): number of time masks
+            p (float): parameter for the upper bound of the time mask
+            W (int): parameter for time warping
+            adaptive_number_ratio (float): adaptive multiplicity ratio for time masking
+            adaptive_size_ratio (float): adaptive size ratio for time masking
+            max_n_time_masks (int): maximum number of time masks
+            replace_with_zero (bool): pad zero on mask if true, else use mean
+            warp_mode (str): "PIL" (default, fast, not differentiable)
+                or "sparse_image_warp" (slow, differentiable)
+        """
+        super().__init__()
+        self._rng = rng
+        self.inplace = True
+        self.replace_with_zero = replace_with_zero
+
+        self.mode = warp_mode
+        self.W = W
+        self.F = F
+        self.T = T
+        self.n_freq_masks = n_freq_masks
+        self.n_time_masks = n_time_masks
+        self.p = p
+
+        # adaptive SpecAugment
+        self.adaptive_number_ratio = adaptive_number_ratio
+        self.adaptive_size_ratio = adaptive_size_ratio
+        self.max_n_time_masks = max_n_time_masks
+
+        if adaptive_number_ratio > 0:
+            self.n_time_masks = 0
+            logger.info('n_time_masks is set to zero for adaptive SpecAugment.')
+        if adaptive_size_ratio > 0:
+            self.T = 0
+            logger.info('T is set to zero for adaptive SpecAugment.')
+
+        self._freq_mask = None
+        self._time_mask = None
+
+    def librispeech_basic(self):
+        self.W = 80
+        self.F = 27
+        self.T = 100
+        self.n_freq_masks = 1
+        self.n_time_masks = 1
+        self.p = 1.0
+
+    def librispeech_double(self):
+        self.W = 80
+        self.F = 27
+        self.T = 100
+        self.n_freq_masks = 2
+        self.n_time_masks = 2
+        self.p = 1.0
+
+    def switchboard_mild(self):
+        self.W = 40
+        self.F = 15
+        self.T = 70
+        self.n_freq_masks = 2
+        self.n_time_masks = 2
+        self.p = 0.2
+
+    def switchboard_strong(self):
+        self.W = 40
+        self.F = 27
+        self.T = 70
+        self.n_freq_masks = 2
+        self.n_time_masks = 2
+        self.p = 0.2
+
+    @property
+    def freq_mask(self):
+        return self._freq_mask
+
+    @property
+    def time_mask(self):
+        return self._time_mask
+
+    def __repr__(self):
+        return (f"specaug: F-{self.F}, T-{self.T}, "
+                f"F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}")
+
+    def time_warp(self, x, mode='PIL'):
+        """Time warp for SpecAugment:
+        move a random center frame by a random width ~ uniform(-window, window).
+
+        Args:
+            x (np.ndarray): spectrogram (time, freq)
+            mode (str): PIL or sparse_image_warp
+
+        Raises:
+            NotImplementedError: if mode is "sparse_image_warp"
+            NotImplementedError: if mode is unknown
+
+        Returns:
+            np.ndarray: time-warped spectrogram (time, freq)
+        """
+        window = max_time_warp = self.W
+        if window == 0:
+            return x
+
+        if mode == "PIL":
+            t = x.shape[0]
+            if t - window <= window:
+                return x
+            # NOTE: randrange(a, b) emits a, a + 1, ..., b - 1
+            center = random.randrange(window, t - window)
+            warped = random.randrange(center - window, center +
+                                      window) + 1  # 1 ... t - 1
+
+            left = Image.fromarray(x[:center]).resize((x.shape[1], warped),
+                                                      BICUBIC)
+            right = Image.fromarray(x[center:]).resize((x.shape[1], t - warped),
+                                                       BICUBIC)
+            if self.inplace:
+                x[:warped] = left
+                x[warped:] = right
+                return x
+            return np.concatenate((left, right), 0)
+        elif mode == "sparse_image_warp":
+            raise NotImplementedError('sparse_image_warp')
+        else:
+            raise NotImplementedError(
+                "unknown resize mode: " + mode +
+                ", choose one from (PIL, sparse_image_warp).")
+
+    def mask_freq(self, x, replace_with_zero=False):
+        """Frequency mask.
+
+        Args:
+            x (np.ndarray): spectrogram (time, freq)
+            replace_with_zero (bool, optional): Defaults to False.
+
+        Returns:
+            np.ndarray: frequency-masked spectrogram (time, freq)
+        """
+        n_bins = x.shape[1]
+        for i in range(0, self.n_freq_masks):
+            f = int(self._rng.uniform(low=0, high=self.F))
+            f_0 = int(self._rng.uniform(low=0, high=n_bins - f))
+            assert f_0 <= f_0 + f
+            if replace_with_zero:
+                x[:, f_0:f_0 + f] = 0
+            else:
+                x[:, f_0:f_0 + f] = x.mean()
+            self._freq_mask = (f_0, f_0 + f)
+        return x
+
+    def mask_time(self, x, replace_with_zero=False):
+        """Time mask.
+
+        Args:
+            x (np.ndarray): spectrogram (time, freq)
+            replace_with_zero (bool, optional): Defaults to False.
+
+        Returns:
+            np.ndarray: time-masked spectrogram (time, freq)
+        """
+        n_frames = x.shape[0]
+
+        if self.adaptive_number_ratio > 0:
+            n_masks = int(n_frames * self.adaptive_number_ratio)
+            n_masks = min(n_masks, self.max_n_time_masks)
+        else:
+            n_masks = self.n_time_masks
+
+        if self.adaptive_size_ratio > 0:
+            T = self.adaptive_size_ratio * n_frames
+        else:
+            T = self.T
+
+        for i in range(n_masks):
+            t = int(self._rng.uniform(low=0, high=T))
+            t = min(t, int(n_frames * self.p))
+            t_0 = int(self._rng.uniform(low=0, high=n_frames - t))
+            assert t_0 <= t_0 + t
+            if replace_with_zero:
+                x[t_0:t_0 + t, :] = 0
+            else:
+                x[t_0:t_0 + t, :] = x.mean()
+            self._time_mask = (t_0, t_0 + t)
+        return x
+
+    def __call__(self, x, train=True):
+        if not train:
+            return x
+        return self.transform_feature(x)
+
+    def transform_feature(self, x: np.ndarray):
+        """
+        Args:
+            x (np.ndarray): `[T, F]`
+        Returns:
+            x (np.ndarray): `[T, F]`
+        """
+        assert isinstance(x, np.ndarray)
+        assert x.ndim == 2
+        x = self.time_warp(x, self.mode)
+        x = self.mask_freq(x, self.replace_with_zero)
+        x = self.mask_time(x, self.replace_with_zero)
+        return x
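A quick sketch of the SpecAugmentor above applied to a dummy (T, F) feature matrix; a numpy RandomState is passed as `rng` because `mask_freq`/`mask_time` call `uniform(low=..., high=...)`, which is numpy's signature:

```python
import numpy as np

from deepspeech.frontend.augmentor.spec_augment import SpecAugmentor

aug = SpecAugmentor(
    rng=np.random.RandomState(0),
    F=27, T=100, n_freq_masks=2, n_time_masks=2, p=1.0, W=40)

spec = np.random.randn(200, 80).astype('float32')  # dummy (T, F) log-fbank
out = aug.transform_feature(spec)                  # warp + freq/time masks
print(out.shape)                                   # (200, 80)
```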
+ + "Why use speed option instead of tempo -s in SoX for speed perturbation" + https://groups.google.com/forum/#!topic/kaldi-help/8OOG7eE4sZ8 + + Sox speed: + https://pysox.readthedocs.io/en/latest/api.html#sox.transform.Transformer + + See reference paper here: + http://www.danielpovey.com/files/2015_interspeech_augmentation.pdf + + Espnet: + https://espnet.github.io/espnet/_modules/espnet/transform/perturb.html + + Nemo: + https://github.com/NVIDIA/NeMo/blob/main/nemo/collections/asr/parts/perturb.py#L92 + + Args: + rng (random.Random): Random generator object. + min_speed_rate (float): Lower bound of new speed rate to sample and should + not be smaller than 0.9. + max_speed_rate (float): Upper bound of new speed rate to sample and should + not be larger than 1.1. + num_rates (int, optional): Number of discrete rates to allow. + Can be a positive or negative integer. Defaults to 3. + If a positive integer greater than 0 is provided, the range of + speed rates will be discretized into `num_rates` values. + If a negative integer or 0 is provided, the full range of speed rates + will be sampled uniformly. + Note: If a positive integer is provided and the resultant discretized + range of rates contains the value '1.0', then those samples with rate=1.0, + will not be augmented at all and simply skipped. This is to unnecessary + augmentation and increase computation time. Effective augmentation chance + in such a case is = `prob * (num_rates - 1 / num_rates) * 100`% chance + where `prob` is the global probability of a sample being augmented. + + Raises: + ValueError: when speed_rate error + """ + if min_speed_rate < 0.9: + raise ValueError( + "Sampling speed below 0.9 can cause unnatural effects") + if max_speed_rate > 1.1: + raise ValueError( + "Sampling speed above 1.1 can cause unnatural effects") + self._min_rate = min_speed_rate + self._max_rate = max_speed_rate + self._rng = rng + self._num_rates = num_rates + if num_rates > 0: + self._rates = np.linspace( + self._min_rate, self._max_rate, self._num_rates, endpoint=True) + + def __call__(self, x, uttid=None, train=True): + if not train: + return x + self.transform_audio(x) + return x + + def transform_audio(self, audio_segment): + """Sample a new speed rate from the given range and + changes the speed of the given audio clip. + + Note that this is an in-place transformation. + + :param audio_segment: Audio segment to add effects to. + :type audio_segment: AudioSegment|SpeechSegment + """ + if self._num_rates < 0: + speed_rate = self._rng.uniform(self._min_rate, self._max_rate) + else: + speed_rate = self._rng.choice(self._rates) + + # Skip perturbation in case of identity speed rate + if speed_rate == 1.0: + return + + audio_segment.change_speed(speed_rate) diff --git a/examples/transv1.8to2.x/deepspeech/frontend/augmentor/volume_perturb.py b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/volume_perturb.py new file mode 100644 index 00000000..70cb2889 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/augmentor/volume_perturb.py @@ -0,0 +1,55 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the volume perturb augmentation model."""
+from deepspeech.frontend.augmentor.base import AugmentorBase
+
+
+class VolumePerturbAugmentor(AugmentorBase):
+    """Augmentation model for adding random volume perturbation.
+
+    This is used for multi-loudness training of PCEN. See
+
+    https://arxiv.org/pdf/1607.05666v1.pdf
+
+    for more details.
+
+    :param rng: Random generator object.
+    :type rng: random.Random
+    :param min_gain_dBFS: Minimal gain in dBFS.
+    :type min_gain_dBFS: float
+    :param max_gain_dBFS: Maximal gain in dBFS.
+    :type max_gain_dBFS: float
+    """
+
+    def __init__(self, rng, min_gain_dBFS, max_gain_dBFS):
+        self._min_gain_dBFS = min_gain_dBFS
+        self._max_gain_dBFS = max_gain_dBFS
+        self._rng = rng
+
+    def __call__(self, x, uttid=None, train=True):
+        if not train:
+            return x
+        self.transform_audio(x)
+        return x
+
+    def transform_audio(self, audio_segment):
+        """Change audio loudness.
+
+        Note that this is an in-place transformation.
+
+        :param audio_segment: Audio segment to add effects to.
+        :type audio_segment: AudioSegment|SpeechSegment
+        """
+        gain = self._rng.uniform(self._min_gain_dBFS, self._max_gain_dBFS)
+        audio_segment.gain_db(gain)
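The dBFS gain sampled above maps to a linear amplitude factor of 10**(gain/20), which is what a conventional `gain_db` implementation applies to the samples; a standalone numpy sketch of that relationship (no repo imports, data is synthetic):

```python
import numpy as np

samples = np.random.uniform(-0.1, 0.1, 16000).astype('float32')

gain_db = 6.0                       # a sampled gain in dBFS
factor = 10.0 ** (gain_db / 20.0)   # amplitude ratio, ~2.0 for +6 dB
louder = samples * factor

rms_db = lambda x: 10 * np.log10(np.mean(x ** 2))  # as in audio.py's rms_db
print(round(rms_db(louder) - rms_db(samples), 1))  # ~6.0 dB
```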
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/__init__.py b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/__init__.py
new file mode 100644
index 00000000..6992700d
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .audio_featurizer import AudioFeaturizer  #noqa: F401
+from .speech_featurizer import SpeechFeaturizer  #noqa: F401
+from .text_featurizer import TextFeaturizer  #noqa: F401
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/audio_featurizer.py b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/audio_featurizer.py
new file mode 100644
index 00000000..4c40c847
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -0,0 +1,363 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the audio featurizer class."""
+import numpy as np
+from python_speech_features import delta
+from python_speech_features import logfbank
+from python_speech_features import mfcc
+
+
+class AudioFeaturizer():
+    """Audio featurizer, for extracting features from audio contents of
+    AudioSegment or SpeechSegment.
+
+    Currently, it supports feature types of linear spectrogram, mfcc and
+    fbank.
+
+    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc',
+                          'fbank'.
+    :type specgram_type: str
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for generating frames.
+    :type window_ms: float
+    :param max_freq: When specgram_type is 'linear', only FFT bins
+                     corresponding to frequencies between [0, max_freq] are
+                     returned; when specgram_type is 'mfcc', max_freq is the
+                     highest band edge of mel filters.
+    :type max_freq: None|float
+    :param target_sample_rate: Audio is resampled (if upsampling or
+                               downsampling is allowed) to this before
+                               extracting spectrogram features.
+    :type target_sample_rate: float
+    :param use_dB_normalization: Whether to normalize the audio to a certain
+                                 decibels before extracting the features.
+    :type use_dB_normalization: bool
+    :param target_dB: Target audio decibels for normalization.
+    :type target_dB: float
+    """
+
+    def __init__(self,
+                 specgram_type: str='linear',
+                 feat_dim: int=None,
+                 delta_delta: bool=False,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 n_fft=None,
+                 max_freq=None,
+                 target_sample_rate=16000,
+                 use_dB_normalization=True,
+                 target_dB=-20,
+                 dither=1.0):
+        self._specgram_type = specgram_type
+        # mfcc and fbank use `feat_dim`
+        self._feat_dim = feat_dim
+        # mfcc and fbank use `delta_delta`
+        self._delta_delta = delta_delta
+        self._stride_ms = stride_ms
+        self._window_ms = window_ms
+        self._max_freq = max_freq
+        self._target_sample_rate = target_sample_rate
+        self._use_dB_normalization = use_dB_normalization
+        self._target_dB = target_dB
+        self._fft_point = n_fft
+        self._dither = dither
+
+    def featurize(self,
+                  audio_segment,
+                  allow_downsampling=True,
+                  allow_upsampling=True):
+        """Extract audio features from AudioSegment or SpeechSegment.
+
+        :param audio_segment: Audio/speech segment to extract features from.
+        :type audio_segment: AudioSegment|SpeechSegment
+        :param allow_downsampling: Whether to allow audio downsampling before
+                                   featurizing.
+        :type allow_downsampling: bool
+        :param allow_upsampling: Whether to allow audio upsampling before
+                                 featurizing.
+        :type allow_upsampling: bool
+        :return: Spectrogram audio feature in 2darray.
+        :rtype: ndarray
+        :raises ValueError: If the audio sample rate is not supported.
+        """
+        # upsampling or downsampling
+        if ((audio_segment.sample_rate > self._target_sample_rate and
+             allow_downsampling) or
+            (audio_segment.sample_rate < self._target_sample_rate and
+             allow_upsampling)):
+            audio_segment.resample(self._target_sample_rate)
+        if audio_segment.sample_rate != self._target_sample_rate:
+            raise ValueError("Audio sample rate is not supported. "
+                             "Turn allow_downsampling or allow_upsampling on.")
" + "Turn allow_downsampling or allow up_sampling on.") + # decibel normalization + if self._use_dB_normalization: + audio_segment.normalize(target_db=self._target_dB) + # extract spectrogram + return self._compute_specgram(audio_segment) + + @property + def stride_ms(self): + return self._stride_ms + + @property + def feature_size(self): + """audio feature size""" + feat_dim = 0 + if self._specgram_type == 'linear': + fft_point = self._window_ms if self._fft_point is None else self._fft_point + feat_dim = int(fft_point * (self._target_sample_rate / 1000) / 2 + + 1) + elif self._specgram_type == 'mfcc': + # mfcc, delta, delta-delta + feat_dim = int(self._feat_dim * + 3) if self._delta_delta else int(self._feat_dim) + elif self._specgram_type == 'fbank': + # fbank, delta, delta-delta + feat_dim = int(self._feat_dim * + 3) if self._delta_delta else int(self._feat_dim) + else: + raise ValueError("Unknown specgram_type %s. " + "Supported values: linear." % self._specgram_type) + return feat_dim + + def _compute_specgram(self, audio_segment): + """Extract various audio features.""" + sample_rate = audio_segment.sample_rate + if self._specgram_type == 'linear': + samples = audio_segment.samples + return self._compute_linear_specgram( + samples, + sample_rate, + stride_ms=self._stride_ms, + window_ms=self._window_ms, + max_freq=self._max_freq) + elif self._specgram_type == 'mfcc': + samples = audio_segment.to('int16') + return self._compute_mfcc( + samples, + sample_rate, + feat_dim=self._feat_dim, + stride_ms=self._stride_ms, + window_ms=self._window_ms, + max_freq=self._max_freq, + dither=self._dither, + delta_delta=self._delta_delta) + elif self._specgram_type == 'fbank': + samples = audio_segment.to('int16') + return self._compute_fbank( + samples, + sample_rate, + feat_dim=self._feat_dim, + stride_ms=self._stride_ms, + window_ms=self._window_ms, + max_freq=self._max_freq, + dither=self._dither, + delta_delta=self._delta_delta) + else: + raise ValueError("Unknown specgram_type %s. " + "Supported values: linear." % self._specgram_type) + + def _specgram_real(self, samples, window_size, stride_size, sample_rate): + """Compute the spectrogram for samples from a real signal.""" + # extract strided windows + truncate_size = (len(samples) - window_size) % stride_size + samples = samples[:len(samples) - truncate_size] + nshape = (window_size, (len(samples) - window_size) // stride_size + 1) + nstrides = (samples.strides[0], samples.strides[0] * stride_size) + windows = np.lib.stride_tricks.as_strided( + samples, shape=nshape, strides=nstrides) + assert np.all( + windows[:, 1] == samples[stride_size:(stride_size + window_size)]) + # window weighting, squared Fast Fourier Transform (fft), scaling + weighting = np.hanning(window_size)[:, None] + # https://numpy.org/doc/stable/reference/generated/numpy.fft.rfft.html + fft = np.fft.rfft(windows * weighting, n=None, axis=0) + fft = np.absolute(fft) + fft = fft**2 + scale = np.sum(weighting**2) * sample_rate + fft[1:-1, :] *= (2.0 / scale) + fft[(0, -1), :] /= scale + # prepare fft frequency list + freqs = float(sample_rate) / window_size * np.arange(fft.shape[0]) + return fft, freqs + + def _compute_linear_specgram(self, + samples, + sample_rate, + stride_ms=10.0, + window_ms=20.0, + max_freq=None, + eps=1e-14): + """Compute the linear spectrogram from FFT energy. + + Args: + samples ([type]): [description] + sample_rate ([type]): [description] + stride_ms (float, optional): [description]. Defaults to 10.0. + window_ms (float, optional): [description]. 
+    def _compute_linear_specgram(self,
+                                 samples,
+                                 sample_rate,
+                                 stride_ms=10.0,
+                                 window_ms=20.0,
+                                 max_freq=None,
+                                 eps=1e-14):
+        """Compute the linear spectrogram from FFT energy.
+
+        Args:
+            samples (np.ndarray): audio samples to featurize.
+            sample_rate (float): sample rate of the signal, in Hz.
+            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): window length in ms. Defaults to 20.0.
+            max_freq (float, optional): highest returned frequency. Defaults to None.
+            eps (float, optional): floor added before taking the log. Defaults to 1e-14.
+
+        Raises:
+            ValueError: max_freq > sample_rate / 2
+            ValueError: stride_ms > window_ms
+
+        Returns:
+            np.ndarray: log spectrogram, (time, freq)
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        stride_size = int(0.001 * sample_rate * stride_ms)
+        window_size = int(0.001 * sample_rate * window_ms)
+        specgram, freqs = self._specgram_real(
+            samples,
+            window_size=window_size,
+            stride_size=stride_size,
+            sample_rate=sample_rate)
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+        # (freq, time)
+        spec = np.log(specgram[:ind, :] + eps)
+        return np.transpose(spec)
+
+    def _concat_delta_delta(self, feat):
+        """Append delta and delta-delta features.
+
+        Args:
+            feat (np.ndarray): (T, D)
+
+        Returns:
+            np.ndarray: feat with delta and delta-delta, (T, 3*D)
+        """
+        # Deltas
+        d_feat = delta(feat, 2)
+        # Deltas-Deltas (computed from the deltas, not from the raw features)
+        dd_feat = delta(d_feat, 2)
+        # concat above three features
+        concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1)
+        return concat_feat
+
+    def _compute_mfcc(self,
+                      samples,
+                      sample_rate,
+                      feat_dim=13,
+                      stride_ms=10.0,
+                      window_ms=25.0,
+                      max_freq=None,
+                      dither=1.0,
+                      delta_delta=True):
+        """Compute mfcc from samples.
+
+        Args:
+            samples (np.ndarray, np.int16): the audio signal from which to compute features.
+            sample_rate (float): the sample rate of the signal we are working with, in Hz.
+            feat_dim (int): the number of cepstra to return, default 13.
+            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): window length in ms. Defaults to 25.0.
+            max_freq (float, optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
+            dither (float, optional): dithering constant. Defaults to 1.0.
+            delta_delta (bool, optional): whether to append delta and delta-delta. Defaults to True.
+
+        Raises:
+            ValueError: max_freq > samplerate/2
+            ValueError: stride_ms > window_ms
+
+        Returns:
+            np.ndarray: mfcc feature, (T, D).
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        # compute the 13 cepstral coefficients, and the first one is replaced
+        # by log(frame energy), (T, D)
+        mfcc_feat = mfcc(
+            signal=samples,
+            samplerate=sample_rate,
+            winlen=0.001 * window_ms,
+            winstep=0.001 * stride_ms,
+            numcep=feat_dim,
+            nfilt=23,
+            nfft=512,
+            lowfreq=20,
+            highfreq=max_freq,
+            dither=dither,
+            remove_dc_offset=True,
+            preemph=0.97,
+            ceplifter=22,
+            useEnergy=True,
+            winfunc='povey')
+        if delta_delta:
+            mfcc_feat = self._concat_delta_delta(mfcc_feat)
+        return mfcc_feat
+
+    def _compute_fbank(self,
+                       samples,
+                       sample_rate,
+                       feat_dim=40,
+                       stride_ms=10.0,
+                       window_ms=25.0,
+                       max_freq=None,
+                       dither=1.0,
+                       delta_delta=False):
+        """Compute logfbank from samples.
+
+        Args:
+            samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array.
+            sample_rate (float): the sample rate of the signal we are working with, in Hz.
+            feat_dim (int): the number of mel filter banks to return, default 40.
+            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
+            window_ms (float, optional): window length in ms.
+                Defaults to 25.0.
+            max_freq (float, optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
+            dither (float, optional): dithering constant. Defaults to 1.0.
+            delta_delta (bool, optional): whether to append delta and delta-delta. Defaults to False.
+
+        Raises:
+            ValueError: max_freq > samplerate/2
+            ValueError: stride_ms > window_ms
+
+        Returns:
+            np.ndarray: fbank feature, (T, D).
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        # (T, D)
+        fbank_feat = logfbank(
+            signal=samples,
+            samplerate=sample_rate,
+            winlen=0.001 * window_ms,
+            winstep=0.001 * stride_ms,
+            nfilt=feat_dim,
+            nfft=512,
+            lowfreq=20,
+            highfreq=max_freq,
+            dither=dither,
+            remove_dc_offset=True,
+            preemph=0.97,
+            wintype='povey')
+        if delta_delta:
+            fbank_feat = self._concat_delta_delta(fbank_feat)
+        return fbank_feat
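A short sketch of the featurizer above producing fbank features; the wav path is hypothetical, dither is disabled for reproducibility, and the `logfbank`/`mfcc` keyword set assumes the patched python_speech_features this repo depends on:

```python
from deepspeech.frontend.audio import AudioSegment
from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer

featurizer = AudioFeaturizer(
    specgram_type='fbank', feat_dim=80, delta_delta=False,
    stride_ms=10.0, window_ms=25.0, target_sample_rate=16000, dither=0.0)

audio = AudioSegment.from_file("utt.wav")  # hypothetical path
feat = featurizer.featurize(audio)         # resample + dB-normalize, then logfbank
print(feat.shape, featurizer.feature_size) # (T, 80) and 80
```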
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/speech_featurizer.py b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/speech_featurizer.py
new file mode 100644
index 00000000..5082850d
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/speech_featurizer.py
@@ -0,0 +1,153 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the speech featurizer class."""
+from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
+
+
+class SpeechFeaturizer():
+    """Speech featurizer, for extracting features from both audio and
+    transcript contents of SpeechSegment.
+
+    Currently, for audio parts, it supports feature types of linear
+    spectrogram and mfcc; for transcript parts, it only supports char-level
+    tokenizing and conversion into a list of token indices. Note that the
+    token indexing order follows the given vocabulary file.
+
+    :param vocab_filepath: Filepath to load vocabulary for token indices
+                           conversion.
+    :type vocab_filepath: str
+    :param specgram_type: Specgram feature type. Options: 'linear', 'mfcc'.
+    :type specgram_type: str
+    :param stride_ms: Striding size (in milliseconds) for generating frames.
+    :type stride_ms: float
+    :param window_ms: Window size (in milliseconds) for generating frames.
+    :type window_ms: float
+    :param max_freq: When specgram_type is 'linear', only FFT bins
+                     corresponding to frequencies between [0, max_freq] are
+                     returned; when specgram_type is 'mfcc', max_freq is the
+                     highest band edge of mel filters.
+    :type max_freq: None|float
+    :param target_sample_rate: Speech is resampled (if upsampling or
+                               downsampling is allowed) to this before
+                               extracting spectrogram features.
+    :type target_sample_rate: float
+    :param use_dB_normalization: Whether to normalize the audio to a certain
+                                 decibels before extracting the features.
+    :type use_dB_normalization: bool
+    :param target_dB: Target audio decibels for normalization.
+    :type target_dB: float
+    """
+
+    def __init__(self,
+                 unit_type,
+                 vocab_filepath,
+                 spm_model_prefix=None,
+                 specgram_type='linear',
+                 feat_dim=None,
+                 delta_delta=False,
+                 stride_ms=10.0,
+                 window_ms=20.0,
+                 n_fft=None,
+                 max_freq=None,
+                 target_sample_rate=16000,
+                 use_dB_normalization=True,
+                 target_dB=-20,
+                 dither=1.0):
+        self._audio_featurizer = AudioFeaturizer(
+            specgram_type=specgram_type,
+            feat_dim=feat_dim,
+            delta_delta=delta_delta,
+            stride_ms=stride_ms,
+            window_ms=window_ms,
+            n_fft=n_fft,
+            max_freq=max_freq,
+            target_sample_rate=target_sample_rate,
+            use_dB_normalization=use_dB_normalization,
+            target_dB=target_dB,
+            dither=dither)
+        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
+                                               spm_model_prefix)
+
+    def featurize(self, speech_segment, keep_transcription_text):
+        """Extract features for a speech segment.
+
+        1. For audio parts, extract the audio features.
+        2. For transcript parts, keep the original text or convert the text
+           string to a list of token indices in char-level.
+
+        Args:
+            speech_segment (SpeechSegment): Speech segment to extract features from.
+            keep_transcription_text (bool): True, keep transcript text; False, token ids.
+
+        Returns:
+            tuple: 1) spectrogram audio feature in 2darray, 2) list of token indices.
+        """
+        spec_feature = self._audio_featurizer.featurize(speech_segment)
+        if keep_transcription_text:
+            return spec_feature, speech_segment.transcript
+        if speech_segment.has_token:
+            text_ids = speech_segment.token_ids
+        else:
+            text_ids = self._text_featurizer.featurize(
+                speech_segment.transcript)
+        return spec_feature, text_ids
+
+    @property
+    def vocab_size(self):
+        """Return the vocabulary size.
+        Returns:
+            int: Vocabulary size.
+        """
+        return self._text_featurizer.vocab_size
+
+    @property
+    def vocab_list(self):
+        """Return the vocabulary in list.
+        Returns:
+            List[str]:
+        """
+        return self._text_featurizer.vocab_list
+
+    @property
+    def vocab_dict(self):
+        """Return the vocabulary in dict.
+        Returns:
+            Dict[str, int]:
+        """
+        return self._text_featurizer.vocab_dict
+
+    @property
+    def feature_size(self):
+        """Return the audio feature size.
+        Returns:
+            int: audio feature size.
+        """
+        return self._audio_featurizer.feature_size
+
+    @property
+    def stride_ms(self):
+        """Time length in `ms` unit per frame.
+        Returns:
+            float: time(ms)/frame
+        """
+        return self._audio_featurizer.stride_ms
+
+    @property
+    def text_feature(self):
+        """Return the text featurizer object.
+        Returns:
+            TextFeaturizer: object.
+        """
+        return self._text_featurizer
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/featurizer/text_featurizer.py b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/text_featurizer.py
new file mode 100644
index 00000000..e4364f70
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/featurizer/text_featurizer.py
@@ -0,0 +1,202 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. +"""Contains the text featurizer class.""" +import sentencepiece as spm + +from ..utility import EOS +from ..utility import load_dict +from ..utility import UNK + +__all__ = ["TextFeaturizer"] + + +class TextFeaturizer(): + def __init__(self, + unit_type, + vocab_filepath, + spm_model_prefix=None, + maskctc=False): + """Text featurizer, for processing or extracting features from text. + + Currently, it supports char/word/sentence-piece level tokenizing and conversion into + a list of token indices. Note that the token indexing order follows the + given vocabulary file. + + Args: + unit_type (str): unit type, e.g. char, word, spm + vocab_filepath (str): Filepath to load vocabulary for token indices conversion. + spm_model_prefix (str, optional): spm model prefix. Defaults to None. + """ + assert unit_type in ('char', 'spm', 'word') + self.unit_type = unit_type + self.unk = UNK + self.maskctc = maskctc + + if vocab_filepath: + self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id = self._load_vocabulary_from_file( + vocab_filepath, maskctc) + self.vocab_size = len(self.vocab_list) + + if unit_type == 'spm': + spm_model = spm_model_prefix + '.model' + self.sp = spm.SentencePieceProcessor() + self.sp.Load(spm_model) + + def tokenize(self, text): + if self.unit_type == 'char': + tokens = self.char_tokenize(text) + elif self.unit_type == 'word': + tokens = self.word_tokenize(text) + else: # spm + tokens = self.spm_tokenize(text) + return tokens + + def detokenize(self, tokens): + if self.unit_type == 'char': + text = self.char_detokenize(tokens) + elif self.unit_type == 'word': + text = self.word_detokenize(tokens) + else: # spm + text = self.spm_detokenize(tokens) + return text + + def featurize(self, text): + """Convert text string to a list of token indices. + + Args: + text (str): Text. + + Returns: + List[int]: List of token indices. + """ + tokens = self.tokenize(text) + ids = [] + for token in tokens: + token = token if token in self.vocab_dict else self.unk + ids.append(self.vocab_dict[token]) + return ids + + def defeaturize(self, idxs): + """Convert a list of token indices to text string, + ignore index after eos_id. + + Args: + idxs (List[int]): List of token indices. + + Returns: + str: Text. + """ + tokens = [] + for idx in idxs: + if idx == self.eos_id: + break + tokens.append(self._id2token[idx]) + text = self.detokenize(tokens) + return text + + def char_tokenize(self, text): + """Character tokenizer. + + Args: + text (str): text string. + + Returns: + List[str]: tokens. + """ + return list(text.strip()) + + def char_detokenize(self, tokens): + """Character detokenizer. + + Args: + tokens (List[str]): tokens. + + Returns: + str: text string. + """ + return "".join(tokens) + + def word_tokenize(self, text): + """Word tokenizer, separate by .""" + return text.strip().split() + + def word_detokenize(self, tokens): + """Word detokenizer, separate by .""" + return " ".join(tokens) + + def spm_tokenize(self, text): + """spm tokenize. + + Args: + text (str): text string. 
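Editor's note: to make the tokenize/featurize contract above concrete, here is a hypothetical char-level round trip through TextFeaturizer. The vocab path and its contents are illustrative, not part of the patch; featurize maps out-of-vocabulary tokens to UNK, and defeaturize stops at eos_id.

    # Hypothetical usage; 'vocab.txt' is one token per line, as load_dict expects.
    from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

    featurizer = TextFeaturizer(unit_type='char', vocab_filepath='vocab.txt')
    ids = featurizer.featurize('hello')   # e.g. [23, 20, 27, 27, 30]
    text = featurizer.defeaturize(ids)    # 'hello'
    assert text == 'hello'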
+ + Returns: + List[str]: sentence pieces str code + """ + stats = {"num_empty": 0, "num_filtered": 0} + + def valid(line): + return True + + def encode(l): + return self.sp.EncodeAsPieces(l) + + def encode_line(line): + line = line.strip() + if len(line) > 0: + line = encode(line) + if valid(line): + return line + else: + stats["num_filtered"] += 1 + else: + stats["num_empty"] += 1 + return None + + enc_line = encode_line(text) + return enc_line + + def spm_detokenize(self, tokens, input_format='piece'): + """spm detokenize. + + Args: + ids (List[str]): tokens. + + Returns: + str: text + """ + if input_format == "piece": + + def decode(l): + return "".join(self.sp.DecodePieces(l)) + elif input_format == "id": + + def decode(l): + return "".join(self.sp.DecodeIds(l)) + + return decode(tokens) + + def _load_vocabulary_from_file(self, vocab_filepath: str, maskctc: bool): + """Load vocabulary from file.""" + vocab_list = load_dict(vocab_filepath, maskctc) + assert vocab_list is not None + + id2token = dict( + [(idx, token) for (idx, token) in enumerate(vocab_list)]) + token2id = dict( + [(token, idx) for (idx, token) in enumerate(vocab_list)]) + + unk_id = vocab_list.index(UNK) + eos_id = vocab_list.index(EOS) + return token2id, id2token, vocab_list, unk_id, eos_id diff --git a/examples/transv1.8to2.x/deepspeech/frontend/normalizer.py b/examples/transv1.8to2.x/deepspeech/frontend/normalizer.py new file mode 100644 index 00000000..ffef8ba4 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/normalizer.py @@ -0,0 +1,199 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains feature normalizers.""" +import json + +import numpy as np +import paddle +from paddle.io import DataLoader +from paddle.io import Dataset + +from deepspeech.frontend.audio import AudioSegment +from deepspeech.frontend.utility import load_cmvn +from deepspeech.frontend.utility import read_manifest +from deepspeech.utils.log import Log + +__all__ = ["FeatureNormalizer"] + +logger = Log(__name__).getlog() + + +# https://github.com/PaddlePaddle/Paddle/pull/31481 +class CollateFunc(object): + def __init__(self, feature_func): + self.feature_func = feature_func + + def __call__(self, batch): + mean_stat = None + var_stat = None + number = 0 + for item in batch: + audioseg = AudioSegment.from_file(item['feat']) + feat = self.feature_func(audioseg) #(T, D) + + sums = np.sum(feat, axis=0) + if mean_stat is None: + mean_stat = sums + else: + mean_stat += sums + + square_sums = np.sum(np.square(feat), axis=0) + if var_stat is None: + var_stat = square_sums + else: + var_stat += square_sums + + number += feat.shape[0] + return number, mean_stat, var_stat + + +class AudioDataset(Dataset): + def __init__(self, manifest_path, num_samples=-1, rng=None, random_seed=0): + self._rng = rng if rng else np.random.RandomState(random_seed) + manifest = read_manifest(manifest_path) + if num_samples == -1: + sampled_manifest = manifest + else: + sampled_manifest = self._rng.choice( + manifest, num_samples, replace=False) + self.items = sampled_manifest + + def __len__(self): + return len(self.items) + + def __getitem__(self, idx): + return self.items[idx] + + +class FeatureNormalizer(object): + """Feature normalizer. Normalize features to be of zero mean and unit + stddev. + + if mean_std_filepath is provided (not None), the normalizer will directly + initilize from the file. Otherwise, both manifest_path and featurize_func + should be given for on-the-fly mean and stddev computing. + + :param mean_std_filepath: File containing the pre-computed mean and stddev. + :type mean_std_filepath: None|str + :param manifest_path: Manifest of instances for computing mean and stddev. + :type meanifest_path: None|str + :param featurize_func: Function to extract features. It should be callable + with ``featurize_func(audio_segment)``. + :type featurize_func: None|callable + :param num_samples: Number of random samples for computing mean and stddev. + :type num_samples: int + :param random_seed: Random seed for sampling instances. + :type random_seed: int + :raises ValueError: If both mean_std_filepath and manifest_path + (or both mean_std_filepath and featurize_func) are None. + """ + + def __init__(self, + mean_std_filepath, + manifest_path=None, + featurize_func=None, + num_samples=500, + num_workers=0, + random_seed=0): + if not mean_std_filepath: + if not (manifest_path and featurize_func): + raise ValueError("If mean_std_filepath is None, meanifest_path " + "and featurize_func should not be None.") + self._rng = np.random.RandomState(random_seed) + self._compute_mean_std(manifest_path, featurize_func, num_samples, + num_workers) + else: + self._read_mean_std_from_file(mean_std_filepath) + + def apply(self, features): + """Normalize features to be of zero mean and unit stddev. + + :param features: Input features to be normalized. + :type features: ndarray, shape (T, D) + :param eps: added to stddev to provide numerical stablibity. + :type eps: float + :return: Normalized features. 
+        :rtype: ndarray
+        """
+        return (features - self._mean) * self._istd
+
+    def _read_mean_std_from_file(self, filepath):
+        """Load mean and std from file."""
+        mean, istd = load_cmvn(filepath, filetype='json')
+        self._mean = np.expand_dims(mean, axis=0)
+        self._istd = np.expand_dims(istd, axis=0)
+
+    def write_to_file(self, filepath):
+        """Write the mean and stddev to the file.
+
+        :param filepath: File to write mean and stddev.
+        :type filepath: str
+        """
+        with open(filepath, 'w') as fout:
+            fout.write(json.dumps(self.cmvn_info))
+
+    def _compute_mean_std(self,
+                          manifest_path,
+                          featurize_func,
+                          num_samples,
+                          num_workers,
+                          batch_size=64):
+        """Compute mean and std from randomly sampled instances."""
+        paddle.set_device('cpu')
+
+        collate_func = CollateFunc(featurize_func)
+        dataset = AudioDataset(manifest_path, num_samples, self._rng)
+        data_loader = DataLoader(
+            dataset,
+            batch_size=batch_size,
+            shuffle=False,
+            num_workers=num_workers,
+            collate_fn=collate_func)
+
+        with paddle.no_grad():
+            all_mean_stat = None
+            all_var_stat = None
+            all_number = 0
+            wav_number = 0
+            for i, batch in enumerate(data_loader):
+                number, mean_stat, var_stat = batch
+                if i == 0:
+                    all_mean_stat = mean_stat
+                    all_var_stat = var_stat
+                else:
+                    all_mean_stat += mean_stat
+                    all_var_stat += var_stat
+                all_number += number
+                wav_number += batch_size
+
+                if wav_number % 1000 == 0:
+                    logger.info(
+                        f'processed {wav_number} wavs, {all_number} frames.')
+
+        self.cmvn_info = {
+            'mean_stat': all_mean_stat.tolist(),
+            'var_stat': all_var_stat.tolist(),
+            'frame_num': all_number,
+        }
+
+        return self.cmvn_info
diff --git a/examples/transv1.8to2.x/deepspeech/frontend/speech.py b/examples/transv1.8to2.x/deepspeech/frontend/speech.py
new file mode 100644
index 00000000..e58795c0
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/frontend/speech.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains the speech segment class."""
+import numpy as np
+
+from deepspeech.frontend.audio import AudioSegment
+
+
+class SpeechSegment(AudioSegment):
+    """Speech segment with transcript text, a subclass of AudioSegment."""
+
+    def __init__(self,
+                 samples,
+                 sample_rate,
+                 transcript,
+                 tokens=None,
+                 token_ids=None):
+        """Speech segment abstraction, a subclass of AudioSegment,
+        with an additional transcript.
+
+        Args:
+            samples (ndarray.float32): Audio samples [num_samples x num_channels].
+            sample_rate (int): Audio sample rate.
+            transcript (str): Transcript text for the speech.
+            tokens (List[str], optional): Transcript tokens for the speech.
+            token_ids (List[int], optional): Transcript token ids for the speech.
+ """ + AudioSegment.__init__(self, samples, sample_rate) + self._transcript = transcript + # must init `tokens` with `token_ids` at the same time + self._tokens = tokens + self._token_ids = token_ids + + def __eq__(self, other): + """Return whether two objects are equal. + + Returns: + bool: True, when equal to other + """ + if not AudioSegment.__eq__(self, other): + return False + if self._transcript != other._transcript: + return False + if self.has_token and other.has_token: + if self._tokens != other._tokens: + return False + if self._token_ids != other._token_ids: + return False + return True + + def __ne__(self, other): + """Return whether two objects are unequal.""" + return not self.__eq__(other) + + @classmethod + def from_file(cls, filepath, transcript, tokens=None, token_ids=None): + """Create speech segment from audio file and corresponding transcript. + + Args: + filepath (str|file): Filepath or file object to audio file. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + + Returns: + SpeechSegment: Speech segment instance. + """ + + audio = AudioSegment.from_file(filepath) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + + @classmethod + def from_bytes(cls, bytes, transcript, tokens=None, token_ids=None): + """Create speech segment from a byte string and corresponding + + Args: + filepath (str|file): Filepath or file object to audio file. + transcript (str): Transcript text for the speech. + tokens (List[str], optional): text tokens. Defaults to None. + token_ids (List[int], optional): text token ids. Defaults to None. + + Returns: + SpeechSegment: Speech segment instance. + """ + audio = AudioSegment.from_bytes(bytes) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + + @classmethod + def concatenate(cls, *segments): + """Concatenate an arbitrary number of speech segments together, both + audio and transcript will be concatenated. + + :param *segments: Input speech segments to be concatenated. + :type *segments: tuple of SpeechSegment + :return: Speech segment instance. + :rtype: SpeechSegment + :raises ValueError: If the number of segments is zero, or if the + sample_rate of any two segments does not match. + :raises TypeError: If any segment is not SpeechSegment instance. + """ + if len(segments) == 0: + raise ValueError("No speech segments are given to concatenate.") + sample_rate = segments[0]._sample_rate + transcripts = "" + tokens = [] + token_ids = [] + for seg in segments: + if sample_rate != seg._sample_rate: + raise ValueError("Can't concatenate segments with " + "different sample rates") + if type(seg) is not cls: + raise TypeError("Only speech segments of the same type " + "instance can be concatenated.") + transcripts += seg._transcript + if self.has_token: + tokens += seg._tokens + token_ids += seg._token_ids + samples = np.concatenate([seg.samples for seg in segments]) + return cls(samples, sample_rate, transcripts, tokens, token_ids) + + @classmethod + def slice_from_file(cls, + filepath, + transcript, + tokens=None, + token_ids=None, + start=None, + end=None): + """Loads a small section of an speech without having to load + the entire file into the memory which can be incredibly wasteful. + + :param filepath: Filepath or file object to audio file. + :type filepath: str|file + :param start: Start time in seconds. 
If start is negative, it wraps + around from the end. If not provided, this function + reads from the very beginning. + :type start: float + :param end: End time in seconds. If end is negative, it wraps around + from the end. If not provided, the default behvaior is + to read to the end of the file. + :type end: float + :param transcript: Transcript text for the speech. if not provided, + the defaults is an empty string. + :type transript: str + :return: SpeechSegment instance of the specified slice of the input + speech file. + :rtype: SpeechSegment + """ + audio = AudioSegment.slice_from_file(filepath, start, end) + return cls(audio.samples, audio.sample_rate, transcript, tokens, + token_ids) + + @classmethod + def make_silence(cls, duration, sample_rate): + """Creates a silent speech segment of the given duration and + sample rate, transcript will be an empty string. + + Args: + duration (float): Length of silence in seconds. + sample_rate (float): Sample rate. + + Returns: + SpeechSegment: Silence of the given duration. + """ + audio = AudioSegment.make_silence(duration, sample_rate) + return cls(audio.samples, audio.sample_rate, "") + + @property + def has_token(self): + if self._tokens and self._token_ids: + return True + return False + + @property + def transcript(self): + """Return the transcript text. + + Returns: + str: Transcript text for the speech. + """ + + return self._transcript + + @property + def tokens(self): + """Return the transcript text tokens. + + Returns: + List[str]: text tokens. + """ + return self._tokens + + @property + def token_ids(self): + """Return the transcript text token ids. + + Returns: + List[int]: text token ids. + """ + return self._token_ids diff --git a/examples/transv1.8to2.x/deepspeech/frontend/utility.py b/examples/transv1.8to2.x/deepspeech/frontend/utility.py new file mode 100644 index 00000000..72dfc98d --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/frontend/utility.py @@ -0,0 +1,289 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Contains data helper functions.""" +import codecs +import json +import math +from typing import List +from typing import Optional +from typing import Text + +import numpy as np + +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = [ + "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs", + "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS", + "EOS", "UNK", "BLANK", "MASKCTC" +] + +IGNORE_ID = -1 +# `sos` and `eos` using same token +SOS = "" +EOS = SOS +UNK = "" +BLANK = "" +MASKCTC = "" + + +def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]: + if dict_path is None: + return None + + with open(dict_path, "r") as f: + dictionary = f.readlines() + char_list = [entry.strip().split(" ")[0] for entry in dictionary] + if BLANK not in char_list: + char_list.insert(0, BLANK) + if EOS not in char_list: + char_list.append(EOS) + # for non-autoregressive maskctc model + if maskctc and MASKCTC not in char_list: + char_list.append(MASKCTC) + return char_list + + +def read_manifest( + manifest_path, + max_input_len=float('inf'), + min_input_len=0.0, + max_output_len=float('inf'), + min_output_len=0.0, + max_output_input_ratio=float('inf'), + min_output_input_ratio=0.0, ): + """Load and parse manifest file. + + Args: + manifest_path ([type]): Manifest file to load and parse. + max_input_len ([type], optional): maximum output seq length, + in seconds for raw wav, in frame numbers for feature data. + Defaults to float('inf'). + min_input_len (float, optional): minimum input seq length, + in seconds for raw wav, in frame numbers for feature data. + Defaults to 0.0. + max_output_len (float, optional): maximum input seq length, + in modeling units. Defaults to 500.0. + min_output_len (float, optional): minimum input seq length, + in modeling units. Defaults to 0.0. + max_output_input_ratio (float, optional): + maximum output seq length/output seq length ratio. Defaults to 10.0. + min_output_input_ratio (float, optional): + minimum output seq length/output seq length ratio. Defaults to 0.05. + + Raises: + IOError: If failed to parse the manifest. + + Returns: + List[dict]: Manifest parsing results. + """ + + manifest = [] + for json_line in codecs.open(manifest_path, 'r', 'utf-8'): + try: + json_data = json.loads(json_line) + except Exception as e: + raise IOError("Error reading manifest: %s" % str(e)) + + feat_len = json_data["feat_shape"][ + 0] if 'feat_shape' in json_data else 1.0 + token_len = json_data["token_shape"][ + 0] if 'token_shape' in json_data else 1.0 + conditions = [ + feat_len >= min_input_len, + feat_len <= max_input_len, + token_len >= min_output_len, + token_len <= max_output_len, + token_len / feat_len >= min_output_input_ratio, + token_len / feat_len <= max_output_input_ratio, + ] + if all(conditions): + manifest.append(json_data) + return manifest + + +def rms_to_db(rms: float): + """Root Mean Square to dB. + + Args: + rms ([float]): root mean square + + Returns: + float: dB + """ + return 20.0 * math.log10(max(1e-16, rms)) + + +def rms_to_dbfs(rms: float): + """Root Mean Square to dBFS. + https://fireattack.wordpress.com/2017/02/06/replaygain-loudness-normalization-and-applications/ + Audio is mix of sine wave, so 1 amp sine wave's Full scale is 0.7071, equal to -3.0103dB. + + dB = dBFS + 3.0103 + dBFS = db - 3.0103 + e.g. 
0 dB = -3.0103 dBFS
+
+    Args:
+        rms ([float]): root mean square
+
+    Returns:
+        float: dBFS
+    """
+    return rms_to_db(rms) - 3.0103
+
+
+def max_dbfs(sample_data: np.ndarray):
+    """Peak dBFS based on the maximum energy sample.
+
+    Args:
+        sample_data ([np.ndarray]): float array, [-1, 1].
+
+    Returns:
+        float: dBFS
+    """
+    # Peak dBFS based on the maximum energy sample. Will prevent overdrive
+    # if used for normalization.
+    return rms_to_dbfs(max(abs(np.min(sample_data)), abs(np.max(sample_data))))
+
+
+def mean_dbfs(sample_data):
+    """dBFS based on the RMS energy.
+
+    Args:
+        sample_data ([np.ndarray]): float array, [-1, 1].
+
+    Returns:
+        float: dBFS
+    """
+    return rms_to_dbfs(
+        math.sqrt(np.mean(np.square(sample_data, dtype=np.float64))))
+
+
+def gain_db_to_ratio(gain_db: float):
+    """dB to amplitude ratio.
+
+    Args:
+        gain_db (float): gain in dB
+
+    Returns:
+        float: scale in amp
+    """
+    return math.pow(10.0, gain_db / 20.0)
+
+
+def normalize_audio(sample_data: np.ndarray, dbfs: float=-3.0103):
+    """Normalize audio to target dBFS.
+
+    Args:
+        sample_data (np.ndarray): input wave samples, [-1, 1].
+        dbfs (float, optional): target dBFS. Defaults to -3.0103.
+
+    Returns:
+        np.ndarray: normalized wave
+    """
+    return np.maximum(
+        np.minimum(sample_data * gain_db_to_ratio(dbfs - max_dbfs(sample_data)),
+                   1.0), -1.0)
+
+
+def _load_json_cmvn(json_cmvn_file):
+    """Load the json format cmvn stats file and calculate cmvn.
+
+    Args:
+        json_cmvn_file: cmvn stats file in json format
+
+    Returns:
+        a numpy array of [means, vars]
+    """
+    with open(json_cmvn_file) as f:
+        cmvn_stats = json.load(f)
+
+    means = cmvn_stats['mean_stat']
+    variance = cmvn_stats['var_stat']
+    count = cmvn_stats['frame_num']
+    for i in range(len(means)):
+        means[i] /= count
+        variance[i] = variance[i] / count - means[i] * means[i]
+        if variance[i] < 1.0e-20:
+            variance[i] = 1.0e-20
+        variance[i] = 1.0 / math.sqrt(variance[i])
+    cmvn = np.array([means, variance])
+    return cmvn
+
+
+def _load_kaldi_cmvn(kaldi_cmvn_file):
+    """Load the kaldi format cmvn stats file and calculate cmvn.
+
+    Args:
+        kaldi_cmvn_file: kaldi text style global cmvn file, which
+            is generated by:
+            compute-cmvn-stats --binary=false scp:feats.scp global_cmvn
+
+    Returns:
+        a numpy array of [means, vars]
+    """
+    means = []
+    variance = []
+    with open(kaldi_cmvn_file, 'r') as fid:
+        # kaldi binary files start with '\0B'
+        if fid.read(2) == '\0B':
+            logger.error('kaldi cmvn binary file is not supported, please '
+                         'recompute it by: compute-cmvn-stats --binary=false '
+                         ' scp:feats.scp global_cmvn')
+            sys.exit(1)
+        fid.seek(0)
+        arr = fid.read().split()
+        assert (arr[0] == '[')
+        assert (arr[-2] == '0')
+        assert (arr[-1] == ']')
+        feat_dim = int((len(arr) - 2 - 2) / 2)
+        for i in range(1, feat_dim + 1):
+            means.append(float(arr[i]))
+        count = float(arr[feat_dim + 1])
+        for i in range(feat_dim + 2, 2 * feat_dim + 2):
+            variance.append(float(arr[i]))
+
+    for i in range(len(means)):
+        means[i] /= count
+        variance[i] = variance[i] / count - means[i] * means[i]
+        if variance[i] < 1.0e-20:
+            variance[i] = 1.0e-20
+        variance[i] = 1.0 / math.sqrt(variance[i])
+    cmvn = np.array([means, variance])
+    return cmvn
+
+
+def load_cmvn(cmvn_file: str, filetype: str):
+    """Load cmvn from file.
+
+    Args:
+        cmvn_file (str): cmvn path.
+        filetype (str): file type, one of [npz, json, kaldi].
+
+    Raises:
+        ValueError: if the file type is not supported.
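Editor's note: a quick numeric check of the helpers above (max_dbfs / gain_db_to_ratio / normalize_audio), assuming the patch's deepspeech package is on PYTHONPATH:

    import numpy as np

    from deepspeech.frontend.utility import normalize_audio

    t = np.linspace(0, 1, 16000, endpoint=False)
    x = 0.1 * np.sin(2 * np.pi * 440 * t)  # peak amplitude 0.1
    # max_dbfs(x) = 20*log10(0.1) - 3.0103 = -23.0103 dBFS, so the default
    # target of -3.0103 dBFS applies gain_db_to_ratio(20.0) = 10x gain.
    y = normalize_audio(x)
    assert np.isclose(np.max(np.abs(y)), 1.0)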
+ + Returns: + Tuple[np.ndarray, np.ndarray]: mean, istd + """ + assert filetype in ['npz', 'json', 'kaldi'], filetype + filetype = filetype.lower() + if filetype == "json": + cmvn = _load_json_cmvn(cmvn_file) + elif filetype == "kaldi": + cmvn = _load_kaldi_cmvn(cmvn_file) + else: + raise ValueError(f"cmvn file type no support: {filetype}") + return cmvn[0], cmvn[1] diff --git a/examples/transv1.8to2.x/deepspeech/io/__init__.py b/examples/transv1.8to2.x/deepspeech/io/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/io/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/transv1.8to2.x/deepspeech/io/batchfy.py b/examples/transv1.8to2.x/deepspeech/io/batchfy.py new file mode 100644 index 00000000..de29d054 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/io/batchfy.py @@ -0,0 +1,469 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import itertools + +import numpy as np + +from deepspeech.utils.log import Log + +__all__ = ["make_batchset"] + +logger = Log(__name__).getlog() + + +def batchfy_by_seq( + sorted_data, + batch_size, + max_length_in, + max_length_out, + min_batch_size=1, + shortest_first=False, + ikey="input", + iaxis=0, + okey="output", + oaxis=0, ): + """Make batch set from json dictionary + + :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json + :param int batch_size: batch size + :param int max_length_in: maximum length of input to decide adaptive batch size + :param int max_length_out: maximum length of output to decide adaptive batch size + :param int min_batch_size: mininum batch size (for multi-gpu) + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + :param str ikey: key to access input + (for ASR ikey="input", for TTS, MT ikey="output".) + :param int iaxis: dimension to access input + (for ASR, TTS iaxis=0, for MT iaxis="1".) + :param str okey: key to access output + (for ASR, MT okey="output". for TTS okey="input".) + :param int oaxis: dimension to access output + (for ASR, TTS, MT oaxis=0, reserved for future research, -1 means all axis.) 
+ :return: List[List[Tuple[str, dict]]] list of batches + """ + if batch_size <= 0: + raise ValueError(f"Invalid batch_size={batch_size}") + + # check #utts is more than min_batch_size + if len(sorted_data) < min_batch_size: + raise ValueError( + f"#utts({len(sorted_data)}) is less than min_batch_size({min_batch_size})." + ) + + # make list of minibatches + minibatches = [] + start = 0 + while True: + _, info = sorted_data[start] + ilen = int(info[ikey][iaxis]["shape"][0]) + olen = (int(info[okey][oaxis]["shape"][0]) if oaxis >= 0 else + max(map(lambda x: int(x["shape"][0]), info[okey]))) + factor = max(int(ilen / max_length_in), int(olen / max_length_out)) + # change batchsize depending on the input and output length + # if ilen = 1000 and max_length_in = 800 + # then b = batchsize / 2 + # and max(min_batches, .) avoids batchsize = 0 + bs = max(min_batch_size, int(batch_size / (1 + factor))) + end = min(len(sorted_data), start + bs) + minibatch = sorted_data[start:end] + if shortest_first: + minibatch.reverse() + + # check each batch is more than minimum batchsize + if len(minibatch) < min_batch_size: + mod = min_batch_size - len(minibatch) % min_batch_size + additional_minibatch = [ + sorted_data[i] for i in np.random.randint(0, start, mod) + ] + if shortest_first: + additional_minibatch.reverse() + minibatch.extend(additional_minibatch) + minibatches.append(minibatch) + + if end == len(sorted_data): + break + start = end + + # batch: List[List[Tuple[str, dict]]] + return minibatches + + +def batchfy_by_bin( + sorted_data, + batch_bins, + num_batches=0, + min_batch_size=1, + shortest_first=False, + ikey="input", + okey="output", ): + """Make variably sized batch set, which maximizes + + the number of bins up to `batch_bins`. + + :param List[(str, Dict[str, Any])] sorted_data: dictionary loaded from data.json + :param int batch_bins: Maximum frames of a batch + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param int test: Return only every `test` batches + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + + :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".) + :param str okey: key to access output (for ASR okey="output". for TTS okey="input".) 
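Editor's note: the adaptive batch size in batchfy_by_seq above shrinks the batch for long utterances. A worked example of that arithmetic (values are illustrative):

    # batch_size=32, max_length_in=800, max_length_out=150, min_batch_size=1
    ilen, olen = 1600, 120
    factor = max(int(ilen / 800), int(olen / 150))  # max(2, 0) = 2
    bs = max(1, int(32 / (1 + factor)))             # int(32 / 3) = 10
    assert (factor, bs) == (2, 10)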
+ + :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches + """ + if batch_bins <= 0: + raise ValueError(f"invalid batch_bins={batch_bins}") + length = len(sorted_data) + idim = int(sorted_data[0][1][ikey][0]["shape"][1]) + odim = int(sorted_data[0][1][okey][0]["shape"][1]) + logger.info("# utts: " + str(len(sorted_data))) + minibatches = [] + start = 0 + n = 0 + while True: + # Dynamic batch size depending on size of samples + b = 0 + next_size = 0 + max_olen = 0 + while next_size < batch_bins and (start + b) < length: + ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) * idim + olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) * odim + if olen > max_olen: + max_olen = olen + next_size = (max_olen + ilen) * (b + 1) + if next_size <= batch_bins: + b += 1 + elif next_size == 0: + raise ValueError( + f"Can't fit one sample in batch_bins ({batch_bins}): " + f"Please increase the value") + end = min(length, start + max(min_batch_size, b)) + batch = sorted_data[start:end] + if shortest_first: + batch.reverse() + minibatches.append(batch) + # Check for min_batch_size and fixes the batches if needed + i = -1 + while len(minibatches[i]) < min_batch_size: + missing = min_batch_size - len(minibatches[i]) + if -i == len(minibatches): + minibatches[i + 1].extend(minibatches[i]) + minibatches = minibatches[1:] + break + else: + minibatches[i].extend(minibatches[i - 1][:missing]) + minibatches[i - 1] = minibatches[i - 1][missing:] + i -= 1 + if end == length: + break + start = end + n += 1 + if num_batches > 0: + minibatches = minibatches[:num_batches] + lengths = [len(x) for x in minibatches] + logger.info( + str(len(minibatches)) + " batches containing from " + str(min(lengths)) + + " to " + str(max(lengths)) + " samples " + "(avg " + str( + int(np.mean(lengths))) + " samples).") + return minibatches + + +def batchfy_by_frame( + sorted_data, + max_frames_in, + max_frames_out, + max_frames_inout, + num_batches=0, + min_batch_size=1, + shortest_first=False, + ikey="input", + okey="output", ): + """Make variable batch set, which maximizes the number of frames to max_batch_frame. + + :param List[(str, Dict[str, Any])] sorteddata: dictionary loaded from data.json + :param int max_frames_in: Maximum input frames of a batch + :param int max_frames_out: Maximum output frames of a batch + :param int max_frames_inout: Maximum input+output frames of a batch + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param int test: Return only every `test` batches + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + + :param str ikey: key to access input (for ASR ikey="input", for TTS ikey="output".) + :param str okey: key to access output (for ASR okey="output". for TTS okey="input".) 
+ + :return: List[Tuple[str, Dict[str, List[Dict[str, Any]]]] list of batches + """ + if max_frames_in <= 0 and max_frames_out <= 0 and max_frames_inout <= 0: + raise ValueError( + "At least, one of `--batch-frames-in`, `--batch-frames-out` or " + "`--batch-frames-inout` should be > 0") + length = len(sorted_data) + minibatches = [] + start = 0 + end = 0 + while end != length: + # Dynamic batch size depending on size of samples + b = 0 + max_olen = 0 + max_ilen = 0 + while (start + b) < length: + ilen = int(sorted_data[start + b][1][ikey][0]["shape"][0]) + if ilen > max_frames_in and max_frames_in != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-in ({max_frames_in}): " + f"Please increase the value") + olen = int(sorted_data[start + b][1][okey][0]["shape"][0]) + if olen > max_frames_out and max_frames_out != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-out ({max_frames_out}): " + f"Please increase the value") + if ilen + olen > max_frames_inout and max_frames_inout != 0: + raise ValueError( + f"Can't fit one sample in --batch-frames-out ({max_frames_inout}): " + f"Please increase the value") + max_olen = max(max_olen, olen) + max_ilen = max(max_ilen, ilen) + in_ok = max_ilen * (b + 1) <= max_frames_in or max_frames_in == 0 + out_ok = max_olen * (b + 1) <= max_frames_out or max_frames_out == 0 + inout_ok = (max_ilen + max_olen) * ( + b + 1) <= max_frames_inout or max_frames_inout == 0 + if in_ok and out_ok and inout_ok: + # add more seq in the minibatch + b += 1 + else: + # no more seq in the minibatch + break + end = min(length, start + b) + batch = sorted_data[start:end] + if shortest_first: + batch.reverse() + minibatches.append(batch) + # Check for min_batch_size and fixes the batches if needed + i = -1 + while len(minibatches[i]) < min_batch_size: + missing = min_batch_size - len(minibatches[i]) + if -i == len(minibatches): + minibatches[i + 1].extend(minibatches[i]) + minibatches = minibatches[1:] + break + else: + minibatches[i].extend(minibatches[i - 1][:missing]) + minibatches[i - 1] = minibatches[i - 1][missing:] + i -= 1 + start = end + if num_batches > 0: + minibatches = minibatches[:num_batches] + lengths = [len(x) for x in minibatches] + logger.info( + str(len(minibatches)) + " batches containing from " + str(min(lengths)) + + " to " + str(max(lengths)) + " samples" + "(avg " + str( + int(np.mean(lengths))) + " samples).") + + return minibatches + + +def batchfy_shuffle(data, batch_size, min_batch_size, num_batches, + shortest_first): + import random + + logger.info("use shuffled batch.") + sorted_data = random.sample(data.items(), len(data.items())) + logger.info("# utts: " + str(len(sorted_data))) + # make list of minibatches + minibatches = [] + start = 0 + while True: + end = min(len(sorted_data), start + batch_size) + # check each batch is more than minimum batchsize + minibatch = sorted_data[start:end] + if shortest_first: + minibatch.reverse() + if len(minibatch) < min_batch_size: + mod = min_batch_size - len(minibatch) % min_batch_size + additional_minibatch = [ + sorted_data[i] for i in np.random.randint(0, start, mod) + ] + if shortest_first: + additional_minibatch.reverse() + minibatch.extend(additional_minibatch) + minibatches.append(minibatch) + if end == len(sorted_data): + break + start = end + + # for debugging + if num_batches > 0: + minibatches = minibatches[:num_batches] + logger.info("# minibatches: " + str(len(minibatches))) + return minibatches + + +BATCH_COUNT_CHOICES = ["auto", "seq", "bin", "frame"] 
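Editor's note: a simplified trace of the bin criterion used by batchfy_by_bin above. The batch grows while (max_olen + ilen) * (b + 1) stays within batch_bins; here idim = odim = 1 and the utterance lengths are illustrative:

    utts = [(1500, 40), (1400, 35), (1300, 30)]  # (ilen, olen) per utterance
    batch_bins = 20000
    b, max_olen = 0, 0
    while b < len(utts):
        ilen, olen = utts[b]
        max_olen = max(max_olen, olen)
        next_size = (max_olen + ilen) * (b + 1)
        if next_size > batch_bins:
            break
        b += 1
    # (40+1500)*1=1540, (40+1400)*2=2880, (40+1300)*3=4020 -> all three fit
    assert b == 3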
+BATCH_SORT_KEY_CHOICES = ["input", "output", "shuffle"] + + +def make_batchset( + data, + batch_size=0, + max_length_in=float("inf"), + max_length_out=float("inf"), + num_batches=0, + min_batch_size=1, + shortest_first=False, + batch_sort_key="input", + count="auto", + batch_bins=0, + batch_frames_in=0, + batch_frames_out=0, + batch_frames_inout=0, + iaxis=0, + oaxis=0, ): + """Make batch set from json dictionary + + if utts have "category" value, + + >>> data = [{'category': 'A', 'input': ..., 'utt':'utt1'}, + ... {'category': 'B', 'input': ..., 'utt':'utt2'}, + ... {'category': 'B', 'input': ..., 'utt':'utt3'}, + ... {'category': 'A', 'input': ..., 'utt':'utt4'}] + >>> make_batchset(data, batchsize=2, ...) + [[('utt1', ...), ('utt4', ...)], [('utt2', ...), ('utt3': ...)]] + + Note that if any utts doesn't have "category", + perform as same as batchfy_by_{count} + + :param List[Dict[str, Any]] data: dictionary loaded from data.json + :param int batch_size: maximum number of sequences in a minibatch. + :param int batch_bins: maximum number of bins (frames x dim) in a minibatch. + :param int batch_frames_in: maximum number of input frames in a minibatch. + :param int batch_frames_out: maximum number of output frames in a minibatch. + :param int batch_frames_out: maximum number of input+output frames in a minibatch. + :param str count: strategy to count maximum size of batch. + For choices, see espnet.asr.batchfy.BATCH_COUNT_CHOICES + + :param int max_length_in: maximum length of input to decide adaptive batch size + :param int max_length_out: maximum length of output to decide adaptive batch size + :param int num_batches: # number of batches to use (for debug) + :param int min_batch_size: minimum batch size (for multi-gpu) + :param bool shortest_first: Sort from batch with shortest samples + to longest if true, otherwise reverse + :param str batch_sort_key: how to sort data before creating minibatches + ["input", "output", "shuffle"] + :param bool swap_io: if True, use "input" as output and "output" + as input in `data` dict + :param bool mt: if True, use 0-axis of "output" as output and 1-axis of "output" + as input in `data` dict + :param int iaxis: dimension to access input + (for ASR, TTS iaxis=0, for MT iaxis="1".) + :param int oaxis: dimension to access output (for ASR, TTS, MT oaxis=0, + reserved for future research, -1 means all axis.) 
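Editor's note: with count="auto", make_batchset picks the counting strategy from whichever size argument is non-zero, as shown in its body below. A compact sketch of that resolution order (names here are illustrative):

    def detect_count(batch_size=0, batch_bins=0, batch_frames_in=0,
                     batch_frames_out=0, batch_frames_inout=0):
        if batch_size != 0:
            return "seq"
        if batch_bins != 0:
            return "bin"
        if batch_frames_in or batch_frames_out or batch_frames_inout:
            return "frame"
        raise ValueError("cannot detect `count`")

    assert detect_count(batch_size=32) == "seq"
    assert detect_count(batch_bins=20000) == "bin"
    assert detect_count(batch_frames_in=8000) == "frame"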
+ :return: List[List[Tuple[str, dict]]] list of batches + """ + # check args + if count not in BATCH_COUNT_CHOICES: + raise ValueError( + f"arg 'count' ({count}) should be one of {BATCH_COUNT_CHOICES}") + if batch_sort_key not in BATCH_SORT_KEY_CHOICES: + raise ValueError(f"arg 'batch_sort_key' ({batch_sort_key}) should be " + f"one of {BATCH_SORT_KEY_CHOICES}") + + ikey = "input" + okey = "output" + batch_sort_axis = 0 # index of list + if count == "auto": + if batch_size != 0: + count = "seq" + elif batch_bins != 0: + count = "bin" + elif batch_frames_in != 0 or batch_frames_out != 0 or batch_frames_inout != 0: + count = "frame" + else: + raise ValueError( + f"cannot detect `count` manually set one of {BATCH_COUNT_CHOICES}" + ) + logger.info(f"count is auto detected as {count}") + + if count != "seq" and batch_sort_key == "shuffle": + raise ValueError( + "batch_sort_key=shuffle is only available if batch_count=seq") + + category2data = {} # Dict[str, dict] + for v in data: + k = v['utt'] + category2data.setdefault(v.get("category"), {})[k] = v + + batches_list = [] # List[List[List[Tuple[str, dict]]]] + for d in category2data.values(): + if batch_sort_key == "shuffle": + batches = batchfy_shuffle(d, batch_size, min_batch_size, + num_batches, shortest_first) + batches_list.append(batches) + continue + + # sort it by input lengths (long to short) + sorted_data = sorted( + d.items(), + key=lambda data: int(data[1][batch_sort_key][batch_sort_axis]["shape"][0]), + reverse=not shortest_first, ) + logger.info("# utts: " + str(len(sorted_data))) + + if count == "seq": + batches = batchfy_by_seq( + sorted_data, + batch_size=batch_size, + max_length_in=max_length_in, + max_length_out=max_length_out, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + iaxis=iaxis, + okey=okey, + oaxis=oaxis, ) + if count == "bin": + batches = batchfy_by_bin( + sorted_data, + batch_bins=batch_bins, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + okey=okey, ) + if count == "frame": + batches = batchfy_by_frame( + sorted_data, + max_frames_in=batch_frames_in, + max_frames_out=batch_frames_out, + max_frames_inout=batch_frames_inout, + min_batch_size=min_batch_size, + shortest_first=shortest_first, + ikey=ikey, + okey=okey, ) + batches_list.append(batches) + + if len(batches_list) == 1: + batches = batches_list[0] + else: + # Concat list. This way is faster than "sum(batch_list, [])" + batches = list(itertools.chain(*batches_list)) + + # for debugging + if num_batches > 0: + batches = batches[:num_batches] + logger.info("# minibatches: " + str(len(batches))) + + # batch: List[List[Tuple[str, dict]]] + return batches diff --git a/examples/transv1.8to2.x/deepspeech/io/collator.py b/examples/transv1.8to2.x/deepspeech/io/collator.py new file mode 100644 index 00000000..df300479 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/io/collator.py @@ -0,0 +1,321 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import tarfile
+from collections import namedtuple
+from typing import Optional
+
+import numpy as np
+from yacs.config import CfgNode
+
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.speech import SpeechSegment
+from deepspeech.frontend.utility import IGNORE_ID
+from deepspeech.io.utility import pad_list
+from deepspeech.utils.log import Log
+
+__all__ = ["SpeechCollator"]
+
+logger = Log(__name__).getlog()
+
+# namedtuple needs to be global for pickle.
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
+
+
+class SpeechCollator():
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        default = CfgNode(
+            dict(
+                augmentation_config="",
+                random_seed=0,
+                mean_std_filepath="",
+                unit_type="char",
+                vocab_filepath="",
+                spm_model_prefix="",
+                specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+                feat_dim=0,  # 'mfcc', 'fbank'
+                delta_delta=False,  # 'mfcc', 'fbank'
+                stride_ms=10.0,  # ms
+                window_ms=20.0,  # ms
+                n_fft=None,  # fft points
+                max_freq=None,  # None for samplerate/2
+                target_sample_rate=16000,  # target sample rate
+                use_dB_normalization=True,
+                target_dB=-20,
+                dither=1.0,  # feature dither
+                keep_transcription_text=False))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
+    @classmethod
+    def from_config(cls, config):
+        """Build a SpeechCollator object from a config.
+
+        Args:
+            config (yacs.config.CfgNode): configs object.
+
+        Returns:
+            SpeechCollator: collator object.
+        """
+        assert 'augmentation_config' in config.collator
+        assert 'keep_transcription_text' in config.collator
+        assert 'mean_std_filepath' in config.collator
+        assert 'vocab_filepath' in config.collator
+        assert 'specgram_type' in config.collator
+        assert 'n_fft' in config.collator
+        assert config.collator
+
+        if isinstance(config.collator.augmentation_config, (str, bytes)):
+            if config.collator.augmentation_config:
+                aug_file = io.open(
+                    config.collator.augmentation_config,
+                    mode='r',
+                    encoding='utf8')
+            else:
+                aug_file = io.StringIO(initial_value='{}', newline='')
+        else:
+            aug_file = config.collator.augmentation_config
+        assert isinstance(aug_file, io.StringIO)
+
+        speech_collator = cls(
+            aug_file=aug_file,
+            random_seed=0,
+            mean_std_filepath=config.collator.mean_std_filepath,
+            unit_type=config.collator.unit_type,
+            vocab_filepath=config.collator.vocab_filepath,
+            spm_model_prefix=config.collator.spm_model_prefix,
+            specgram_type=config.collator.specgram_type,
+            feat_dim=config.collator.feat_dim,
+            delta_delta=config.collator.delta_delta,
+            stride_ms=config.collator.stride_ms,
+            window_ms=config.collator.window_ms,
+            n_fft=config.collator.n_fft,
+            max_freq=config.collator.max_freq,
+            target_sample_rate=config.collator.target_sample_rate,
+            use_dB_normalization=config.collator.use_dB_normalization,
+            target_dB=config.collator.target_dB,
+            dither=config.collator.dither,
+            keep_transcription_text=config.collator.keep_transcription_text)
+        return speech_collator
+
+    def __init__(
+            self,
+            aug_file,
+            mean_std_filepath,
+            vocab_filepath,
+            spm_model_prefix,
+            random_seed=0,
+            unit_type="char",
+            specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
+            feat_dim=0,  # 'mfcc', 'fbank'
+            delta_delta=False,  # 'mfcc', 'fbank'
+            stride_ms=10.0,  # ms
+
window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): + """SpeechCollator Collator + + Args: + unit_type(str): token unit type, e.g. char, word, spm + vocab_filepath (str): vocab file path. + mean_std_filepath (str): mean and std file path, which suffix is *.npy + spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + stride_ms (float, optional): stride size in ms. Defaults to 10.0. + window_ms (float, optional): window size in ms. Defaults to 20.0. + n_fft (int, optional): fft points for rfft. Defaults to None. + max_freq (int, optional): max cut freq. Defaults to None. + target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. + specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. + delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. + use_dB_normalization (bool, optional): do dB normalization. Defaults to True. + target_dB (int, optional): target dB. Defaults to -20. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + if ``keep_transcription_text`` is False, text is token ids else is raw string. + + Do augmentations + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. + """ + self._keep_transcription_text = keep_transcription_text + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), random_seed=random_seed) + + self._normalizer = FeatureNormalizer( + mean_std_filepath) if mean_std_filepath else None + + self._stride_ms = stride_ms + self._target_sample_rate = target_sample_rate + + self._speech_featurizer = SpeechFeaturizer( + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + specgram_type=specgram_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. 
+ """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + def process_utterance(self, audio_file, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param transcript: Transcription text. + :type transcript: str + :return: Tuple of audio feature tensor and data of transcription part, + where transcription part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), transcript) + else: + speech_segment = SpeechSegment.from_file(audio_file, transcript) + + # audio augment + self._augmentation_pipeline.transform_audio(speech_segment) + + specgram, transcript_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + return specgram, transcript_part + + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (T, D) + text (List[int] or str): shape (U,) + + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. 
+ audio : (B, Tmax, D) + audio_lens: (B) + text : (B, Umax) + text_lens: (B) + """ + audios = [] + audio_lens = [] + texts = [] + text_lens = [] + utts = [] + for utt, audio, text in batch: + audio, text = self.process_utterance(audio, text) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [] + if self._keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + else: + tokens = text # token ids + tokens = tokens if isinstance(tokens, np.ndarray) else np.array( + tokens, dtype=np.int64) + texts.append(tokens) + text_lens.append(tokens.shape[0]) + + #[B, T, D] + xs_pad = pad_list(audios, 0.0).astype(np.float32) + ilens = np.array(audio_lens).astype(np.int64) + ys_pad = pad_list(texts, IGNORE_ID).astype(np.int64) + olens = np.array(text_lens).astype(np.int64) + return utts, xs_pad, ilens, ys_pad, olens + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + + @property + def text_feature(self): + return self._speech_featurizer.text_feature + + @property + def feature_size(self): + return self._speech_featurizer.feature_size + + @property + def stride_ms(self): + return self._speech_featurizer.stride_ms diff --git a/examples/transv1.8to2.x/deepspeech/io/collator_st.py b/examples/transv1.8to2.x/deepspeech/io/collator_st.py new file mode 100644 index 00000000..28573366 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/io/collator_st.py @@ -0,0 +1,631 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import io +from collections import namedtuple +from typing import Optional + +import kaldiio +import numpy as np +from yacs.config import CfgNode + +from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline +from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer +from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer +from deepspeech.frontend.normalizer import FeatureNormalizer +from deepspeech.frontend.speech import SpeechSegment +from deepspeech.frontend.utility import IGNORE_ID +from deepspeech.io.utility import pad_sequence +from deepspeech.utils.log import Log + +__all__ = ["SpeechCollator", "KaldiPrePorocessedCollator"] + +logger = Log(__name__).getlog() + +# namedtupe need global for pickle. 
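Editor's note: __call__ above pads (T, D) features with zeros to (B, Tmax, D) and token ids with IGNORE_ID to (B, Umax). The real helper is deepspeech.io.utility.pad_list; a minimal NumPy equivalent for reference:

    import numpy as np

    def pad_list(xs, pad_value):
        """Pad a list of (T, ...) arrays to (B, Tmax, ...) with pad_value."""
        n_batch = len(xs)
        max_len = max(x.shape[0] for x in xs)
        pad = np.full((n_batch, max_len) + xs[0].shape[1:], pad_value,
                      dtype=xs[0].dtype)
        for i, x in enumerate(xs):
            pad[i, :x.shape[0]] = x
        return pad

    feats = [np.ones((5, 3)), np.ones((2, 3))]
    assert pad_list(feats, 0.0).shape == (2, 5, 3)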
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object']) + + +class SpeechCollator(): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + mean_std_filepath="", + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, # feature dither + keep_transcription_text=False)) + + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + SpeechCollator: collator object. + """ + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.collator + assert 'specgram_type' in config.collator + assert 'n_fft' in config.collator + assert config.collator + + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: + aug_file = io.open( + config.collator.augmentation_config, + mode='r', + encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.collator.augmentation_config + assert isinstance(aug_file, io.StringIO) + + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + mean_std_filepath=config.collator.mean_std_filepath, + unit_type=config.collator.unit_type, + vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + specgram_type=config.collator.specgram_type, + feat_dim=config.collator.feat_dim, + delta_delta=config.collator.delta_delta, + stride_ms=config.collator.stride_ms, + window_ms=config.collator.window_ms, + n_fft=config.collator.n_fft, + max_freq=config.collator.max_freq, + target_sample_rate=config.collator.target_sample_rate, + use_dB_normalization=config.collator.use_dB_normalization, + target_dB=config.collator.target_dB, + dither=config.collator.dither, + keep_transcription_text=config.collator.keep_transcription_text) + return speech_collator + + def __init__( + self, + aug_file, + mean_std_filepath, + vocab_filepath, + spm_model_prefix, + random_seed=0, + unit_type="char", + specgram_type='linear', # 'linear', 'mfcc', 'fbank' + feat_dim=0, # 'mfcc', 'fbank' + delta_delta=False, # 'mfcc', 'fbank' + stride_ms=10.0, # ms + window_ms=20.0, # ms + n_fft=None, # fft points + max_freq=None, # None for samplerate/2 + target_sample_rate=16000, # target sample rate + use_dB_normalization=True, + target_dB=-20, + dither=1.0, + keep_transcription_text=True): + """SpeechCollator Collator + + Args: + unit_type(str): token unit type, e.g. char, word, spm + vocab_filepath (str): vocab file path. + mean_std_filepath (str): mean and std file path, which suffix is *.npy + spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + stride_ms (float, optional): stride size in ms. Defaults to 10.0. + window_ms (float, optional): window size in ms. Defaults to 20.0. 
+ n_fft (int, optional): fft points for rfft. Defaults to None. + max_freq (int, optional): max cut freq. Defaults to None. + target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. + specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. + delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. + use_dB_normalization (bool, optional): do dB normalization. Defaults to True. + target_dB (int, optional): target dB. Defaults to -20. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + if ``keep_transcription_text`` is False, text is token ids else is raw string. + + Do augmentations + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. + """ + self._keep_transcription_text = keep_transcription_text + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), random_seed=random_seed) + + self._normalizer = FeatureNormalizer( + mean_std_filepath) if mean_std_filepath else None + + self._stride_ms = stride_ms + self._target_sample_rate = target_sample_rate + + self._speech_featurizer = SpeechFeaturizer( + unit_type=unit_type, + vocab_filepath=vocab_filepath, + spm_model_prefix=spm_model_prefix, + specgram_type=specgram_type, + feat_dim=feat_dim, + delta_delta=delta_delta, + stride_ms=stride_ms, + window_ms=window_ms, + n_fft=n_fft, + max_freq=max_freq, + target_sample_rate=target_sample_rate, + use_dB_normalization=use_dB_normalization, + target_dB=target_dB, + dither=dither) + + def _parse_tar(self, file): + """Parse a tar file to get a tarfile object + and a map containing tarinfoes + """ + result = {} + f = tarfile.open(file) + for tarinfo in f.getmembers(): + result[tarinfo.name] = tarinfo + return f, result + + def _subfile_from_tar(self, file): + """Get subfile object from tar. + + It will return a subfile object from tar file + and cached tar file info for next reading request. + """ + tarpath, filename = file.split(':', 1)[1].split('#', 1) + if 'tar2info' not in self._local_data.__dict__: + self._local_data.tar2info = {} + if 'tar2object' not in self._local_data.__dict__: + self._local_data.tar2object = {} + if tarpath not in self._local_data.tar2info: + object, infoes = self._parse_tar(tarpath) + self._local_data.tar2info[tarpath] = infoes + self._local_data.tar2object[tarpath] = object + return self._local_data.tar2object[tarpath].extractfile( + self._local_data.tar2info[tarpath][filename]) + + @property + def manifest(self): + return self._manifest + + @property + def vocab_size(self): + return self._speech_featurizer.vocab_size + + @property + def vocab_list(self): + return self._speech_featurizer.vocab_list + + @property + def vocab_dict(self): + return self._speech_featurizer.vocab_dict + + @property + def text_feature(self): + return self._speech_featurizer.text_feature + + @property + def feature_size(self): + return self._speech_featurizer.feature_size + + @property + def stride_ms(self): + return self._speech_featurizer.stride_ms + + def process_utterance(self, audio_file, translation): + """Load, augment, featurize and normalize for speech data. 
+ + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param translation: translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), translation) + else: + speech_segment = SpeechSegment.from_file(audio_file, translation) + + # audio augment + self._augmentation_pipeline.transform_audio(speech_segment) + + specgram, translation_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + return specgram, translation_part + + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (T, D) + text (List[int] or str): shape (U,) + + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. + audio : (B, Tmax, D) + audio_lens: (B) + text : (B, Umax) + text_lens: (B) + """ + audios = [] + audio_lens = [] + texts = [] + text_lens = [] + utts = [] + for utt, audio, text in batch: + audio, text = self.process_utterance(audio, text) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [] + if self._keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens = [ord(t) for t in text] + else: + tokens = text # token ids + tokens = tokens if isinstance(tokens, np.ndarray) else np.array( + tokens, dtype=np.int64) + texts.append(tokens) + text_lens.append(tokens.shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_texts = pad_sequence( + texts, padding_value=IGNORE_ID).astype(np.int64) + text_lens = np.array(text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, padded_texts, text_lens + + +class TripletSpeechCollator(SpeechCollator): + def process_utterance(self, audio_file, translation, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of audio file. + :type audio_file: str | file + :param translation: translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. 
+ :rtype: tuple of (2darray, list) + """ + if isinstance(audio_file, str) and audio_file.startswith('tar:'): + speech_segment = SpeechSegment.from_file( + self._subfile_from_tar(audio_file), translation) + else: + speech_segment = SpeechSegment.from_file(audio_file, translation) + + # audio augment + self._augmentation_pipeline.transform_audio(speech_segment) + + specgram, translation_part = self._speech_featurizer.featurize( + speech_segment, self._keep_transcription_text) + transcript_part = self._speech_featurizer._text_featurizer.featurize( + transcript) + if self._normalizer: + specgram = self._normalizer.apply(specgram) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + return specgram, translation_part, transcript_part + + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (T, D) + text (List[int] or str): shape (U,) + + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. + audio : (B, Tmax, D) + audio_lens: (B) + text : (B, Umax) + text_lens: (B) + """ + audios = [] + audio_lens = [] + translation_text = [] + translation_text_lens = [] + transcription_text = [] + transcription_text_lens = [] + + utts = [] + for utt, audio, translation, transcription in batch: + audio, translation, transcription = self.process_utterance( + audio, translation, transcription) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [[], []] + for idx, text in enumerate([translation, transcription]): + if self._keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens[idx] = [ord(t) for t in text] + else: + tokens[idx] = text # token ids + tokens[idx] = tokens[idx] if isinstance( + tokens[idx], np.ndarray) else np.array( + tokens[idx], dtype=np.int64) + translation_text.append(tokens[0]) + translation_text_lens.append(tokens[0].shape[0]) + transcription_text.append(tokens[1]) + transcription_text_lens.append(tokens[1].shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_translation = pad_sequence( + translation_text, padding_value=IGNORE_ID).astype(np.int64) + translation_lens = np.array(translation_text_lens).astype(np.int64) + padded_transcription = pad_sequence( + transcription_text, padding_value=IGNORE_ID).astype(np.int64) + transcription_lens = np.array(transcription_text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, ( + padded_translation, padded_transcription), (translation_lens, + transcription_lens) + + +class KaldiPrePorocessedCollator(SpeechCollator): + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + augmentation_config="", + random_seed=0, + unit_type="char", + vocab_filepath="", + spm_model_prefix="", + feat_dim=0, + stride_ms=10.0, + keep_transcription_text=False)) + + if config is not None: + config.merge_from_other_cfg(default) + return default + + @classmethod + def from_config(cls, config): + """Build a SpeechCollator object from a config. + + Args: + config (yacs.config.CfgNode): configs object. + + Returns: + SpeechCollator: collator object. 
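+
+        An illustrative config (field values are placeholders):
+
+            >>> from yacs.config import CfgNode
+            >>> config = CfgNode(dict(collator=dict(
+            ...     augmentation_config="",
+            ...     keep_transcription_text=False,
+            ...     unit_type="char",
+            ...     vocab_filepath="vocab.txt",
+            ...     spm_model_prefix="",
+            ...     feat_dim=83,
+            ...     stride_ms=10.0)))
+            >>> collator = KaldiPrePorocessedCollator.from_config(config)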
+ """ + assert 'augmentation_config' in config.collator + assert 'keep_transcription_text' in config.collator + assert 'vocab_filepath' in config.collator + assert config.collator + + if isinstance(config.collator.augmentation_config, (str, bytes)): + if config.collator.augmentation_config: + aug_file = io.open( + config.collator.augmentation_config, + mode='r', + encoding='utf8') + else: + aug_file = io.StringIO(initial_value='{}', newline='') + else: + aug_file = config.collator.augmentation_config + assert isinstance(aug_file, io.StringIO) + + speech_collator = cls( + aug_file=aug_file, + random_seed=0, + unit_type=config.collator.unit_type, + vocab_filepath=config.collator.vocab_filepath, + spm_model_prefix=config.collator.spm_model_prefix, + feat_dim=config.collator.feat_dim, + stride_ms=config.collator.stride_ms, + keep_transcription_text=config.collator.keep_transcription_text) + return speech_collator + + def __init__(self, + aug_file, + vocab_filepath, + spm_model_prefix, + random_seed=0, + unit_type="char", + feat_dim=0, + stride_ms=10.0, + keep_transcription_text=True): + """SpeechCollator Collator + + Args: + unit_type(str): token unit type, e.g. char, word, spm + vocab_filepath (str): vocab file path. + spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + random_seed (int, optional): for random generator. Defaults to 0. + keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + if ``keep_transcription_text`` is False, text is token ids else is raw string. + + Do augmentations + Padding audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. + """ + self._keep_transcription_text = keep_transcription_text + self._feat_dim = feat_dim + self._stride_ms = stride_ms + + self._local_data = TarLocalData(tar2info={}, tar2object={}) + self._augmentation_pipeline = AugmentationPipeline( + augmentation_config=aug_file.read(), random_seed=random_seed) + + self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath, + spm_model_prefix) + + def process_utterance(self, audio_file, translation): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of kaldi processed feature. + :type audio_file: str | file + :param translation: Translation text. + :type translation: str + :return: Tuple of audio feature tensor and data of translation part, + where translation part could be token ids or text. + :rtype: tuple of (2darray, list) + """ + specgram = kaldiio.load_mat(audio_file) + assert specgram.shape[ + 1] == self._feat_dim, 'expect feat dim {}, but got {}'.format( + self._feat_dim, specgram.shape[1]) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + + if self._keep_transcription_text: + return specgram, translation + else: + text_ids = self._text_featurizer.featurize(translation) + return specgram, text_ids + + +class TripletKaldiPrePorocessedCollator(KaldiPrePorocessedCollator): + def process_utterance(self, audio_file, translation, transcript): + """Load, augment, featurize and normalize for speech data. + + :param audio_file: Filepath or file object of kali processed feature. + :type audio_file: str | file + :param translation: Translation text. + :type translation: str + :param transcript: Transcription text. 
+ :type transcript: str + :return: Tuple of audio feature tensor and data of translation and transcription parts, + where translation and transcription parts could be token ids or text. + :rtype: tuple of (2darray, (list, list)) + """ + specgram = kaldiio.load_mat(audio_file) + assert specgram.shape[ + 1] == self._feat_dim, 'expect feat dim {}, but got {}'.format( + self._feat_dim, specgram.shape[1]) + + # specgram augment + specgram = self._augmentation_pipeline.transform_feature(specgram) + + if self._keep_transcription_text: + return specgram, translation, transcript + else: + translation_text_ids = self._text_featurizer.featurize(translation) + transcript_text_ids = self._text_featurizer.featurize(transcript) + return specgram, translation_text_ids, transcript_text_ids + + def __call__(self, batch): + """batch examples + + Args: + batch ([List]): batch is (audio, text) + audio (np.ndarray) shape (T, D) + translation (List[int] or str): shape (U,) + transcription (List[int] or str): shape (V,) + + Returns: + tuple(audio, text, audio_lens, text_lens): batched data. + audio : (B, Tmax, D) + audio_lens: (B) + translation_text : (B, Umax) + translation_text_lens: (B) + transcription_text : (B, Vmax) + transcription_text_lens: (B) + """ + audios = [] + audio_lens = [] + translation_text = [] + translation_text_lens = [] + transcription_text = [] + transcription_text_lens = [] + + utts = [] + for utt, audio, translation, transcription in batch: + audio, translation, transcription = self.process_utterance( + audio, translation, transcription) + #utt + utts.append(utt) + # audio + audios.append(audio) # [T, D] + audio_lens.append(audio.shape[0]) + # text + # for training, text is token ids + # else text is string, convert to unicode ord + tokens = [[], []] + for idx, text in enumerate([translation, transcription]): + if self._keep_transcription_text: + assert isinstance(text, str), (type(text), text) + tokens[idx] = [ord(t) for t in text] + else: + tokens[idx] = text # token ids + tokens[idx] = tokens[idx] if isinstance( + tokens[idx], np.ndarray) else np.array( + tokens[idx], dtype=np.int64) + translation_text.append(tokens[0]) + translation_text_lens.append(tokens[0].shape[0]) + transcription_text.append(tokens[1]) + transcription_text_lens.append(tokens[1].shape[0]) + + padded_audios = pad_sequence( + audios, padding_value=0.0).astype(np.float32) #[B, T, D] + audio_lens = np.array(audio_lens).astype(np.int64) + padded_translation = pad_sequence( + translation_text, padding_value=IGNORE_ID).astype(np.int64) + translation_lens = np.array(translation_text_lens).astype(np.int64) + padded_transcription = pad_sequence( + transcription_text, padding_value=IGNORE_ID).astype(np.int64) + transcription_lens = np.array(transcription_text_lens).astype(np.int64) + return utts, padded_audios, audio_lens, ( + padded_translation, padded_transcription), (translation_lens, + transcription_lens) diff --git a/examples/transv1.8to2.x/deepspeech/io/converter.py b/examples/transv1.8to2.x/deepspeech/io/converter.py new file mode 100644 index 00000000..b80c7b20 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/io/converter.py @@ -0,0 +1,81 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+
+from deepspeech.io.utility import pad_list
+from deepspeech.utils.log import Log
+
+__all__ = ["CustomConverter"]
+
+logger = Log(__name__).getlog()
+
+
+class CustomConverter():
+    """Custom batch converter.
+
+    Args:
+        subsampling_factor (int): The subsampling factor.
+        dtype (np.dtype): Data type to convert.
+
+    """
+
+    def __init__(self, subsampling_factor=1, dtype=np.float32):
+        """Construct a CustomConverter object."""
+        self.subsampling_factor = subsampling_factor
+        self.ignore_id = -1
+        self.dtype = dtype
+
+    def __call__(self, batch):
+        """Transform a batch and send it to a device.
+
+        Args:
+            batch (list): The batch to transform.
+
+        Returns:
+            tuple(np.ndarray, np.ndarray, np.ndarray)
+
+        """
+        # batch should be a list containing a single pre-batched example
+        assert len(batch) == 1
+        (xs, ys), utts = batch[0]
+        assert xs[0] is not None, "please check Reader and Augmentation impl."
+
+        # perform subsampling
+        if self.subsampling_factor > 1:
+            xs = [x[::self.subsampling_factor, :] for x in xs]
+
+        # get batch of lengths of input sequences
+        ilens = np.array([x.shape[0] for x in xs])
+
+        # perform padding and convert to tensor
+        # currently only support real number
+        if xs[0].dtype.kind == "c":
+            xs_pad_real = pad_list([x.real for x in xs], 0).astype(self.dtype)
+            xs_pad_imag = pad_list([x.imag for x in xs], 0).astype(self.dtype)
+            # Note(kamo):
+            # {'real': ..., 'imag': ...} will be changed to ComplexTensor in E2E.
+            # Don't create ComplexTensor and give it to E2E here
+            # because torch.nn.DataParallel can't handle it.
+            xs_pad = {"real": xs_pad_real, "imag": xs_pad_imag}
+        else:
+            xs_pad = pad_list(xs, 0).astype(self.dtype)
+
+        # NOTE: this is for multi-output (e.g., speech translation)
+        ys_pad = pad_list(
+            [np.array(y[0][:]) if isinstance(y, tuple) else y for y in ys],
+            self.ignore_id)
+
+        olens = np.array(
+            [y[0].shape[0] if isinstance(y, tuple) else y.shape[0] for y in ys])
+        return utts, xs_pad, ilens, ys_pad, olens
diff --git a/examples/transv1.8to2.x/deepspeech/io/dataloader.py b/examples/transv1.8to2.x/deepspeech/io/dataloader.py
new file mode 100644
index 00000000..a35a0bc0
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/io/dataloader.py
@@ -0,0 +1,170 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
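+# For orientation: BatchDataLoader below wires TransformDataset together with
+# the CustomConverter defined in deepspeech.io.converter. A minimal sketch of
+# the converter contract (shapes and values illustrative only):
+#     import numpy as np
+#     from deepspeech.io.converter import CustomConverter
+#     xs = [np.zeros((100, 83), np.float32), np.zeros((60, 83), np.float32)]
+#     ys = [np.array([1, 2, 3], np.int64), np.array([4, 5], np.int64)]
+#     conv = CustomConverter(subsampling_factor=1, dtype=np.float32)
+#     utts, xs_pad, ilens, ys_pad, olens = conv([((xs, ys), ["utt1", "utt2"])])
+#     # xs_pad: (2, 100, 83); ilens: [100, 60]; ys_pad: (2, 3) padded with -1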
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Text
+
+import numpy as np
+from paddle.io import DataLoader
+
+from deepspeech.frontend.utility import read_manifest
+from deepspeech.io.batchfy import make_batchset
+from deepspeech.io.converter import CustomConverter
+from deepspeech.io.dataset import TransformDataset
+from deepspeech.io.reader import LoadInputsAndTargets
+from deepspeech.utils.log import Log
+
+__all__ = ["BatchDataLoader"]
+
+logger = Log(__name__).getlog()
+
+
+def feat_dim_and_vocab_size(data_json: List[Dict[Text, Any]],
+                            mode: Text="asr",
+                            iaxis=0,
+                            oaxis=0):
+    if mode == 'asr':
+        feat_dim = data_json[0]['input'][iaxis]['shape'][1]
+        vocab_size = data_json[0]['output'][oaxis]['shape'][1]
+    else:
+        raise ValueError(f"{mode} mode is not supported!")
+    return feat_dim, vocab_size
+
+
+def batch_collate(x):
+    """de-tuple.
+
+    Args:
+        x (List[Tuple]): [(utts, xs, ilens, ys, olens)]
+
+    Returns:
+        Tuple: (utts, xs, ilens, ys, olens)
+    """
+    return x[0]
+
+
+class BatchDataLoader():
+    def __init__(self,
+                 json_file: str,
+                 train_mode: bool,
+                 sortagrad: bool=False,
+                 batch_size: int=0,
+                 maxlen_in: float=float('inf'),
+                 maxlen_out: float=float('inf'),
+                 minibatches: int=0,
+                 mini_batch_size: int=1,
+                 batch_count: str='auto',
+                 batch_bins: int=0,
+                 batch_frames_in: int=0,
+                 batch_frames_out: int=0,
+                 batch_frames_inout: int=0,
+                 preprocess_conf=None,
+                 n_iter_processes: int=1,
+                 subsampling_factor: int=1,
+                 num_encs: int=1):
+        self.json_file = json_file
+        self.train_mode = train_mode
+        self.use_sortagrad = sortagrad == -1 or sortagrad > 0
+        self.batch_size = batch_size
+        self.maxlen_in = maxlen_in
+        self.maxlen_out = maxlen_out
+        self.batch_count = batch_count
+        self.batch_bins = batch_bins
+        self.batch_frames_in = batch_frames_in
+        self.batch_frames_out = batch_frames_out
+        self.batch_frames_inout = batch_frames_inout
+        self.subsampling_factor = subsampling_factor
+        self.num_encs = num_encs
+        self.preprocess_conf = preprocess_conf
+        self.n_iter_processes = n_iter_processes
+
+        # read json data
+        self.data_json = read_manifest(json_file)
+        self.feat_dim, self.vocab_size = feat_dim_and_vocab_size(
+            self.data_json, mode='asr')
+
+        # make minibatch list (variable length)
+        self.minibatches = make_batchset(
+            self.data_json,
+            batch_size,
+            maxlen_in,
+            maxlen_out,
+            minibatches,  # for debug
+            min_batch_size=mini_batch_size,
+            shortest_first=self.use_sortagrad,
+            count=batch_count,
+            batch_bins=batch_bins,
+            batch_frames_in=batch_frames_in,
+            batch_frames_out=batch_frames_out,
+            batch_frames_inout=batch_frames_inout,
+            iaxis=0,
+            oaxis=0, )
+
+        # data reader
+        self.reader = LoadInputsAndTargets(
+            mode="asr",
+            load_output=True,
+            preprocess_conf=preprocess_conf,
+            preprocess_args={"train":
+                             train_mode},  # Switch the mode of preprocessing
+        )
+
+        # Setup a converter
+        if num_encs == 1:
+            self.converter = CustomConverter(
+                subsampling_factor=subsampling_factor, dtype=np.float32)
+        else:
+            raise NotImplementedError("CustomConverterMulEnc is not implemented.")
+
+        # Hack: pass batch_size=1 to the DataLoader; the actual batch size is
+        # the length of each pre-batched minibatch. The default collate
+        # function would convert numpy arrays to tensors, so we use a trivial
+        # collate function that simply unwraps the singleton list instead.
+        self.dataset = TransformDataset(self.minibatches, self.converter,
+                                        self.reader)
+
+        self.dataloader = DataLoader(
+            dataset=self.dataset,
+            batch_size=1,
+            shuffle=not self.use_sortagrad if self.train_mode else False,
+            collate_fn=batch_collate,
+            num_workers=self.n_iter_processes, )
+
+    def __repr__(self):
+        echo = f"<{self.__class__.__module__}.{self.__class__.__name__} object at {hex(id(self))}> "
+        echo += f"train_mode: {self.train_mode}, "
+        echo += f"sortagrad: {self.use_sortagrad}, "
+        echo += f"batch_size: {self.batch_size}, "
+        echo += f"maxlen_in: {self.maxlen_in}, "
+        echo += f"maxlen_out: {self.maxlen_out}, "
+        echo += f"batch_count: {self.batch_count}, "
+        echo += f"batch_bins: {self.batch_bins}, "
+        echo += f"batch_frames_in: {self.batch_frames_in}, "
+        echo += f"batch_frames_out: {self.batch_frames_out}, "
+        echo += f"batch_frames_inout: {self.batch_frames_inout}, "
+        echo += f"subsampling_factor: {self.subsampling_factor}, "
+        echo += f"num_encs: {self.num_encs}, "
+        echo += f"num_workers: {self.n_iter_processes}, "
+        echo += f"file: {self.json_file}"
+        return echo
+
+    def __len__(self):
+        return len(self.dataloader)
+
+    def __iter__(self):
+        return self.dataloader.__iter__()
+
+    def __call__(self):
+        return self.__iter__()
diff --git a/examples/transv1.8to2.x/deepspeech/io/dataset.py b/examples/transv1.8to2.x/deepspeech/io/dataset.py
new file mode 100644
index 00000000..d1fe0470
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/io/dataset.py
@@ -0,0 +1,149 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Optional
+
+from paddle.io import Dataset
+from yacs.config import CfgNode
+
+from deepspeech.frontend.utility import read_manifest
+from deepspeech.utils.log import Log
+
+__all__ = ["ManifestDataset", "TripletManifestDataset", "TransformDataset"]
+
+logger = Log(__name__).getlog()
+
+
+class ManifestDataset(Dataset):
+    @classmethod
+    def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
+        default = CfgNode(
+            dict(
+                manifest="",
+                max_input_len=27.0,
+                min_input_len=0.0,
+                max_output_len=float('inf'),
+                min_output_len=0.0,
+                max_output_input_ratio=float('inf'),
+                min_output_input_ratio=0.0, ))
+
+        if config is not None:
+            config.merge_from_other_cfg(default)
+        return default
+
+    @classmethod
+    def from_config(cls, config):
+        """Build a ManifestDataset object from a config.
+
+        Args:
+            config (yacs.config.CfgNode): configs object.
+
+        Returns:
+            ManifestDataset: dataset object.
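+
+        An illustrative config (the manifest path is a placeholder):
+
+            >>> from yacs.config import CfgNode
+            >>> config = CfgNode(dict(data=dict(
+            ...     manifest="data/manifest.train",
+            ...     max_input_len=27.0,
+            ...     min_input_len=0.0,
+            ...     max_output_len=float('inf'),
+            ...     min_output_len=0.0,
+            ...     max_output_input_ratio=float('inf'),
+            ...     min_output_input_ratio=0.0)))
+            >>> dataset = ManifestDataset.from_config(config)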
+ """ + assert 'manifest' in config.data + assert config.data.manifest + + dataset = cls( + manifest_path=config.data.manifest, + max_input_len=config.data.max_input_len, + min_input_len=config.data.min_input_len, + max_output_len=config.data.max_output_len, + min_output_len=config.data.min_output_len, + max_output_input_ratio=config.data.max_output_input_ratio, + min_output_input_ratio=config.data.min_output_input_ratio, ) + return dataset + + def __init__(self, + manifest_path, + max_input_len=float('inf'), + min_input_len=0.0, + max_output_len=float('inf'), + min_output_len=0.0, + max_output_input_ratio=float('inf'), + min_output_input_ratio=0.0): + """Manifest Dataset + + Args: + manifest_path (str): manifest josn file path + max_input_len ([type], optional): maximum output seq length, + in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). + min_input_len (float, optional): minimum input seq length, + in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. + max_output_len (float, optional): maximum input seq length, + in modeling units. Defaults to 500.0. + min_output_len (float, optional): minimum input seq length, + in modeling units. Defaults to 0.0. + max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. + Defaults to 10.0. + min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. + Defaults to 0.05. + + """ + super().__init__() + + # read manifest + self._manifest = read_manifest( + manifest_path=manifest_path, + max_input_len=max_input_len, + min_input_len=min_input_len, + max_output_len=max_output_len, + min_output_len=min_output_len, + max_output_input_ratio=max_output_input_ratio, + min_output_input_ratio=min_output_input_ratio) + self._manifest.sort(key=lambda x: x["feat_shape"][0]) + + def __len__(self): + return len(self._manifest) + + def __getitem__(self, idx): + instance = self._manifest[idx] + return instance["utt"], instance["feat"], instance["text"] + + +class TripletManifestDataset(ManifestDataset): + """ + For Joint Training of Speech Translation and ASR. + text: translation, + text1: transcript. + """ + + def __getitem__(self, idx): + instance = self._manifest[idx] + return instance["utt"], instance["feat"], instance["text"], instance[ + "text1"] + + +class TransformDataset(Dataset): + """Transform Dataset. + + Args: + data: list object from make_batchset + converter: batch function + reader: read data + """ + + def __init__(self, data, converter, reader): + """Init function.""" + super().__init__() + self.data = data + self.converter = converter + self.reader = reader + + def __len__(self): + """Len function.""" + return len(self.data) + + def __getitem__(self, idx): + """[] operator.""" + return self.converter([self.reader(self.data[idx], return_uttid=True)]) diff --git a/examples/transv1.8to2.x/deepspeech/io/reader.py b/examples/transv1.8to2.x/deepspeech/io/reader.py new file mode 100644 index 00000000..95cdbb95 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/io/reader.py @@ -0,0 +1,410 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import io
+import os
+from collections import OrderedDict
+
+import h5py
+import kaldiio
+import numpy as np
+import soundfile
+
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.utils.log import Log
+
+__all__ = ["LoadInputsAndTargets"]
+
+logger = Log(__name__).getlog()
+
+
+class LoadInputsAndTargets():
+    """Create a mini-batch from a list of dicts
+
+    >>> batch = [('utt1',
+    ...           dict(input=[dict(feat='some.ark:123',
+    ...                            filetype='mat',
+    ...                            name='input1',
+    ...                            shape=[100, 80])],
+    ...                output=[dict(tokenid='1 2 3 4',
+    ...                             name='target1',
+    ...                             shape=[4, 31])]))]
+    >>> l = LoadInputsAndTargets()
+    >>> feat, target = l(batch)
+
+    :param: str mode: Specify the task mode, "asr" or "tts"
+    :param: str preprocess_conf: The path of a json file for pre-processing
+    :param: bool load_input: If False, not to load the input data
+    :param: bool load_output: If False, not to load the output data
+    :param: bool sort_in_input_length: Sort the mini-batch in descending order
+        of the input length
+    :param: bool use_speaker_embedding: Used for tts mode only
+    :param: bool use_second_target: Used for tts mode only
+    :param: dict preprocess_args: Set some optional arguments for preprocessing
+    :param: bool keep_all_data_on_mem: If True, cache loaded arrays in memory
+    """
+
+    def __init__(
+            self,
+            mode="asr",
+            preprocess_conf=None,
+            load_input=True,
+            load_output=True,
+            sort_in_input_length=True,
+            preprocess_args=None,
+            keep_all_data_on_mem=False, ):
+        self._loaders = {}
+
+        if mode not in ["asr"]:
+            raise ValueError("Only 'asr' is allowed: mode={}".format(mode))
+
+        if preprocess_conf is not None:
+            with open(preprocess_conf, 'r') as fin:
+                self.preprocessing = AugmentationPipeline(fin.read())
+            logger.warning(
+                "[Experimental feature] Some preprocessing will be done "
+                "for the mini-batch creation using {}".format(
+                    self.preprocessing))
+        else:
+            # If no conf is given, this class doesn't touch anything.
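+            # (For reference: when a conf is given, it is expected to be a
+            #  JSON list of augmentor specs consumed by AugmentationPipeline;
+            #  an illustrative payload:
+            #    [{"type": "shift",
+            #      "params": {"min_shift_ms": -5, "max_shift_ms": 5},
+            #      "prob": 1.0}]
+            #  The exact params depend on the augmentor type.)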
+ self.preprocessing = None + + self.mode = mode + self.load_output = load_output + self.load_input = load_input + self.sort_in_input_length = sort_in_input_length + if preprocess_args is None: + self.preprocess_args = {} + else: + assert isinstance(preprocess_args, dict), type(preprocess_args) + self.preprocess_args = dict(preprocess_args) + + self.keep_all_data_on_mem = keep_all_data_on_mem + + def __call__(self, batch, return_uttid=False): + """Function to load inputs and targets from list of dicts + + :param List[Tuple[str, dict]] batch: list of dict which is subset of + loaded data.json + :param bool return_uttid: return utterance ID information for visualization + :return: list of input token id sequences [(L_1), (L_2), ..., (L_B)] + :return: list of input feature sequences + [(T_1, D), (T_2, D), ..., (T_B, D)] + :rtype: list of float ndarray + :return: list of target token id sequences [(L_1), (L_2), ..., (L_B)] + :rtype: list of int ndarray + + """ + x_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]] + y_feats_dict = OrderedDict() # OrderedDict[str, List[np.ndarray]] + uttid_list = [] # List[str] + + for uttid, info in batch: + uttid_list.append(uttid) + + if self.load_input: + # Note(kamo): This for-loop is for multiple inputs + for idx, inp in enumerate(info["input"]): + # {"input": + # [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # "name": "input1", ...}], ...} + x = self._get_from_loader( + filepath=inp["feat"], + filetype=inp.get("filetype", "mat")) + x_feats_dict.setdefault(inp["name"], []).append(x) + + if self.load_output: + for idx, inp in enumerate(info["output"]): + if "tokenid" in inp: + # ======= Legacy format for output ======= + # {"output": [{"tokenid": "1 2 3 4"}]) + x = np.fromiter( + map(int, inp["tokenid"].split()), dtype=np.int64) + else: + # ======= New format ======= + # {"input": + # [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # "name": "target1", ...}], ...} + x = self._get_from_loader( + filepath=inp["feat"], + filetype=inp.get("filetype", "mat")) + + y_feats_dict.setdefault(inp["name"], []).append(x) + + if self.mode == "asr": + return_batch, uttid_list = self._create_batch_asr( + x_feats_dict, y_feats_dict, uttid_list) + else: + raise NotImplementedError(self.mode) + + if self.preprocessing is not None: + # Apply pre-processing all input features + for x_name in return_batch.keys(): + if x_name.startswith("input"): + return_batch[x_name] = self.preprocessing( + return_batch[x_name], uttid_list, + **self.preprocess_args) + + if return_uttid: + return tuple(return_batch.values()), uttid_list + + # Doesn't return the names now. + return tuple(return_batch.values()) + + def _create_batch_asr(self, x_feats_dict, y_feats_dict, uttid_list): + """Create a OrderedDict for the mini-batch + + :param OrderedDict x_feats_dict: + e.g. {"input1": [ndarray, ndarray, ...], + "input2": [ndarray, ndarray, ...]} + :param OrderedDict y_feats_dict: + e.g. 
{"target1": [ndarray, ndarray, ...], + "target2": [ndarray, ndarray, ...]} + :param: List[str] uttid_list: + Give uttid_list to sort in the same order as the mini-batch + :return: batch, uttid_list + :rtype: Tuple[OrderedDict, List[str]] + """ + # handle single-input and multi-input (paralell) asr mode + xs = list(x_feats_dict.values()) + + if self.load_output: + ys = list(y_feats_dict.values()) + assert len(xs[0]) == len(ys[0]), (len(xs[0]), len(ys[0])) + + # get index of non-zero length samples + nonzero_idx = list( + filter(lambda i: len(ys[0][i]) > 0, range(len(ys[0])))) + for n in range(1, len(y_feats_dict)): + nonzero_idx = filter(lambda i: len(ys[n][i]) > 0, nonzero_idx) + else: + # Note(kamo): Be careful not to make nonzero_idx to a generator + nonzero_idx = list(range(len(xs[0]))) + + if self.sort_in_input_length: + # sort in input lengths based on the first input + nonzero_sorted_idx = sorted( + nonzero_idx, key=lambda i: -len(xs[0][i])) + else: + nonzero_sorted_idx = nonzero_idx + + if len(nonzero_sorted_idx) != len(xs[0]): + logger.warning( + "Target sequences include empty tokenid (batch {} -> {}).". + format(len(xs[0]), len(nonzero_sorted_idx))) + + # remove zero-length samples + xs = [[x[i] for i in nonzero_sorted_idx] for x in xs] + uttid_list = [uttid_list[i] for i in nonzero_sorted_idx] + + x_names = list(x_feats_dict.keys()) + if self.load_output: + ys = [[y[i] for i in nonzero_sorted_idx] for y in ys] + y_names = list(y_feats_dict.keys()) + + # Keeping x_name and y_name, e.g. input1, for future extension + return_batch = OrderedDict([ + * [(x_name, x) for x_name, x in zip(x_names, xs)], + * [(y_name, y) for y_name, y in zip(y_names, ys)], + ]) + else: + return_batch = OrderedDict( + [(x_name, x) for x_name, x in zip(x_names, xs)]) + return return_batch, uttid_list + + def _get_from_loader(self, filepath, filetype): + """Return ndarray + + In order to make the fds to be opened only at the first referring, + the loader are stored in self._loaders + + >>> ndarray = loader.get_from_loader( + ... 'some/path.h5:F01_050C0101_PED_REAL', filetype='hdf5') + + :param: str filepath: + :param: str filetype: + :return: + :rtype: np.ndarray + """ + if filetype == "hdf5": + # e.g. + # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "hdf5", + # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL" + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = h5py.File(filepath, "r") + self._loaders[filepath] = loader + return loader[key][()] + elif filetype == "sound.hdf5": + # e.g. + # {"input": [{"feat": "some/path.h5:F01_050C0101_PED_REAL", + # "filetype": "sound.hdf5", + # -> filepath = "some/path.h5", key = "F01_050C0101_PED_REAL" + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = SoundHDF5File(filepath, "r", dtype="int16") + self._loaders[filepath] = loader + array, rate = loader[key] + return array + elif filetype == "sound": + # e.g. + # {"input": [{"feat": "some/path.wav", + # "filetype": "sound"}, + # Assume PCM16 + if not self.keep_all_data_on_mem: + array, _ = soundfile.read(filepath, dtype="int16") + return array + if filepath not in self._loaders: + array, _ = soundfile.read(filepath, dtype="int16") + self._loaders[filepath] = array + return self._loaders[filepath] + elif filetype == "npz": + # e.g. 
+ # {"input": [{"feat": "some/path.npz:F01_050C0101_PED_REAL", + # "filetype": "npz", + filepath, key = filepath.split(":", 1) + + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = np.load(filepath) + self._loaders[filepath] = loader + return loader[key] + elif filetype == "npy": + # e.g. + # {"input": [{"feat": "some/path.npy", + # "filetype": "npy"}, + if not self.keep_all_data_on_mem: + return np.load(filepath) + if filepath not in self._loaders: + self._loaders[filepath] = np.load(filepath) + return self._loaders[filepath] + elif filetype in ["mat", "vec"]: + # e.g. + # {"input": [{"feat": "some/path.ark:123", + # "filetype": "mat"}]}, + # In this case, "123" indicates the starting points of the matrix + # load_mat can load both matrix and vector + if not self.keep_all_data_on_mem: + return kaldiio.load_mat(filepath) + if filepath not in self._loaders: + self._loaders[filepath] = kaldiio.load_mat(filepath) + return self._loaders[filepath] + elif filetype == "scp": + # e.g. + # {"input": [{"feat": "some/path.scp:F01_050C0101_PED_REAL", + # "filetype": "scp", + filepath, key = filepath.split(":", 1) + loader = self._loaders.get(filepath) + if loader is None: + # To avoid disk access, create loader only for the first time + loader = kaldiio.load_scp(filepath) + self._loaders[filepath] = loader + return loader[key] + else: + raise NotImplementedError( + "Not supported: loader_type={}".format(filetype)) + + +class SoundHDF5File(): + """Collecting sound files to a HDF5 file + + >>> f = SoundHDF5File('a.flac.h5', mode='a') + >>> array = np.random.randint(0, 100, 100, dtype=np.int16) + >>> f['id'] = (array, 16000) + >>> array, rate = f['id'] + + + :param: str filepath: + :param: str mode: + :param: str format: The type used when saving wav. flac, nist, htk, etc. 
+    :param: str dtype:
+
+    """
+
+    def __init__(self,
+                 filepath,
+                 mode="r+",
+                 format=None,
+                 dtype="int16",
+                 **kwargs):
+        self.filepath = filepath
+        self.mode = mode
+        self.dtype = dtype
+
+        self.file = h5py.File(filepath, mode, **kwargs)
+        if format is None:
+            # filepath = a.flac.h5 -> format = flac
+            second_ext = os.path.splitext(os.path.splitext(filepath)[0])[1]
+            format = second_ext[1:]
+            if format.upper() not in soundfile.available_formats():
+                # If not found, flac is selected
+                format = "flac"
+
+        # This format affects only saving
+        self.format = format
+
+    def __repr__(self):
+        return '<SoundHDF5 file "{}" (mode {}, format {}, type {})>'.format(
+            self.filepath, self.mode, self.format, self.dtype)
+
+    def create_dataset(self, name, shape=None, data=None, **kwds):
+        f = io.BytesIO()
+        array, rate = data
+        soundfile.write(f, array, rate, format=self.format)
+        self.file.create_dataset(
+            name, shape=shape, data=np.void(f.getvalue()), **kwds)
+
+    def __setitem__(self, name, data):
+        self.create_dataset(name, data=data)
+
+    def __getitem__(self, key):
+        data = self.file[key][()]
+        f = io.BytesIO(data.tobytes())
+        array, rate = soundfile.read(f, dtype=self.dtype)
+        return array, rate
+
+    def keys(self):
+        return self.file.keys()
+
+    def values(self):
+        for k in self.file:
+            yield self[k]
+
+    def items(self):
+        for k in self.file:
+            yield k, self[k]
+
+    def __iter__(self):
+        return iter(self.file)
+
+    def __contains__(self, item):
+        return item in self.file
+
+    def __len__(self):
+        return len(self.file)
+
+    def __enter__(self):
+        return self
+
+    def __exit__(self, exc_type, exc_val, exc_tb):
+        self.file.close()
+
+    def close(self):
+        self.file.close()
diff --git a/examples/transv1.8to2.x/deepspeech/io/sampler.py b/examples/transv1.8to2.x/deepspeech/io/sampler.py
new file mode 100644
index 00000000..763a3781
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/io/sampler.py
@@ -0,0 +1,251 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import numpy as np
+from paddle import distributed as dist
+from paddle.io import BatchSampler
+from paddle.io import DistributedBatchSampler
+
+from deepspeech.utils.log import Log
+
+__all__ = [
+    "SortagradDistributedBatchSampler",
+    "SortagradBatchSampler",
+]
+
+logger = Log(__name__).getlog()
+
+
+def _batch_shuffle(indices, batch_size, epoch, clipped=False):
+    """Put similarly-sized instances into minibatches for better efficiency
+    and make a batch-wise shuffle.
+
+    1. Sort the audio clips by duration.
+    2. Generate a random number `k`, k in [0, batch_size).
+    3. Randomly shift `k` instances in order to create different batches
+       for different epochs. Create minibatches.
+    4. Shuffle the minibatches.
+
+    :param indices: indexes. List of int.
+    :type indices: list
+    :param batch_size: Batch size. This size is also used to generate
+        a random number for batch shuffle.
+    :type batch_size: int
+    :param clipped: Whether to clip the heading (small shift) and trailing
+        (incomplete batch) instances.
+    :type clipped: bool
+    :return: Batch shuffled manifest.
+    :rtype: list
+    """
+    rng = np.random.RandomState(epoch)
+    shift_len = rng.randint(0, batch_size - 1)
+    batch_indices = list(zip(* [iter(indices[shift_len:])] * batch_size))
+    rng.shuffle(batch_indices)
+    batch_indices = [item for batch in batch_indices for item in batch]
+    assert clipped is False
+    if not clipped:
+        res_len = len(indices) - shift_len - len(batch_indices)
+        # when res_len is 0, will return whole list, len(List[-0:]) = len(List[:])
+        if res_len != 0:
+            batch_indices.extend(indices[-res_len:])
+        batch_indices.extend(indices[0:shift_len])
+        assert len(indices) == len(
+            batch_indices
+        ), f"_batch_shuffle: {len(indices)} : {len(batch_indices)} : {res_len} - {shift_len}"
+    return batch_indices
+
+
+class SortagradDistributedBatchSampler(DistributedBatchSampler):
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 num_replicas=None,
+                 rank=None,
+                 shuffle=False,
+                 drop_last=False,
+                 sortagrad=False,
+                 shuffle_method="batch_shuffle"):
+        """Sortagrad Sampler for multi gpus.
+
+        Args:
+            dataset (paddle.io.Dataset):
+            batch_size (int): batch size for one gpu
+            num_replicas (int, optional): world size or number of gpus. Defaults to None.
+            rank (int, optional): rank id. Defaults to None.
+            shuffle (bool, optional): whether to shuffle. Defaults to False.
+            drop_last (bool, optional): whether to drop the last batch when it is smaller than batch size. Defaults to False.
+            sortagrad (bool, optional): if True, sort by length (sortagrad) in the first epoch, then shuffle as usual. Defaults to False.
+            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
+        """
+        super().__init__(dataset, batch_size, num_replicas, rank, shuffle,
+                         drop_last)
+        self._sortagrad = sortagrad
+        self._shuffle_method = shuffle_method
+
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # sort (by duration) or batch-wise shuffle the manifest
+        if self.shuffle:
+            if self.epoch == 0 and self._sortagrad:
+                logger.info(
+                    f'rank: {dist.get_rank()} dataset sortagrad! epoch {self.epoch}'
+                )
+            else:
+                logger.info(
+                    f'rank: {dist.get_rank()} dataset shuffle! epoch {self.epoch}'
+                )
+                if self._shuffle_method == "batch_shuffle":
+                    # Shuffle with `batch_size * nranks`; otherwise the loss can
+                    # become unstable (NaN or Inf gradients), since mismatched
+                    # example lengths across ranks destabilize the loss,
+                    # e.g. rank0 max length 20, rank3 max length 1000.
+                    indices = _batch_shuffle(
+                        indices,
+                        self.batch_size * self.nranks,
+                        self.epoch,
+                        clipped=False)
+                elif self._shuffle_method == "instance_shuffle":
+                    np.random.RandomState(self.epoch).shuffle(indices)
+                else:
+                    raise ValueError("Unknown shuffle method %s." %
+                                     self._shuffle_method)
+            assert len(
+                indices
+            ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
+
+        # slice `self.batch_size` examples by rank id
+        def _get_indices_by_batch_size(indices):
+            subsampled_indices = []
+            last_batch_size = self.total_size % (self.batch_size * self.nranks)
+            assert last_batch_size % self.nranks == 0
+            last_local_batch_size = last_batch_size // self.nranks
+
+            for i in range(self.local_rank * self.batch_size,
+                           len(indices) - last_batch_size,
+                           self.batch_size * self.nranks):
+                subsampled_indices.extend(indices[i:i + self.batch_size])
+
+            indices = indices[len(indices) - last_batch_size:]
+            subsampled_indices.extend(
+                indices[self.local_rank * last_local_batch_size:(
+                    self.local_rank + 1) * last_local_batch_size])
+            return subsampled_indices
+
+        if self.nranks > 1:
+            indices = _get_indices_by_batch_size(indices)
+
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
+
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                logger.debug(
+                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+
+    def __len__(self):
+        num_samples = self.num_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
+
+
+class SortagradBatchSampler(BatchSampler):
+    def __init__(self,
+                 dataset,
+                 batch_size,
+                 shuffle=False,
+                 drop_last=False,
+                 sortagrad=False,
+                 shuffle_method="batch_shuffle"):
+        """Sortagrad Sampler for one gpu.
+
+        Args:
+            dataset (paddle.io.Dataset):
+            batch_size (int): batch size for one gpu
+            shuffle (bool, optional): whether to shuffle. Defaults to False.
+            drop_last (bool, optional): whether to drop the last batch when it is smaller than batch size. Defaults to False.
+            sortagrad (bool, optional): if True, sort by length (sortagrad) in the first epoch, then shuffle as usual. Defaults to False.
+            shuffle_method (str, optional): shuffle method, "instance_shuffle" or "batch_shuffle". Defaults to "batch_shuffle".
+        """
+        self.dataset = dataset
+
+        assert isinstance(batch_size, int) and batch_size > 0, \
+            "batch_size should be a positive integer"
+        self.batch_size = batch_size
+        assert isinstance(shuffle, bool), \
+            "shuffle should be a boolean value"
+        self.shuffle = shuffle
+        assert isinstance(drop_last, bool), \
+            "drop_last should be a boolean value"
+
+        self.drop_last = drop_last
+        self.epoch = 0
+        self.num_samples = int(math.ceil(len(self.dataset) * 1.0))
+        self.total_size = self.num_samples
+        self._sortagrad = sortagrad
+        self._shuffle_method = shuffle_method
+
+    def __iter__(self):
+        num_samples = len(self.dataset)
+        indices = np.arange(num_samples).tolist()
+        indices += indices[:(self.total_size - len(indices))]
+        assert len(indices) == self.total_size
+
+        # sort (by duration) or batch-wise shuffle the manifest
+        if self.shuffle:
+            if self.epoch == 0 and self._sortagrad:
+                logger.info(f'dataset sortagrad! epoch {self.epoch}')
+            else:
+                logger.info(f'dataset shuffle! epoch {self.epoch}')
+                if self._shuffle_method == "batch_shuffle":
+                    indices = _batch_shuffle(
+                        indices, self.batch_size, self.epoch, clipped=False)
+                elif self._shuffle_method == "instance_shuffle":
+                    np.random.RandomState(self.epoch).shuffle(indices)
+                else:
+                    raise ValueError("Unknown shuffle method %s." %
+                                     self._shuffle_method)
+            assert len(
+                indices
+            ) == self.total_size, f"batch shuffle examples error: {len(indices)} : {self.total_size}"
+
+        assert len(indices) == self.num_samples
+        _sample_iter = iter(indices)
+
+        batch_indices = []
+        for idx in _sample_iter:
+            batch_indices.append(idx)
+            if len(batch_indices) == self.batch_size:
+                logger.debug(
+                    f"rank: {dist.get_rank()} batch index: {batch_indices} ")
+                yield batch_indices
+                batch_indices = []
+        if not self.drop_last and len(batch_indices) > 0:
+            yield batch_indices
+
+        self.epoch += 1
+
+    def __len__(self):
+        num_samples = self.num_samples
+        num_samples += int(not self.drop_last) * (self.batch_size - 1)
+        return num_samples // self.batch_size
diff --git a/examples/transv1.8to2.x/deepspeech/io/utility.py b/examples/transv1.8to2.x/deepspeech/io/utility.py
new file mode 100644
index 00000000..99487a0a
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/io/utility.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+
+from deepspeech.utils.log import Log
+
+__all__ = ["pad_list", "pad_sequence"]
+
+logger = Log(__name__).getlog()
+
+
+def pad_list(sequences: List[np.ndarray],
+             padding_value: float=0.0) -> np.ndarray:
+    return pad_sequence(sequences, True, padding_value)
+
+
+def pad_sequence(sequences: List[np.ndarray],
+                 batch_first: bool=True,
+                 padding_value: float=0.0) -> np.ndarray:
+    r"""Pad a list of variable length Tensors with ``padding_value``
+
+    ``pad_sequence`` stacks a list of Tensors along a new dimension,
+    and pads them to equal length. For example, if the input is a list of
+    sequences with size ``L x *``, the output is of size ``T x B x *`` if
+    batch_first is False, and ``B x T x *`` otherwise.
+
+    `B` is the batch size. It is equal to the number of elements in ``sequences``.
+    `T` is the length of the longest sequence.
+    `L` is the length of a sequence.
+    `*` is any number of trailing dimensions, including none.
+
+    Example:
+        >>> a = np.ones([25, 300])
+        >>> b = np.ones([22, 300])
+        >>> c = np.ones([15, 300])
+        >>> pad_sequence([a, b, c]).shape
+        [3, 25, 300]
+
+    Note:
+        This function returns a np.ndarray of size ``T x B x *`` or ``B x T x *``
+        where `T` is the length of the longest sequence. This function assumes
+        trailing dimensions and type of all the Tensors in sequences are same.
+
+    Args:
+        sequences (list[np.ndarray]): list of variable length sequences.
+        batch_first (bool, optional): output will be in ``B x T x *`` if True, or in
+            ``T x B x *`` otherwise
+        padding_value (float, optional): value for padded elements. Default: 0.
+
+    Returns:
+        np.ndarray of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+ np.ndarray of size ``B x T x *`` otherwise + """ + + # assuming trailing dimensions and type of all the Tensors + # in sequences are same and fetching those from sequences[0] + max_size = sequences[0].shape + trailing_dims = max_size[1:] + max_len = max([s.shape[0] for s in sequences]) + if batch_first: + out_dims = (len(sequences), max_len) + trailing_dims + else: + out_dims = (max_len, len(sequences)) + trailing_dims + + out_tensor = np.full(out_dims, padding_value, dtype=sequences[0].dtype) + for i, tensor in enumerate(sequences): + length = tensor.shape[0] + # use index notation to prevent duplicate references to the tensor + if batch_first: + out_tensor[i, :length, ...] = tensor + else: + out_tensor[:length, i, ...] = tensor + + return out_tensor diff --git a/examples/transv1.8to2.x/deepspeech/models/__init__.py b/examples/transv1.8to2.x/deepspeech/models/__init__.py new file mode 100644 index 00000000..185a92b8 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/models/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/examples/transv1.8to2.x/deepspeech/models/ds2/__init__.py b/examples/transv1.8to2.x/deepspeech/models/ds2/__init__.py new file mode 100644 index 00000000..39bea5bf --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/models/ds2/__init__.py @@ -0,0 +1,17 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .deepspeech2 import DeepSpeech2InferModel +from .deepspeech2 import DeepSpeech2Model + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] diff --git a/examples/transv1.8to2.x/deepspeech/models/ds2/conv.py b/examples/transv1.8to2.x/deepspeech/models/ds2/conv.py new file mode 100644 index 00000000..f76a1e58 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/models/ds2/conv.py @@ -0,0 +1,165 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from paddle import nn +from paddle.nn import functional as F + +from deepspeech.modules.activation import brelu +from deepspeech.modules.mask import make_non_pad_mask +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['ConvStack', "conv_output_size"] + + +def conv_output_size(I, F, P, S): + # https://stanford.edu/~shervine/teaching/cs-230/cheatsheet-convolutional-neural-networks#hyperparameters + # Output size after Conv: + # By noting I the length of the input volume size, + # F the length of the filter, + # P the amount of zero padding, + # S the stride, + # then the output size O of the feature map along that dimension is given by: + # O = (I - F + Pstart + Pend) // S + 1 + # When Pstart == Pend == P, we can replace Pstart + Pend by 2P. + # When Pstart == Pend == 0 + # O = (I - F - S) // S + # https://iq.opengenus.org/output-size-of-convolution/ + # Output height = (Input height + padding height top + padding height bottom - kernel height) / (stride height) + 1 + # Output width = (Output width + padding width right + padding width left - kernel width) / (stride width) + 1 + return (I - F + 2 * P - S) // S + + +class ConvBn(nn.Layer): + """Convolution layer with batch normalization. + + :param kernel_size: The x dimension of a filter kernel. Or input a tuple for + two image dimension. + :type kernel_size: int|tuple|list + :param num_channels_in: Number of input channels. + :type num_channels_in: int + :param num_channels_out: Number of output channels. + :type num_channels_out: int + :param stride: The x dimension of the stride. Or input a tuple for two + image dimension. + :type stride: int|tuple|list + :param padding: The x dimension of the padding. Or input a tuple for two + image dimension. + :type padding: int|tuple|list + :param act: Activation type, relu|brelu + :type act: string + :return: Batch norm layer after convolution layer. + :rtype: Variable + + """ + + def __init__(self, num_channels_in, num_channels_out, kernel_size, stride, + padding, act): + + super().__init__() + assert len(kernel_size) == 2 + assert len(stride) == 2 + assert len(padding) == 2 + self.kernel_size = kernel_size + self.stride = stride + self.padding = padding + + self.conv = nn.Conv2D( + num_channels_in, + num_channels_out, + kernel_size=kernel_size, + stride=stride, + padding=padding, + weight_attr=None, + bias_attr=False, + data_format='NCHW') + + self.bn = nn.BatchNorm2D( + num_channels_out, + weight_attr=None, + bias_attr=None, + data_format='NCHW') + self.act = F.relu if act == 'relu' else brelu + + def forward(self, x, x_len): + """ + x(Tensor): audio, shape [B, C, D, T] + """ + x = self.conv(x) + x = self.bn(x) + x = self.act(x) + + x_len = (x_len - self.kernel_size[1] + 2 * self.padding[1] + ) // self.stride[1] + 1 + + # reset padding part to 0 + masks = make_non_pad_mask(x_len) #[B, T] + masks = masks.unsqueeze(1).unsqueeze(1) # [B, 1, 1, T] + # TODO(Hui Zhang): not support bool multiply + # masks = masks.type_as(x) + masks = masks.astype(x.dtype) + x = x.multiply(masks) + + return x, x_len + + +class ConvStack(nn.Layer): + """Convolution group with stacked convolution layers. + + :param feat_size: audio feature dim. + :type feat_size: int + :param num_stacks: Number of stacked convolution layers. 
+ :type num_stacks: int + """ + + def __init__(self, feat_size, num_stacks): + super().__init__() + self.feat_size = feat_size # D + self.num_stacks = num_stacks + + self.conv_in = ConvBn( + num_channels_in=1, + num_channels_out=32, + kernel_size=(41, 11), #[D, T] + stride=(2, 3), + padding=(20, 5), + act='brelu') + + out_channel = 32 + convs = [ + ConvBn( + num_channels_in=32, + num_channels_out=out_channel, + kernel_size=(21, 11), + stride=(2, 1), + padding=(10, 5), + act='brelu') for i in range(num_stacks - 1) + ] + self.conv_stack = nn.LayerList(convs) + + # conv output feat_dim + output_height = (feat_size - 1) // 2 + 1 + for i in range(self.num_stacks - 1): + output_height = (output_height - 1) // 2 + 1 + self.output_height = out_channel * output_height + + def forward(self, x, x_len): + """ + x: shape [B, C, D, T] + x_len : shape [B] + """ + x, x_len = self.conv_in(x, x_len) + for i, conv in enumerate(self.conv_stack): + x, x_len = conv(x, x_len) + return x, x_len diff --git a/examples/transv1.8to2.x/deepspeech/models/ds2/deepspeech2.py b/examples/transv1.8to2.x/deepspeech/models/ds2/deepspeech2.py new file mode 100644 index 00000000..603a469a --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/models/ds2/deepspeech2.py @@ -0,0 +1,313 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
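+# Shape conventions used in this file (as implemented below): audio
+# features enter the encoder as [B, T, D], are reshaped to NCHW
+# ([B, 1, D, T]) for the conv stack, and leave as [B, T, D'] for the
+# CTC head.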
+"""Deepspeech2 ASR Model""" +from typing import Optional + +import paddle +from paddle import nn +from yacs.config import CfgNode + +from deepspeech.models.ds2.conv import ConvStack +from deepspeech.models.ds2.rnn import RNNStack +from deepspeech.modules.ctc import CTCDecoder +from deepspeech.utils import layer_tools +from deepspeech.utils.checkpoint import Checkpoint +from deepspeech.utils.log import Log +logger = Log(__name__).getlog() + +__all__ = ['DeepSpeech2Model', 'DeepSpeech2InferModel'] + + +class CRNNEncoder(nn.Layer): + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True): + super().__init__() + self.rnn_size = rnn_size + self.feat_size = feat_size # 161 for linear + self.dict_size = dict_size + + self.conv = ConvStack(feat_size, num_conv_layers) + + i_size = self.conv.output_height # H after conv stack + self.rnn = RNNStack( + i_size=i_size, + h_size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + + @property + def output_size(self): + return self.rnn_size * 2 + + def forward(self, audio, audio_len): + """Compute Encoder outputs + + Args: + audio (Tensor): [B, Tmax, D] + text (Tensor): [B, Umax] + audio_len (Tensor): [B] + text_len (Tensor): [B] + Returns: + x (Tensor): encoder outputs, [B, T, D] + x_lens (Tensor): encoder length, [B] + """ + # [B, T, D] -> [B, D, T] + audio = audio.transpose([0, 2, 1]) + # [B, D, T] -> [B, C=1, D, T] + x = audio.unsqueeze(1) + x_lens = audio_len + + # convolution group + x, x_lens = self.conv(x, x_lens) + x_val = x.numpy() + + # convert data from convolution feature map to sequence of vectors + #B, C, D, T = paddle.shape(x) # not work under jit + x = x.transpose([0, 3, 1, 2]) #[B, T, C, D] + #x = x.reshape([B, T, C * D]) #[B, T, C*D] # not work under jit + x = x.reshape([0, 0, -1]) #[B, T, C*D] + + # remove padding part + x, x_lens = self.rnn(x, x_lens) #[B, T, D] + return x, x_lens + + +class DeepSpeech2Model(nn.Layer): + """The DeepSpeech2 network structure. + + :param audio_data: Audio spectrogram data layer. + :type audio_data: Variable + :param text_data: Transcription text data layer. + :type text_data: Variable + :param audio_len: Valid sequence length data layer. + :type audio_len: Variable + :param masks: Masks data layer to reset padding. + :type masks: Variable + :param dict_size: Dictionary size for tokenized transcription. + :type dict_size: int + :param num_conv_layers: Number of stacking convolution layers. + :type num_conv_layers: int + :param num_rnn_layers: Number of stacking RNN layers. + :type num_rnn_layers: int + :param rnn_size: RNN layer size (dimension of RNN cells). + :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward direction RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: A tuple of an output unnormalized log probability layer ( + before softmax) and a ctc cost layer. + :rtype: tuple of LayerOutput + """ + + @classmethod + def params(cls, config: Optional[CfgNode]=None) -> CfgNode: + default = CfgNode( + dict( + num_conv_layers=2, #Number of stacking convolution layers. + num_rnn_layers=3, #Number of stacking RNN layers. + rnn_layer_size=1024, #RNN layer size (number of RNN cells). + use_gru=True, #Use gru if set True. Use simple rnn if set False. 
+ share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. + )) + if config is not None: + config.merge_from_other_cfg(default) + return default + + def __init__(self, + feat_size, + dict_size, + num_conv_layers=2, + num_rnn_layers=3, + rnn_size=1024, + use_gru=False, + share_rnn_weights=True, + blank_id=0): + super().__init__() + self.encoder = CRNNEncoder( + feat_size=feat_size, + dict_size=dict_size, + num_conv_layers=num_conv_layers, + num_rnn_layers=num_rnn_layers, + rnn_size=rnn_size, + use_gru=use_gru, + share_rnn_weights=share_rnn_weights) + assert (self.encoder.output_size == rnn_size * 2) + + self.decoder = CTCDecoder( + odim=dict_size, # is in vocab + enc_n_units=self.encoder.output_size, + blank_id=blank_id, # first token is + dropout_rate=0.0, + reduction=True, # sum + batch_average=True) # sum / batch_size + + def forward(self, audio, audio_len, text, text_len): + """Compute Model loss + + Args: + audio (Tenosr): [B, T, D] + audio_len (Tensor): [B] + text (Tensor): [B, U] + text_len (Tensor): [B] + + Returns: + loss (Tenosr): [1] + """ + eouts, eouts_len = self.encoder(audio, audio_len) + loss = self.decoder(eouts, eouts_len, text, text_len) + return loss + + @paddle.no_grad() + def decode(self, audio, audio_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes): + # init once + # decoders only accept string encoded in utf-8 + self.decoder.init_decode( + beam_alpha=beam_alpha, + beam_beta=beam_beta, + lang_model_path=lang_model_path, + vocab_list=vocab_list, + decoding_method=decoding_method) + + eouts, eouts_len = self.encoder(audio, audio_len) + probs = self.decoder.softmax(eouts) + + return self.decoder.decode_probs( + probs.numpy(), eouts_len, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, cutoff_prob, + cutoff_top_n, num_processes) + + def decode_probs_split(self, probs_split, vocab_list, decoding_method, + lang_model_path, beam_alpha, beam_beta, beam_size, + cutoff_prob, cutoff_top_n, num_processes): + self.decoder.init_decode( + beam_alpha=beam_alpha, + beam_beta=beam_beta, + lang_model_path=lang_model_path, + vocab_list=vocab_list, + decoding_method=decoding_method) + return self.decoder.decode_probs_split( + probs_split, vocab_list, decoding_method, lang_model_path, + beam_alpha, beam_beta, beam_size, cutoff_prob, cutoff_top_n, + num_processes) + + @classmethod + def from_pretrained(cls, dataloader, config, checkpoint_path): + """Build a DeepSpeech2Model model from a pretrained model. + Parameters + ---------- + dataloader: paddle.io.DataLoader + + config: yacs.config.CfgNode + model configs + + checkpoint_path: Path or str + the path of pretrained model checkpoint, without extension name + + Returns + ------- + DeepSpeech2Model + The model built from pretrained result. 
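+
+        Example (a sketch; the loader, config and checkpoint path are
+        illustrative):
+
+            model = DeepSpeech2Model.from_pretrained(
+                dev_loader, config, "checkpoints/step_final")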
+        """
+        model = cls(feat_size=dataloader.collate_fn.feature_size,
+                    dict_size=dataloader.collate_fn.vocab_size,
+                    num_conv_layers=config.model.num_conv_layers,
+                    num_rnn_layers=config.model.num_rnn_layers,
+                    rnn_size=config.model.rnn_layer_size,
+                    use_gru=config.model.use_gru,
+                    share_rnn_weights=config.model.share_rnn_weights)
+        infos = Checkpoint().load_parameters(
+            model, checkpoint_path=checkpoint_path)
+        logger.info(f"checkpoint info: {infos}")
+        layer_tools.summary(model)
+        return model
+
+    @classmethod
+    def from_config(cls, config):
+        """Build a DeepSpeech2Model from config.
+
+        Parameters
+        ----------
+        config: yacs.config.CfgNode
+            config.model
+
+        Returns
+        -------
+        DeepSpeech2Model
+            The model built from config.
+        """
+        model = cls(feat_size=config.feat_size,
+                    dict_size=config.dict_size,
+                    num_conv_layers=config.num_conv_layers,
+                    num_rnn_layers=config.num_rnn_layers,
+                    rnn_size=config.rnn_layer_size,
+                    use_gru=config.use_gru,
+                    share_rnn_weights=config.share_rnn_weights)
+        return model
+
+
+class DeepSpeech2InferModel(DeepSpeech2Model):
+    def __init__(self,
+                 feat_size,
+                 dict_size,
+                 num_conv_layers=2,
+                 num_rnn_layers=3,
+                 rnn_size=1024,
+                 use_gru=False,
+                 share_rnn_weights=True,
+                 blank_id=0):
+        super().__init__(
+            feat_size=feat_size,
+            dict_size=dict_size,
+            num_conv_layers=num_conv_layers,
+            num_rnn_layers=num_rnn_layers,
+            rnn_size=rnn_size,
+            use_gru=use_gru,
+            share_rnn_weights=share_rnn_weights,
+            blank_id=blank_id)
+
+    def forward(self, audio, audio_len):
+        """Forward used for model export.
+
+        Args:
+            audio (Tensor): [B, T, D]
+            audio_len (Tensor): [B]
+
+        Returns:
+            probs: probs after softmax
+        """
+        eouts, eouts_len = self.encoder(audio, audio_len)
+        probs = self.decoder.softmax(eouts)
+        return probs, eouts_len
+
+    def export(self):
+        static_model = paddle.jit.to_static(
+            self,
+            input_spec=[
+                paddle.static.InputSpec(
+                    shape=[None, None, self.encoder.feat_size],
+                    dtype='float32'),  # audio, [B,T,D]
+                paddle.static.InputSpec(shape=[None],
+                                        dtype='int64'),  # audio_length, [B]
+            ])
+        return static_model
diff --git a/examples/transv1.8to2.x/deepspeech/models/ds2/rnn.py b/examples/transv1.8to2.x/deepspeech/models/ds2/rnn.py
new file mode 100644
index 00000000..e45db7c0
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/models/ds2/rnn.py
@@ -0,0 +1,334 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+from deepspeech.modules.activation import brelu
+from deepspeech.modules.mask import make_non_pad_mask
+from deepspeech.utils.log import Log
+logger = Log(__name__).getlog()
+
+__all__ = ['RNNStack']
+
+
+class RNNCell(nn.RNNCellBase):
+    r"""
+    Elman RNN (SimpleRNN) cell. Given the inputs and previous states, it
+    computes the outputs and updates states.
+    The formula used is as follows:
+
+    .. math::
+        h_{t} & = act(x_{t} + b_{ih} + W_{hh}h_{t-1} + b_{hh})
+
+        y_{t} & = h_{t}
+
+    where :math:`act` is for :attr:`activation`.
+    """
+
+    def __init__(self,
+                 hidden_size: int,
+                 activation="tanh",
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super().__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_hh = self.create_parameter(
+            (hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = None
+        self.bias_hh = self.create_parameter(
+            (hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.hidden_size = hidden_size
+        if activation not in ["tanh", "relu", "brelu"]:
+            raise ValueError(
+                "activation for SimpleRNNCell should be tanh, relu or brelu, "
+                "but got {}".format(activation))
+        self.activation = activation
+        self._activation_fn = paddle.tanh \
+            if activation == "tanh" \
+            else F.relu
+        if activation == 'brelu':
+            self._activation_fn = brelu
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+        pre_h = states
+        i2h = inputs
+        if self.bias_ih is not None:
+            i2h += self.bias_ih
+        h2h = paddle.matmul(pre_h, self.weight_hh, transpose_y=True)
+        if self.bias_hh is not None:
+            h2h += self.bias_hh
+        h = self._activation_fn(i2h + h2h)
+        return h, h
+
+    @property
+    def state_shape(self):
+        return (self.hidden_size, )
+
+
+class GRUCell(nn.RNNCellBase):
+    r"""
+    Gated Recurrent Unit (GRU) RNN cell. Given the inputs and previous states,
+    it computes the outputs and updates states.
+    The formulas used are as follows:
+
+    .. math::
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}h_{t-1} + b_{hr})
+
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}h_{t-1} + b_{hz})
+
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}h_{t-1} + b_{hc}))
+
+        h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}
+
+        y_{t} & = h_{t}
+
+    where :math:`\sigma` is the sigmoid function, and * is the elementwise
+    multiplication operator.
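+
+    Example (illustrative; note this cell expects inputs that are already
+    projected to 3 * hidden_size, as BiGRUWithBN below does):
+
+        cell = GRUCell(input_size=12, hidden_size=4)
+        h, _ = cell(paddle.randn([2, 12]))  # h: [2, 4]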
+    """
+
+    def __init__(self,
+                 input_size: int,
+                 hidden_size: int,
+                 weight_ih_attr=None,
+                 weight_hh_attr=None,
+                 bias_ih_attr=None,
+                 bias_hh_attr=None,
+                 name=None):
+        super().__init__()
+        std = 1.0 / math.sqrt(hidden_size)
+        self.weight_hh = self.create_parameter(
+            (3 * hidden_size, hidden_size),
+            weight_hh_attr,
+            default_initializer=I.Uniform(-std, std))
+        self.bias_ih = None
+        self.bias_hh = self.create_parameter(
+            (3 * hidden_size, ),
+            bias_hh_attr,
+            is_bias=True,
+            default_initializer=I.Uniform(-std, std))
+
+        self.hidden_size = hidden_size
+        self.input_size = input_size
+        self._gate_activation = F.sigmoid
+        # the v1.8 DS2 GRU uses relu (not tanh) for the candidate activation
+        self._activation = F.relu
+
+    def forward(self, inputs, states=None):
+        if states is None:
+            states = self.get_initial_states(inputs, self.state_shape)
+
+        pre_hidden = states  # shape [batch_size, hidden_size]
+
+        x_gates = inputs
+        if self.bias_ih is not None:
+            x_gates = x_gates + self.bias_ih
+        bias_u, bias_r, bias_c = paddle.split(
+            self.bias_hh, num_or_sections=3, axis=0)
+
+        # weight_hh stores [update; reset; candidate] rows to match the v1.8
+        # fluid dynamic_gru parameter layout; split it back into w_u, w_r and
+        # w_c of shape [hidden_size, hidden_size] each.
+        weight_hh = paddle.transpose(
+            self.weight_hh,
+            perm=[1, 0])  #weight_hh: shape [hidden_size, 3 * hidden_size]
+        w_u_r_c = paddle.flatten(weight_hh)
+        size_u_r = self.hidden_size * 2 * self.hidden_size
+        w_u_r = paddle.reshape(w_u_r_c[:size_u_r],
+                               (self.hidden_size, self.hidden_size * 2))
+        w_u, w_r = paddle.split(w_u_r, num_or_sections=2, axis=1)
+        w_c = paddle.reshape(w_u_r_c[size_u_r:],
+                             (self.hidden_size, self.hidden_size))
+
+        h_u = paddle.matmul(
+            pre_hidden, w_u,
+            transpose_y=False) + bias_u  #shape [batch_size, hidden_size]
+        h_r = paddle.matmul(
+            pre_hidden, w_r,
+            transpose_y=False) + bias_r  #shape [batch_size, hidden_size]
+
+        x_u, x_r, x_c = paddle.split(
+            x_gates, num_or_sections=3, axis=1)  #shape [batch_size, hidden_size]
+
+        u = self._gate_activation(x_u + h_u)  #shape [batch_size, hidden_size]
+        r = self._gate_activation(x_r + h_r)  #shape [batch_size, hidden_size]
+        c = self._activation(
+            x_c + paddle.matmul(r * pre_hidden, w_c, transpose_y=False) +
+            bias_c)  # [batch_size, hidden_size]
+
+        h = (1 - u) * pre_hidden + u * c
+        # https://www.paddlepaddle.org.cn/documentation/docs/zh/api/paddle/fluid/layers/dynamic_gru_cn.html#dynamic-gru
+        return h, h
+
+    @property
+    def state_shape(self):
+        r"""
+        The `state_shape` of GRUCell is a shape `[hidden_size]` (-1 for batch
+        size would be automatically inserted into shape). The shape corresponds
+        to the shape of :math:`h_{t-1}`.
+        """
+        return (self.hidden_size, )
+
+
+class BiRNNWithBN(nn.Layer):
+    """Bidirectional simple rnn layer with sequence-wise batch normalization.
+    The batch normalization is only performed on input-state weights.
+
+    :param i_size: Dimension of the input.
+    :type i_size: int
+    :param h_size: Dimension of RNN cells.
+    :type h_size: int
+    :param share_weights: Whether to share input-hidden weights between
+                          forward and backward directional RNNs.
+    :type share_weights: bool
+    """
+
+    def __init__(self, i_size: int, h_size: int, share_weights: bool):
+        super().__init__()
+        self.share_weights = share_weights
+        if self.share_weights:
+            # input-hidden weights shared between bi-directional rnn.
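+            # Both directions reuse a single Linear and BatchNorm1D here
+            # (bw_fc/bw_bn are aliases of fw_fc/fw_bn), so only the
+            # recurrent weights differ between directions.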
+ self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + # batch norm is only performed on input-state projection + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = self.fw_fc + self.bw_bn = self.fw_bn + else: + self.fw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, h_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + h_size, bias_attr=None, data_format='NLC') + + self.fw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.bw_cell = RNNCell(hidden_size=h_size, activation='brelu') + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.bw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x: paddle.Tensor, x_len: paddle.Tensor): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class BiGRUWithBN(nn.Layer): + """Bidirectonal gru layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: Variable + :param size: Dimension of GRU cells. + :type size: int + :param act: Activation type. + :type act: string + :return: Bidirectional GRU layer. + :rtype: Variable + """ + + def __init__(self, i_size: int, h_size: int): + super().__init__() + hidden_size = h_size * 3 + + self.fw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.fw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + self.bw_fc = nn.Linear(i_size, hidden_size, bias_attr=False) + self.bw_bn = nn.BatchNorm1D( + hidden_size, bias_attr=None, data_format='NLC') + + self.fw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.bw_cell = GRUCell(input_size=hidden_size, hidden_size=h_size) + self.fw_rnn = nn.RNN( + self.fw_cell, is_reverse=False, time_major=False) #[B, T, D] + self.bw_rnn = nn.RNN( + self.bw_cell, is_reverse=True, time_major=False) #[B, T, D] + + def forward(self, x, x_len): + # x, shape [B, T, D] + fw_x = self.fw_bn(self.fw_fc(x)) + + bw_x = self.bw_bn(self.bw_fc(x)) + fw_x, _ = self.fw_rnn(inputs=fw_x, sequence_length=x_len) + bw_x, _ = self.bw_rnn(inputs=bw_x, sequence_length=x_len) + x = paddle.concat([fw_x, bw_x], axis=-1) + return x, x_len + + +class RNNStack(nn.Layer): + """RNN group with stacked bidirectional simple RNN or GRU layers. + + :param input: Input layer. + :type input: Variable + :param size: Dimension of RNN cells in each layer. + :type size: int + :param num_stacks: Number of stacked rnn layers. + :type num_stacks: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool + :param share_rnn_weights: Whether to share input-hidden weights between + forward and backward directional RNNs. + It is only available when use_gru=False. + :type share_weights: bool + :return: Output layer of the RNN group. 
+    :rtype: Variable
+    """
+
+    def __init__(self,
+                 i_size: int,
+                 h_size: int,
+                 num_stacks: int,
+                 use_gru: bool,
+                 share_rnn_weights: bool):
+        super().__init__()
+        rnn_stacks = []
+        for i in range(num_stacks):
+            if use_gru:
+                # Note: this GRU uses relu (not tanh) for the candidate
+                # activation, matching the v1.8 DS2 model; see GRUCell above.
+                rnn_stacks.append(BiGRUWithBN(i_size=i_size, h_size=h_size))
+            else:
+                rnn_stacks.append(
+                    BiRNNWithBN(
+                        i_size=i_size,
+                        h_size=h_size,
+                        share_weights=share_rnn_weights))
+            i_size = h_size * 2
+
+        self.rnn_stacks = nn.LayerList(rnn_stacks)
+
+    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
+        """
+        x: shape [B, T, D]
+        x_len: shape [B]
+        """
+        for i, rnn in enumerate(self.rnn_stacks):
+            x, x_len = rnn(x, x_len)
+            masks = make_non_pad_mask(x_len)  #[B, T]
+            masks = masks.unsqueeze(-1)  # [B, T, 1]
+            # TODO(Hui Zhang): not support bool multiply
+            masks = masks.astype(x.dtype)
+            x = x.multiply(masks)
+        return x, x_len
diff --git a/examples/transv1.8to2.x/deepspeech/modules/__init__.py b/examples/transv1.8to2.x/deepspeech/modules/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/modules/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/transv1.8to2.x/deepspeech/modules/activation.py b/examples/transv1.8to2.x/deepspeech/modules/activation.py
new file mode 100644
index 00000000..30132775
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/modules/activation.py
@@ -0,0 +1,145 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections import OrderedDict
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ["get_activation", "brelu", "LinearGLUBlock", "ConvGLUBlock"]
+
+
+def brelu(x, t_min=0.0, t_max=24.0, name=None):
+    # paddle.to_tensor is dygraph-only and can not work under JIT,
+    # so build the clip bounds with paddle.full instead.
+    t_min = paddle.full(shape=[1], fill_value=t_min, dtype='float32')
+    t_max = paddle.full(shape=[1], fill_value=t_max, dtype='float32')
+    return x.maximum(t_min).minimum(t_max)
+
+
+class LinearGLUBlock(nn.Layer):
+    """A linear Gated Linear Units (GLU) block."""
+
+    def __init__(self, idim: int):
+        """ GLU.
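+
+        The inner Linear doubles the feature dim and the GLU halves it
+        back: out = a * sigmoid(b), where [a, b] = split(fc(x), 2, axis=-1).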
+        Args:
+            idim (int): input and output dimension
+        """
+        super().__init__()
+        self.fc = nn.Linear(idim, idim * 2)
+
+    def forward(self, xs):
+        return F.glu(self.fc(xs), axis=-1)
+
+
+class GLU(nn.Layer):
+    """Gated Linear Units as a Layer, so it can live in nn.Sequential.
+
+    The gating is applied over the channel axis (axis=1), since
+    ConvGLUBlock below doubles the channel dimension before each GLU.
+    """
+
+    def __init__(self, axis: int=1):
+        super().__init__()
+        self.axis = axis
+
+    def forward(self, xs):
+        return F.glu(xs, axis=self.axis)
+
+
+class ConvGLUBlock(nn.Layer):
+    def __init__(self, kernel_size, in_ch, out_ch, bottleneck_dim=0,
+                 dropout=0.):
+        """A convolutional Gated Linear Units (GLU) block.
+
+        Args:
+            kernel_size (int): kernel size
+            in_ch (int): number of input channels
+            out_ch (int): number of output channels
+            bottleneck_dim (int): dimension of the bottleneck layers for computational efficiency. Defaults to 0.
+            dropout (float): dropout probability. Defaults to 0.0.
+        """
+
+        super().__init__()
+
+        self.conv_residual = None
+        if in_ch != out_ch:
+            self.conv_residual = nn.utils.weight_norm(
+                nn.Conv2D(
+                    in_channels=in_ch, out_channels=out_ch, kernel_size=(1, 1)),
+                name='weight',
+                dim=0)
+            self.dropout_residual = nn.Dropout(p=dropout)
+
+        # pad the time axis (top of H in NCHW) so the conv stays causal
+        self.pad_left = nn.Pad2D([0, 0, kernel_size - 1, 0], value=0.0)
+
+        layers = OrderedDict()
+        if bottleneck_dim == 0:
+            layers['conv'] = nn.utils.weight_norm(
+                nn.Conv2D(
+                    in_channels=in_ch,
+                    out_channels=out_ch * 2,
+                    kernel_size=(kernel_size, 1)),
+                name='weight',
+                dim=0)
+            # TODO(hirofumi0810): padding?
+            layers['dropout'] = nn.Dropout(p=dropout)
+            layers['glu'] = GLU()
+
+        elif bottleneck_dim > 0:
+            layers['conv_in'] = nn.utils.weight_norm(
+                nn.Conv2D(
+                    in_channels=in_ch,
+                    out_channels=bottleneck_dim,
+                    kernel_size=(1, 1)),
+                name='weight',
+                dim=0)
+            layers['dropout_in'] = nn.Dropout(p=dropout)
+            layers['conv_bottleneck'] = nn.utils.weight_norm(
+                nn.Conv2D(
+                    in_channels=bottleneck_dim,
+                    out_channels=bottleneck_dim,
+                    kernel_size=(kernel_size, 1)),
+                name='weight',
+                dim=0)
+            layers['dropout'] = nn.Dropout(p=dropout)
+            layers['glu'] = GLU()
+            layers['conv_out'] = nn.utils.weight_norm(
+                nn.Conv2D(
+                    in_channels=bottleneck_dim,
+                    out_channels=out_ch * 2,
+                    kernel_size=(1, 1)),
+                name='weight',
+                dim=0)
+            layers['dropout_out'] = nn.Dropout(p=dropout)
+
+        # paddle's Sequential takes (name, layer) pairs, not an OrderedDict
+        self.layers = nn.Sequential(*layers.items())
+
+    def forward(self, xs):
+        """Forward pass.
+        Args:
+            xs (FloatTensor): `[B, in_ch, T, feat_dim]`
+        Returns:
+            out (FloatTensor): `[B, out_ch, T, feat_dim]`
+        """
+        residual = xs
+        if self.conv_residual is not None:
+            residual = self.dropout_residual(self.conv_residual(residual))
+        xs = self.pad_left(xs)  # `[B, embed_dim, T+kernel-1, 1]`
+        xs = self.layers(xs)  # `[B, out_ch, T, 1]` after the GLU halves channels
+        xs = xs + residual
+        return xs
+
+
+def get_activation(act):
+    """Return activation function."""
+    if act == "brelu":
+        # brelu is a plain function, not a Layer class, so return it directly
+        return brelu
+
+    activation_funcs = {
+        "hardtanh": paddle.nn.Hardtanh,
+        "tanh": paddle.nn.Tanh,
+        "relu": paddle.nn.ReLU,
+        "selu": paddle.nn.SELU,
+        "swish": paddle.nn.Swish,
+        "gelu": paddle.nn.GELU,
+    }
+
+    return activation_funcs[act]()
diff --git a/examples/transv1.8to2.x/deepspeech/modules/cmvn.py b/examples/transv1.8to2.x/deepspeech/modules/cmvn.py
new file mode 100644
index 00000000..b98af052
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/modules/cmvn.py
@@ -0,0 +1,51 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['GlobalCMVN'] + + +class GlobalCMVN(nn.Layer): + def __init__(self, + mean: paddle.Tensor, + istd: paddle.Tensor, + norm_var: bool=True): + """ + Args: + mean (paddle.Tensor): mean stats + istd (paddle.Tensor): inverse std, std which is 1.0 / std + """ + super().__init__() + assert mean.shape == istd.shape + self.norm_var = norm_var + # The buffer can be accessed from this module using self.mean + self.register_buffer("mean", mean) + self.register_buffer("istd", istd) + + def forward(self, x: paddle.Tensor): + """ + Args: + x (paddle.Tensor): (batch, max_len, feat_dim) + Returns: + (paddle.Tensor): normalized feature + """ + x = x - self.mean + if self.norm_var: + x = x * self.istd + return x diff --git a/examples/transv1.8to2.x/deepspeech/modules/crf.py b/examples/transv1.8to2.x/deepspeech/modules/crf.py new file mode 100644 index 00000000..b6b481a0 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/modules/crf.py @@ -0,0 +1,370 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn + +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ['CRF'] + + +class CRF(nn.Layer): + """ + Linear-chain Conditional Random Field (CRF). + + Args: + nb_labels (int): number of labels in your tagset, including special symbols. + bos_tag_id (int): integer representing the beginning of sentence symbol in + your tagset. + eos_tag_id (int): integer representing the end of sentence symbol in your tagset. + pad_tag_id (int, optional): integer representing the pad symbol in your tagset. + If None, the model will treat the PAD as a normal tag. Otherwise, the model + will apply constraints for PAD transitions. + batch_first (bool): Whether the first dimension represents the batch dimension. 
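+
+    Example (illustrative; 5 tags, with ids 3 and 4 reserved for BOS/EOS):
+
+        crf = CRF(nb_labels=5, bos_tag_id=3, eos_tag_id=4)
+        emissions = paddle.randn([2, 6, 5])    # [B, T, nb_labels]
+        tags = paddle.zeros([2, 6], dtype='int64')
+        nll = crf(emissions, tags)             # training loss (scalar)
+        scores, paths = crf.decode(emissions)  # Viterbi decoding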
+    """
+
+    def __init__(self,
+                 nb_labels: int,
+                 bos_tag_id: int,
+                 eos_tag_id: int,
+                 pad_tag_id: int=None,
+                 batch_first: bool=True):
+        super().__init__()
+
+        self.nb_labels = nb_labels
+        self.BOS_TAG_ID = bos_tag_id
+        self.EOS_TAG_ID = eos_tag_id
+        self.PAD_TAG_ID = pad_tag_id
+        self.batch_first = batch_first
+
+        # initialize transitions from a random uniform distribution between -0.1 and 0.1
+        self.transitions = self.create_parameter(
+            [self.nb_labels, self.nb_labels],
+            default_initializer=nn.initializer.Uniform(-0.1, 0.1))
+        self.init_weights()
+
+    def init_weights(self):
+        # enforce constraints (rows=from, columns=to) with a big negative number
+        # so exp(-10000) will tend to zero
+
+        # no transitions allowed to the beginning of sentence
+        self.transitions[:, self.BOS_TAG_ID] = -10000.0
+        # no transitions allowed from the end of sentence
+        self.transitions[self.EOS_TAG_ID, :] = -10000.0
+
+        if self.PAD_TAG_ID is not None:
+            # no transitions from padding
+            self.transitions[self.PAD_TAG_ID, :] = -10000.0
+            # no transitions to padding
+            self.transitions[:, self.PAD_TAG_ID] = -10000.0
+            # except if the end of sentence is reached
+            # or we are already in a pad position
+            self.transitions[self.PAD_TAG_ID, self.EOS_TAG_ID] = 0.0
+            self.transitions[self.PAD_TAG_ID, self.PAD_TAG_ID] = 0.0
+
+    def forward(self,
+                emissions: paddle.Tensor,
+                tags: paddle.Tensor,
+                mask: paddle.Tensor=None) -> paddle.Tensor:
+        """Compute the negative log-likelihood. See `log_likelihood` method."""
+        nll = -self.log_likelihood(emissions, tags, mask=mask)
+        return nll
+
+    def log_likelihood(self, emissions, tags, mask=None):
+        """Compute the probability of a sequence of tags given a sequence of
+        emissions scores.
+
+        Args:
+            emissions (paddle.Tensor): Sequence of emissions for each label.
+                Shape of (batch_size, seq_len, nb_labels) if batch_first is True,
+                (seq_len, batch_size, nb_labels) otherwise.
+            tags (paddle.LongTensor): Sequence of labels.
+                Shape of (batch_size, seq_len) if batch_first is True,
+                (seq_len, batch_size) otherwise.
+            mask (paddle.FloatTensor, optional): Tensor representing valid positions.
+                If None, all positions are considered valid.
+                Shape of (batch_size, seq_len) if batch_first is True,
+                (seq_len, batch_size) otherwise.
+
+        Returns:
+            paddle.Tensor: sum of the log-likelihoods for each sequence in the batch.
+                Shape of ()
+        """
+        # fix tensors order by setting batch as the first dimension
+        if not self.batch_first:
+            emissions = emissions.transpose([1, 0, 2])
+            tags = tags.transpose([1, 0])
+
+        if mask is None:
+            mask = paddle.ones(emissions.shape[:2], dtype='float32')
+
+        scores = self._compute_scores(emissions, tags, mask=mask)
+        partition = self._compute_log_partition(emissions, mask=mask)
+        return paddle.sum(scores - partition)
+
+    def decode(self, emissions, mask=None):
+        """Find the most probable sequence of labels given the emissions using
+        the Viterbi algorithm.
+
+        Args:
+            emissions (paddle.Tensor): Sequence of emissions for each label.
+                Shape (batch_size, seq_len, nb_labels) if batch_first is True,
+                (seq_len, batch_size, nb_labels) otherwise.
+            mask (paddle.FloatTensor, optional): Tensor representing valid positions.
+                If None, all positions are considered valid.
+                Shape (batch_size, seq_len) if batch_first is True,
+                (seq_len, batch_size) otherwise.
+
+        Returns:
+            paddle.Tensor: the viterbi score for each batch.
+                Shape of (batch_size,)
+            list of lists: the best viterbi sequence of labels for each batch ([B, T])
+        """
+        # fix tensors order by setting batch as the first dimension
+        if not self.batch_first:
+            emissions = emissions.transpose([1, 0, 2])
+
+        if mask is None:
+            mask = paddle.ones(emissions.shape[:2], dtype='float32')
+
+        scores, sequences = self._viterbi_decode(emissions, mask)
+        return scores, sequences
+
+    def _compute_scores(self, emissions, tags, mask):
+        """Compute the scores for a given batch of emissions with their tags.
+
+        Args:
+            emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
+            tags (paddle.LongTensor): (batch_size, seq_len)
+            mask (paddle.FloatTensor): (batch_size, seq_len)
+
+        Returns:
+            paddle.Tensor: Scores for each batch.
+                Shape of (batch_size,)
+        """
+        batch_size, seq_length = tags.shape
+        scores = paddle.zeros([batch_size])
+
+        # save first and last tags to be used later
+        first_tags = tags[:, 0]
+        last_valid_idx = mask.astype(paddle.int64).sum(1) - 1
+
+        # TODO(Hui Zhang): not support fancy index.
+        # last_tags = tags.gather(last_valid_idx.unsqueeze(1), axis=1).squeeze()
+        batch_idx = paddle.arange(batch_size, dtype=last_valid_idx.dtype)
+        gather_last_valid_idx = paddle.stack(
+            [batch_idx, last_valid_idx], axis=-1)
+        last_tags = tags.gather_nd(gather_last_valid_idx)
+
+        # add the transition from BOS to the first tags for each batch
+        # t_scores = self.transitions[self.BOS_TAG_ID, first_tags]
+        t_scores = self.transitions[self.BOS_TAG_ID].gather(first_tags)
+
+        # add the [unary] emission scores for the first tags for each batch
+        # for all batches, the first word, see the correspondent emissions
+        # for the first tags (which is a list of ids):
+        # emissions[:, 0, [tag_1, tag_2, ..., tag_nblabels]]
+        # e_scores = emissions[:, 0].gather(1, first_tags.unsqueeze(1)).squeeze()
+        gather_first_tags_idx = paddle.stack([batch_idx, first_tags], axis=-1)
+        e_scores = emissions[:, 0].gather_nd(gather_first_tags_idx)
+
+        # the scores for a word is just the sum of both scores
+        scores += e_scores + t_scores
+
+        # now lets do this for each remaining word
+        for i in range(1, seq_length):
+
+            # we could: iterate over batches, check if we reached a mask symbol
+            # and stop the iteration, but vectorizing is faster due to gpu,
+            # so instead we perform an element-wise multiplication
+            is_valid = mask[:, i]
+
+            previous_tags = tags[:, i - 1]
+            current_tags = tags[:, i]
+
+            # calculate emission and transition scores as we did before
+            # e_scores = emissions[:, i].gather(1, current_tags.unsqueeze(1)).squeeze()
+            gather_current_tags_idx = paddle.stack(
+                [batch_idx, current_tags], axis=-1)
+            e_scores = emissions[:, i].gather_nd(gather_current_tags_idx)
+            # t_scores = self.transitions[previous_tags, current_tags]
+            gather_transitions_idx = paddle.stack(
+                [previous_tags, current_tags], axis=-1)
+            t_scores = self.transitions.gather_nd(gather_transitions_idx)
+
+            # apply the mask
+            e_scores = e_scores * is_valid
+            t_scores = t_scores * is_valid
+
+            scores += e_scores + t_scores
+
+        # add the transition from the end tag to the EOS tag for each batch
+        # scores += self.transitions[last_tags, self.EOS_TAG_ID]
+        scores += self.transitions.gather(last_tags)[:, self.EOS_TAG_ID]
+
+        return scores
+
+    def _compute_log_partition(self, emissions, mask):
+        """Compute the partition function in log-space using the forward-algorithm.
+
+        Args:
+            emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
+            mask (paddle.FloatTensor): (batch_size, seq_len)
+
+        Returns:
+            paddle.Tensor: the partition scores for each batch.
+                Shape of (batch_size,)
+        """
+        batch_size, seq_length, nb_labels = emissions.shape
+
+        # in the first iteration, BOS will have all the scores
+        alphas = self.transitions[self.BOS_TAG_ID, :].unsqueeze(
+            0) + emissions[:, 0]
+
+        for i in range(1, seq_length):
+            # (bs, nb_labels) -> (bs, 1, nb_labels)
+            e_scores = emissions[:, i].unsqueeze(1)
+
+            # (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels)
+            t_scores = self.transitions.unsqueeze(0)
+
+            # (bs, nb_labels) -> (bs, nb_labels, 1)
+            a_scores = alphas.unsqueeze(2)
+
+            scores = e_scores + t_scores + a_scores
+            new_alphas = paddle.logsumexp(scores, axis=1)
+
+            # set alphas if the mask is valid, otherwise keep the current values
+            is_valid = mask[:, i].unsqueeze(-1)
+            alphas = is_valid * new_alphas + (1 - is_valid) * alphas
+
+        # add the scores for the final transition
+        last_transition = self.transitions[:, self.EOS_TAG_ID]
+        end_scores = alphas + last_transition.unsqueeze(0)
+
+        # return a *log* of sums of exps
+        return paddle.logsumexp(end_scores, axis=1)
+
+    def _viterbi_decode(self, emissions, mask):
+        """Compute the viterbi algorithm to find the most probable sequence of labels
+        given a sequence of emissions.
+
+        Args:
+            emissions (paddle.Tensor): (batch_size, seq_len, nb_labels)
+            mask (paddle.FloatTensor): (batch_size, seq_len)
+
+        Returns:
+            paddle.Tensor: the viterbi score for each batch.
+                Shape of (batch_size,)
+            list of lists of ints: the best viterbi sequence of labels for each batch
+        """
+        batch_size, seq_length, nb_labels = emissions.shape
+
+        # in the first iteration, BOS will have all the scores and then, the max
+        alphas = self.transitions[self.BOS_TAG_ID, :].unsqueeze(
+            0) + emissions[:, 0]
+
+        backpointers = []
+
+        for i in range(1, seq_length):
+            # (bs, nb_labels) -> (bs, 1, nb_labels)
+            e_scores = emissions[:, i].unsqueeze(1)
+
+            # (nb_labels, nb_labels) -> (bs, nb_labels, nb_labels)
+            t_scores = self.transitions.unsqueeze(0)
+
+            # (bs, nb_labels) -> (bs, nb_labels, 1)
+            a_scores = alphas.unsqueeze(2)
+
+            # combine current scores with previous alphas
+            scores = e_scores + t_scores + a_scores
+
+            # so far is exactly like the forward algorithm,
+            # but now, instead of calculating the logsumexp,
+            # we will find the highest score and the tag associated with it
+            # max_scores, max_score_tags = paddle.max(scores, axis=1)
+            max_scores = paddle.max(scores, axis=1)
+            max_score_tags = paddle.argmax(scores, axis=1)
+
+            # set alphas if the mask is valid, otherwise keep the current values
+            is_valid = mask[:, i].unsqueeze(-1)
+            alphas = is_valid * max_scores + (1 - is_valid) * alphas
+
+            # add the max_score_tags for our list of backpointers
+            # max_scores has shape (batch_size, nb_labels) so we transpose it to
+            # be compatible with our previous loopy version of viterbi
+            backpointers.append(max_score_tags.transpose([1, 0]))
+
+        # add the scores for the final transition
+        last_transition = self.transitions[:, self.EOS_TAG_ID]
+        end_scores = alphas + last_transition.unsqueeze(0)
+
+        # get the final most probable score and the final most probable tag
+        # max_final_scores, max_final_tags = paddle.max(end_scores, axis=1)
+        max_final_scores = paddle.max(end_scores, axis=1)
+        max_final_tags = paddle.argmax(end_scores, axis=1)
+
+        # find the best sequence of labels for each sample in the batch
+        best_sequences = []
+        emission_lengths = mask.astype(paddle.int64).sum(axis=1)
+        for i in range(batch_size):
+
+            # recover the original sentence length for the i-th sample in the batch
+            sample_length = emission_lengths[i].item()
+
+            # recover the max tag for
the last timestep + sample_final_tag = max_final_tags[i].item() + + # limit the backpointers until the last but one + # since the last corresponds to the sample_final_tag + sample_backpointers = backpointers[:sample_length - 1] + + # follow the backpointers to build the sequence of labels + sample_path = self._find_best_path(i, sample_final_tag, + sample_backpointers) + + # add this path to the list of best sequences + best_sequences.append(sample_path) + + return max_final_scores, best_sequences + + def _find_best_path(self, sample_id, best_tag, backpointers): + """Auxiliary function to find the best path sequence for a specific sample. + + Args: + sample_id (int): sample index in the range [0, batch_size) + best_tag (int): tag which maximizes the final score + backpointers (list of lists of tensors): list of pointers with + shape (seq_len_i-1, nb_labels, batch_size) where seq_len_i + represents the length of the ith sample in the batch + + Returns: + list of ints: a list of tag indexes representing the bast path + """ + # add the final best_tag to our best path + best_path = [best_tag] + + # traverse the backpointers in backwards + for backpointers_t in reversed(backpointers): + + # recover the best_tag at this timestep + best_tag = backpointers_t[best_tag][sample_id].item() + + # append to the beginning of the list so we don't need to reverse it later + best_path.insert(0, best_tag) + + return best_path diff --git a/examples/transv1.8to2.x/deepspeech/modules/ctc.py b/examples/transv1.8to2.x/deepspeech/modules/ctc.py new file mode 100644 index 00000000..356910ce --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/modules/ctc.py @@ -0,0 +1,274 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import paddle +from paddle import nn +from paddle.nn import functional as F +from typeguard import check_argument_types + +from deepspeech.modules.loss import CTCLoss +from deepspeech.utils import ctc_utils +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +try: + from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch # noqa: F401 + from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder # noqa: F401 + from deepspeech.decoders.swig_wrapper import Scorer # noqa: F401 +except Exception as e: + logger.info("ctcdecoder not installed!") + +__all__ = ['CTCDecoder'] + + +class CTCDecoder(nn.Layer): + def __init__(self, + odim, + enc_n_units, + blank_id=0, + dropout_rate: float=0.0, + reduction: bool=True, + batch_average: bool=True): + """CTC decoder + + Args: + odim ([int]): text vocabulary size + enc_n_units ([int]): encoder output dimention + dropout_rate (float): dropout rate (0.0 ~ 1.0) + reduction (bool): reduce the CTC loss into a scalar, True for 'sum' or 'none' + batch_average (bool): do batch dim wise average. + grad_norm_type (str): one of 'instance', 'batchsize', 'frame', None. 
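+
+        Example (illustrative sizes):
+
+            decoder = CTCDecoder(odim=29, enc_n_units=2048, blank_id=0)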
+        """
+        assert check_argument_types()
+        super().__init__()
+
+        self.blank_id = blank_id
+        self.odim = odim
+        self.dropout_rate = dropout_rate
+        self.ctc_lo = nn.Linear(enc_n_units, self.odim)
+        reduction_type = "sum" if reduction else "none"
+        self.criterion = CTCLoss(
+            blank=self.blank_id,
+            reduction=reduction_type,
+            batch_average=batch_average)
+
+        # CTCDecoder LM Score handle
+        self._ext_scorer = None
+
+    def forward(self, hs_pad, hlens, ys_pad, ys_lens):
+        """Calculate CTC loss.
+
+        Args:
+            hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D)
+            hlens (Tensor): batch of lengths of hidden state sequences (B)
+            ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax)
+            ys_lens (Tensor): batch of lengths of character sequence (B)
+        Returns:
+            loss (Tensor): ctc loss value, scalar.
+        """
+        logits = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate))
+        loss = self.criterion(logits, ys_pad, hlens, ys_lens)
+        return loss
+
+    def softmax(self, eouts: paddle.Tensor, temperature: float=1.0):
+        """Get CTC probabilities.
+        Args:
+            eouts (FloatTensor): `[B, T, enc_units]`
+        Returns:
+            probs (FloatTensor): `[B, T, odim]`
+        """
+        self.probs = F.softmax(self.ctc_lo(eouts) / temperature, axis=2)
+        return self.probs
+
+    def log_softmax(self, hs_pad: paddle.Tensor,
+                    temperature: float=1.0) -> paddle.Tensor:
+        """log_softmax of frame activations
+        Args:
+            Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
+        Returns:
+            paddle.Tensor: log softmax applied 3d tensor (B, Tmax, odim)
+        """
+        return F.log_softmax(self.ctc_lo(hs_pad) / temperature, axis=2)
+
+    def argmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
+        """argmax of frame activations
+        Args:
+            paddle.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
+        Returns:
+            paddle.Tensor: argmax applied 2d tensor (B, Tmax)
+        """
+        return paddle.argmax(self.ctc_lo(hs_pad), axis=2)
+
+    def forced_align(self,
+                     ctc_probs: paddle.Tensor,
+                     y: paddle.Tensor,
+                     blank_id=0) -> list:
+        """ctc forced alignment.
+        Args:
+            ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D)
+            y (paddle.Tensor): label id sequence tensor, 1d tensor (L)
+            blank_id (int): blank symbol index
+        Returns:
+            paddle.Tensor: best alignment result, (T).
+        """
+        return ctc_utils.forced_align(ctc_probs, y, blank_id)
+
+    def _decode_batch_greedy(self, probs_split, vocab_list):
+        """Decode by best path for a batch of probs matrix input.
+        :param probs_split: List of 2-D probability matrix, and each consists
+                            of prob vectors for one speech utterance.
+        :type probs_split: List of matrix
+        :param vocab_list: List of tokens in the vocabulary, for decoding.
+        :type vocab_list: list
+        :return: List of transcription texts.
+        :rtype: List of str
+        """
+        results = []
+        for i, probs in enumerate(probs_split):
+            output_transcription = ctc_greedy_decoder(
+                probs_seq=probs, vocabulary=vocab_list, blank_id=self.blank_id)
+            results.append(output_transcription)
+        return results
+
+    def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
+                         vocab_list):
+        """Initialize the external scorer.
+        :param beam_alpha: Parameter associated with language model.
+        :type beam_alpha: float
+        :param beam_beta: Parameter associated with word count.
+        :type beam_beta: float
+        :param language_model_path: Filepath for language model. If it is
+                                    empty, the external scorer will be set to
+                                    None, and the decoding method will be pure
+                                    beam search without scorer.
+        :type language_model_path: str|None
+        :param vocab_list: List of tokens in the vocabulary, for decoding.
+        :type vocab_list: list
+        """
+        # init once
+        if self._ext_scorer is not None:
+            return
+
+        if language_model_path != '':
+            logger.info("begin to initialize the external scorer "
+                        "for decoding")
+            self._ext_scorer = Scorer(beam_alpha, beam_beta,
+                                      language_model_path, vocab_list)
+            lm_char_based = self._ext_scorer.is_character_based()
+            lm_max_order = self._ext_scorer.get_max_order()
+            lm_dict_size = self._ext_scorer.get_dict_size()
+            logger.info("language model: "
+                        "is_character_based = %d," % lm_char_based +
+                        " max_order = %d," % lm_max_order + " dict_size = %d" %
+                        lm_dict_size)
+            logger.info("end initializing scorer")
+        else:
+            self._ext_scorer = None
+            logger.info("no language model provided, "
+                        "decoding by pure beam search without scorer.")
+
+    def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
+                                  beam_size, cutoff_prob, cutoff_top_n,
+                                  vocab_list, num_processes):
+        """Decode by beam search for a batch of probs matrix input.
+        :param probs_split: List of 2-D probability matrix, and each consists
+                            of prob vectors for one speech utterance.
+        :type probs_split: List of matrix
+        :param beam_alpha: Parameter associated with language model.
+        :type beam_alpha: float
+        :param beam_beta: Parameter associated with word count.
+        :type beam_beta: float
+        :param beam_size: Width for Beam search.
+        :type beam_size: int
+        :param cutoff_prob: Cutoff probability in pruning,
+                            default 1.0, no pruning.
+        :type cutoff_prob: float
+        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                             characters with highest probs in vocabulary will be
+                             used in beam search, default 40.
+        :type cutoff_top_n: int
+        :param vocab_list: List of tokens in the vocabulary, for decoding.
+        :type vocab_list: list
+        :param num_processes: Number of processes (CPU) for decoder.
+        :type num_processes: int
+        :return: List of transcription texts.
+        :rtype: List of str
+        """
+        if self._ext_scorer is not None:
+            self._ext_scorer.reset_params(beam_alpha, beam_beta)
+
+        # beam search decode
+        num_processes = min(num_processes, len(probs_split))
+        beam_search_results = ctc_beam_search_decoder_batch(
+            probs_split=probs_split,
+            vocabulary=vocab_list,
+            beam_size=beam_size,
+            num_processes=num_processes,
+            ext_scoring_func=self._ext_scorer,
+            cutoff_prob=cutoff_prob,
+            cutoff_top_n=cutoff_top_n,
+            blank_id=self.blank_id)
+
+        results = [result[0][1] for result in beam_search_results]
+        return results
+
+    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
+                    decoding_method):
+
+        if decoding_method == "ctc_beam_search":
+            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
+                                  vocab_list)
+
+    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
+                     lang_model_path, beam_alpha, beam_beta, beam_size,
+                     cutoff_prob, cutoff_top_n, num_processes):
+        """ctc decoding with probs.
+
+        Args:
+            probs (Tensor): activation after softmax
+            logits_lens (Tensor): audio output lens
+            vocab_list (list): list of tokens in the vocabulary, for decoding
+            decoding_method (str): "ctc_greedy" or "ctc_beam_search"
+            lang_model_path (str): filepath for the language model
+            beam_alpha (float): parameter associated with the language model
+            beam_beta (float): parameter associated with word count
+            beam_size (int): width for beam search
+            cutoff_prob (float): cutoff probability in pruning
+            cutoff_top_n (int): cutoff number in pruning
+            num_processes (int): number of processes (CPU) for the decoder
+
+        Raises:
+            ValueError: when decoding_method is not supported.
+
+        Returns:
+            List[str]: transcripts.
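+
+        Note: for "ctc_greedy" the language-model arguments are ignored;
+        they only affect "ctc_beam_search". A minimal greedy call might
+        look like (placeholder arguments):
+
+            texts = decoder.decode_probs(
+                probs.numpy(), eouts_len, vocab_list, "ctc_greedy",
+                None, 1.0, 1.0, 10, 1.0, 40, 1)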
+ """ + + probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)] + if decoding_method == "ctc_greedy": + result_transcripts = self._decode_batch_greedy( + probs_split=probs_split, vocab_list=vocab_list) + elif decoding_method == "ctc_beam_search": + result_transcripts = self._decode_batch_beam_search( + probs_split=probs_split, + beam_alpha=beam_alpha, + beam_beta=beam_beta, + beam_size=beam_size, + cutoff_prob=cutoff_prob, + cutoff_top_n=cutoff_top_n, + vocab_list=vocab_list, + num_processes=num_processes) + else: + raise ValueError(f"Not support: {decoding_method}") + return result_transcripts diff --git a/examples/transv1.8to2.x/deepspeech/modules/decoder.py b/examples/transv1.8to2.x/deepspeech/modules/decoder.py new file mode 100644 index 00000000..87c9fa49 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/modules/decoder.py @@ -0,0 +1,182 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Decoder definition.""" +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from deepspeech.modules.attention import MultiHeadedAttention +from deepspeech.modules.decoder_layer import DecoderLayer +from deepspeech.modules.embedding import PositionalEncoding +from deepspeech.modules.mask import make_non_pad_mask +from deepspeech.modules.mask import subsequent_mask +from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["TransformerDecoder"] + + +class TransformerDecoder(nn.Layer): + """Base class of Transfomer decoder module. + Args: + vocab_size: output dim + encoder_output_size: dimension of attention + attention_heads: the number of heads of multi head attention + linear_units: the hidden units number of position-wise feedforward + num_blocks: the number of decoder blocks + dropout_rate: dropout rate + self_attention_dropout_rate: dropout rate for attention + input_layer: input layer type, `embed` + use_output_layer: whether to use output layer + pos_enc_class: PositionalEncoding module + normalize_before: + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. 
+        concat_after: whether to concat attention layer's input and output
+            True: x -> x + linear(concat(x, att(x)))
+            False: x -> x + att(x)
+    """
+
+    def __init__(
+            self,
+            vocab_size: int,
+            encoder_output_size: int,
+            attention_heads: int=4,
+            linear_units: int=2048,
+            num_blocks: int=6,
+            dropout_rate: float=0.1,
+            positional_dropout_rate: float=0.1,
+            self_attention_dropout_rate: float=0.0,
+            src_attention_dropout_rate: float=0.0,
+            input_layer: str="embed",
+            use_output_layer: bool=True,
+            normalize_before: bool=True,
+            concat_after: bool=False, ):
+
+        assert check_argument_types()
+        super().__init__()
+        attention_dim = encoder_output_size
+
+        if input_layer == "embed":
+            self.embed = nn.Sequential(
+                nn.Embedding(vocab_size, attention_dim),
+                PositionalEncoding(attention_dim, positional_dropout_rate), )
+        else:
+            raise ValueError(f"only 'embed' is supported: {input_layer}")
+
+        self.normalize_before = normalize_before
+        self.after_norm = nn.LayerNorm(attention_dim, epsilon=1e-12)
+        self.use_output_layer = use_output_layer
+        self.output_layer = nn.Linear(attention_dim, vocab_size)
+
+        self.decoders = nn.LayerList([
+            DecoderLayer(
+                size=attention_dim,
+                self_attn=MultiHeadedAttention(attention_heads, attention_dim,
+                                               self_attention_dropout_rate),
+                src_attn=MultiHeadedAttention(attention_heads, attention_dim,
+                                              src_attention_dropout_rate),
+                feed_forward=PositionwiseFeedForward(
+                    attention_dim, linear_units, dropout_rate),
+                dropout_rate=dropout_rate,
+                normalize_before=normalize_before,
+                concat_after=concat_after, ) for _ in range(num_blocks)
+        ])
+
+    def forward(
+            self,
+            memory: paddle.Tensor,
+            memory_mask: paddle.Tensor,
+            ys_in_pad: paddle.Tensor,
+            ys_in_lens: paddle.Tensor, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """Forward decoder.
+        Args:
+            memory: encoded memory, float32 (batch, maxlen_in, feat)
+            memory_mask: encoder memory mask, (batch, 1, maxlen_in)
+            ys_in_pad: padded input token ids, int64 (batch, maxlen_out)
+            ys_in_lens: input lengths of this batch (batch)
+        Returns:
+            (tuple): tuple containing:
+                x: decoded token score before softmax (batch, maxlen_out, vocab_size)
+                    if use_output_layer is True,
+                olens: (batch, )
+        """
+        tgt = ys_in_pad
+        # tgt_mask: (B, 1, L)
+        tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
+        # m: (1, L, L)
+        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
+        # tgt_mask: (B, L, L)
+        # TODO(Hui Zhang): not support & for tensor
+        # tgt_mask = tgt_mask & m
+        tgt_mask = tgt_mask.logical_and(m)
+
+        x, _ = self.embed(tgt)
+        for layer in self.decoders:
+            x, tgt_mask, memory, memory_mask = layer(x, tgt_mask, memory,
+                                                     memory_mask)
+        if self.normalize_before:
+            x = self.after_norm(x)
+        if self.use_output_layer:
+            x = self.output_layer(x)
+
+        # TODO(Hui Zhang): reduce_sum not support bool type
+        # olens = tgt_mask.sum(1)
+        olens = tgt_mask.astype(paddle.int64).sum(1)
+        return x, olens
+
+    def forward_one_step(
+            self,
+            memory: paddle.Tensor,
+            memory_mask: paddle.Tensor,
+            tgt: paddle.Tensor,
+            tgt_mask: paddle.Tensor,
+            cache: Optional[List[paddle.Tensor]]=None,
+    ) -> Tuple[paddle.Tensor, List[paddle.Tensor]]:
+        """Forward one step.
+        This is only used for decoding.
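+        Each call consumes one more target token: `cache` holds every
+        layer's output from the previous step, so only the newest position
+        is recomputed (see DecoderLayer.forward).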
+        Args:
+            memory: encoded memory, float32 (batch, maxlen_in, feat)
+            memory_mask: encoded memory mask, (batch, 1, maxlen_in)
+            tgt: input token ids, int64 (batch, maxlen_out)
+            tgt_mask: input token mask, (batch, maxlen_out, maxlen_out)
+                dtype=paddle.bool
+            cache: cached output list of (batch, max_time_out-1, size)
+        Returns:
+            y, cache: NN output value and cache per `self.decoders`.
+                `y.shape` is (batch, token)
+        """
+        x, _ = self.embed(tgt)
+        new_cache = []
+        for i, decoder in enumerate(self.decoders):
+            if cache is None:
+                c = None
+            else:
+                c = cache[i]
+            x, tgt_mask, memory, memory_mask = decoder(
+                x, tgt_mask, memory, memory_mask, cache=c)
+            new_cache.append(x)
+        if self.normalize_before:
+            y = self.after_norm(x[:, -1])
+        else:
+            y = x[:, -1]
+        if self.use_output_layer:
+            y = paddle.nn.functional.log_softmax(self.output_layer(y), axis=-1)
+        return y, new_cache
diff --git a/examples/transv1.8to2.x/deepspeech/modules/decoder_layer.py b/examples/transv1.8to2.x/deepspeech/modules/decoder_layer.py
new file mode 100644
index 00000000..47c42615
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/modules/decoder_layer.py
@@ -0,0 +1,151 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Decoder self-attention layer definition."""
+from typing import Optional
+from typing import Tuple
+
+import paddle
+from paddle import nn
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ["DecoderLayer"]
+
+
+class DecoderLayer(nn.Layer):
+    """Single decoder layer module.
+    Args:
+        size (int): Input dimension.
+        self_attn (nn.Layer): Self-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        src_attn (nn.Layer): Source-attention module instance.
+            `MultiHeadedAttention` instance can be used as the argument.
+        feed_forward (nn.Layer): Feed-forward module instance.
+            `PositionwiseFeedForward` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool):
+            True: use layer_norm before each sub-block.
+            False: use layer_norm after each sub-block.
+        concat_after (bool): Whether to concat attention layer's input
+            and output.
+ True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + """ + + def __init__( + self, + size: int, + self_attn: nn.Layer, + src_attn: nn.Layer, + feed_forward: nn.Layer, + dropout_rate: float, + normalize_before: bool=True, + concat_after: bool=False, ): + """Construct an DecoderLayer object.""" + super().__init__() + self.size = size + self.self_attn = self_attn + self.src_attn = src_attn + self.feed_forward = feed_forward + self.norm1 = nn.LayerNorm(size, epsilon=1e-12) + self.norm2 = nn.LayerNorm(size, epsilon=1e-12) + self.norm3 = nn.LayerNorm(size, epsilon=1e-12) + self.dropout = nn.Dropout(dropout_rate) + self.normalize_before = normalize_before + self.concat_after = concat_after + self.concat_linear1 = nn.Linear(size + size, size) + self.concat_linear2 = nn.Linear(size + size, size) + + def forward( + self, + tgt: paddle.Tensor, + tgt_mask: paddle.Tensor, + memory: paddle.Tensor, + memory_mask: paddle.Tensor, + cache: Optional[paddle.Tensor]=None + ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor, paddle.Tensor]: + """Compute decoded features. + Args: + tgt (paddle.Tensor): Input tensor (#batch, maxlen_out, size). + tgt_mask (paddle.Tensor): Mask for input tensor + (#batch, maxlen_out). + memory (paddle.Tensor): Encoded memory + (#batch, maxlen_in, size). + memory_mask (paddle.Tensor): Encoded memory mask + (#batch, maxlen_in). + cache (paddle.Tensor): cached tensors. + (#batch, maxlen_out - 1, size). + Returns: + paddle.Tensor: Output tensor (#batch, maxlen_out, size). + paddle.Tensor: Mask for output tensor (#batch, maxlen_out). + paddle.Tensor: Encoded memory (#batch, maxlen_in, size). + paddle.Tensor: Encoded memory mask (#batch, maxlen_in). + """ + residual = tgt + if self.normalize_before: + tgt = self.norm1(tgt) + + if cache is None: + tgt_q = tgt + tgt_q_mask = tgt_mask + else: + # compute only the last frame query keeping dim: max_time_out -> 1 + assert cache.shape == [ + tgt.shape[0], + tgt.shape[1] - 1, + self.size, + ], f"{cache.shape} == {[tgt.shape[0], tgt.shape[1] - 1, self.size]}" + tgt_q = tgt[:, -1:, :] + residual = residual[:, -1:, :] + # TODO(Hui Zhang): slice not support bool type + # tgt_q_mask = tgt_mask[:, -1:, :] + tgt_q_mask = tgt_mask.cast(paddle.int64)[:, -1:, :].cast( + paddle.bool) + + if self.concat_after: + tgt_concat = paddle.cat( + (tgt_q, self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)), dim=-1) + x = residual + self.concat_linear1(tgt_concat) + else: + x = residual + self.dropout( + self.self_attn(tgt_q, tgt, tgt, tgt_q_mask)) + if not self.normalize_before: + x = self.norm1(x) + + residual = x + if self.normalize_before: + x = self.norm2(x) + if self.concat_after: + x_concat = paddle.cat( + (x, self.src_attn(x, memory, memory, memory_mask)), dim=-1) + x = residual + self.concat_linear2(x_concat) + else: + x = residual + self.dropout( + self.src_attn(x, memory, memory, memory_mask)) + if not self.normalize_before: + x = self.norm2(x) + + residual = x + if self.normalize_before: + x = self.norm3(x) + x = residual + self.dropout(self.feed_forward(x)) + if not self.normalize_before: + x = self.norm3(x) + + if cache is not None: + x = paddle.cat([cache, x], dim=1) + + return x, tgt_mask, memory, memory_mask diff --git a/examples/transv1.8to2.x/deepspeech/modules/encoder.py b/examples/transv1.8to2.x/deepspeech/modules/encoder.py new file mode 100644 index 00000000..71ec61a0 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/modules/encoder.py @@ -0,0 +1,453 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Encoder definition.""" +from typing import List +from typing import Optional +from typing import Tuple + +import paddle +from paddle import nn +from typeguard import check_argument_types + +from deepspeech.modules.activation import get_activation +from deepspeech.modules.attention import MultiHeadedAttention +from deepspeech.modules.attention import RelPositionMultiHeadedAttention +from deepspeech.modules.conformer_convolution import ConvolutionModule +from deepspeech.modules.embedding import PositionalEncoding +from deepspeech.modules.embedding import RelPositionalEncoding +from deepspeech.modules.encoder_layer import ConformerEncoderLayer +from deepspeech.modules.encoder_layer import TransformerEncoderLayer +from deepspeech.modules.mask import add_optional_chunk_mask +from deepspeech.modules.mask import make_non_pad_mask +from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward +from deepspeech.modules.subsampling import Conv2dSubsampling4 +from deepspeech.modules.subsampling import Conv2dSubsampling6 +from deepspeech.modules.subsampling import Conv2dSubsampling8 +from deepspeech.modules.subsampling import LinearNoSubsampling +from deepspeech.utils.log import Log + +logger = Log(__name__).getlog() + +__all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder"] + + +class BaseEncoder(nn.Layer): + def __init__( + self, + input_size: int, + output_size: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_chunk: bool=False, + global_cmvn: paddle.nn.Layer=None, + use_dynamic_left_chunk: bool=False, ): + """ + Args: + input_size (int): input dim, d_feature + output_size (int): dimension of attention, d_model + attention_heads (int): the number of heads of multi head attention + linear_units (int): the hidden units number of position-wise feed + forward + num_blocks (int): the number of encoder blocks + dropout_rate (float): dropout rate + attention_dropout_rate (float): dropout rate in attention + positional_dropout_rate (float): dropout rate after adding + positional encoding + input_layer (str): input layer type. + optional [linear, conv2d, conv2d6, conv2d8] + pos_enc_layer_type (str): Encoder positional encoding layer type. + opitonal [abs_pos, scaled_abs_pos, rel_pos] + normalize_before (bool): + True: use layer_norm before each sub-block of a layer. + False: use layer_norm after each sub-block of a layer. + concat_after (bool): whether to concat attention layer's input + and output. 
+ True: x -> x + linear(concat(x, att(x))) + False: x -> x + att(x) + static_chunk_size (int): chunk size for static chunk training and + decoding + use_dynamic_chunk (bool): whether use dynamic chunk size for + training or not, You can only use fixed chunk(chunk_size > 0) + or dyanmic chunk size(use_dynamic_chunk = True) + global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer + use_dynamic_left_chunk (bool): whether use dynamic left chunk in + dynamic chunk training + """ + assert check_argument_types() + super().__init__() + self._output_size = output_size + + if pos_enc_layer_type == "abs_pos": + pos_enc_class = PositionalEncoding + elif pos_enc_layer_type == "rel_pos": + pos_enc_class = RelPositionalEncoding + else: + raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type) + + if input_layer == "linear": + subsampling_class = LinearNoSubsampling + elif input_layer == "conv2d": + subsampling_class = Conv2dSubsampling4 + elif input_layer == "conv2d6": + subsampling_class = Conv2dSubsampling6 + elif input_layer == "conv2d8": + subsampling_class = Conv2dSubsampling8 + else: + raise ValueError("unknown input_layer: " + input_layer) + + self.global_cmvn = global_cmvn + self.embed = subsampling_class( + idim=input_size, + odim=output_size, + dropout_rate=dropout_rate, + pos_enc_class=pos_enc_class( + d_model=output_size, dropout_rate=positional_dropout_rate), ) + + self.normalize_before = normalize_before + self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12) + self.static_chunk_size = static_chunk_size + self.use_dynamic_chunk = use_dynamic_chunk + self.use_dynamic_left_chunk = use_dynamic_left_chunk + + def output_size(self) -> int: + return self._output_size + + def forward( + self, + xs: paddle.Tensor, + xs_lens: paddle.Tensor, + decoding_chunk_size: int=0, + num_decoding_left_chunks: int=-1, + ) -> Tuple[paddle.Tensor, paddle.Tensor]: + """Embed positions in tensor. + Args: + xs: padded input tensor (B, L, D) + xs_lens: input length (B) + decoding_chunk_size: decoding chunk size for dynamic chunk + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + num_decoding_left_chunks: number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. 
+ >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + encoder output tensor, lens and mask + """ + masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L) + + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor + xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0) + #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor + masks = masks.astype(paddle.bool) + #TODO(Hui Zhang): mask_pad = ~masks + mask_pad = masks.logical_not() + chunk_masks = add_optional_chunk_mask( + xs, masks, self.use_dynamic_chunk, self.use_dynamic_left_chunk, + decoding_chunk_size, self.static_chunk_size, + num_decoding_left_chunks) + for layer in self.encoders: + xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad) + if self.normalize_before: + xs = self.after_norm(xs) + # Here we assume the mask is not changed in encoder layers, so just + # return the masks before encoder layers, and the masks will be used + # for cross attention with decoder later + return xs, masks + + def forward_chunk( + self, + xs: paddle.Tensor, + offset: int, + required_cache_size: int, + subsampling_cache: Optional[paddle.Tensor]=None, + elayers_output_cache: Optional[List[paddle.Tensor]]=None, + conformer_cnn_cache: Optional[List[paddle.Tensor]]=None, + ) -> Tuple[paddle.Tensor, paddle.Tensor, List[paddle.Tensor], List[ + paddle.Tensor]]: + """ Forward just one chunk + Args: + xs (paddle.Tensor): chunk input, [B=1, T, D] + offset (int): current offset in encoder output time stamp + required_cache_size (int): cache size required for next chunk + compuation + >=0: actual cache size + <0: means all history cache is required + subsampling_cache (Optional[paddle.Tensor]): subsampling cache + elayers_output_cache (Optional[List[paddle.Tensor]]): + transformer/conformer encoder layers output cache + conformer_cnn_cache (Optional[List[paddle.Tensor]]): conformer + cnn cache + Returns: + paddle.Tensor: output of current input xs + paddle.Tensor: subsampling cache required for next chunk computation + List[paddle.Tensor]: encoder layers output cache required for next + chunk computation + List[paddle.Tensor]: conformer cnn cache + """ + assert xs.size(0) == 1 # batch size must be one + # tmp_masks is just for interface compatibility + # TODO(Hui Zhang): stride_slice not support bool tensor + # tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) + tmp_masks = paddle.ones([1, xs.size(1)], dtype=paddle.int32) + tmp_masks = tmp_masks.unsqueeze(1) #[B=1, C=1, T] + + if self.global_cmvn is not None: + xs = self.global_cmvn(xs) + + xs, pos_emb, _ = self.embed( + xs, tmp_masks, offset=offset) #xs=(B, T, D), pos_emb=(B=1, T, D) + + if subsampling_cache is not None: + cache_size = subsampling_cache.size(1) #T + xs = paddle.cat((subsampling_cache, xs), dim=1) + else: + cache_size = 0 + + # only used when using `RelPositionMultiHeadedAttention` + pos_emb = self.embed.position_encoding( + offset=offset - cache_size, size=xs.size(1)) + + if required_cache_size < 0: + next_cache_start = 0 + elif required_cache_size == 0: + next_cache_start = xs.size(1) + else: + next_cache_start = xs.size(1) - required_cache_size + r_subsampling_cache = xs[:, next_cache_start:, :] + + # Real mask for transformer/conformer layers + masks = paddle.ones([1, xs.size(1)], dtype=paddle.bool) + masks = masks.unsqueeze(1) #[B=1, L'=1, T] + r_elayers_output_cache = [] + r_conformer_cnn_cache = [] + for i, layer in 
enumerate(self.encoders):
+            attn_cache = None if elayers_output_cache is None else elayers_output_cache[
+                i]
+            cnn_cache = None if conformer_cnn_cache is None else conformer_cnn_cache[
+                i]
+            xs, _, new_cnn_cache = layer(
+                xs,
+                masks,
+                pos_emb,
+                output_cache=attn_cache,
+                cnn_cache=cnn_cache)
+            r_elayers_output_cache.append(xs[:, next_cache_start:, :])
+            r_conformer_cnn_cache.append(new_cnn_cache)
+        if self.normalize_before:
+            xs = self.after_norm(xs)
+
+        return (xs[:, cache_size:, :], r_subsampling_cache,
+                r_elayers_output_cache, r_conformer_cnn_cache)
+
+    def forward_chunk_by_chunk(
+            self,
+            xs: paddle.Tensor,
+            decoding_chunk_size: int,
+            num_decoding_left_chunks: int=-1,
+    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """ Forward input chunk by chunk, with a fixed chunk size, in a
+            streaming fashion.
+        Here we should pay special attention to the computation cache in the
+        streaming-style chunk-by-chunk forward. Three things should be taken
+        into account for computation in the current network:
+            1. transformer/conformer encoder layers output cache
+            2. convolution in conformer
+            3. convolution in subsampling
+        However, we don't implement a subsampling cache because:
+            1. We can make the subsampling module output the right result by
+               overlapping the input instead of caching the left context;
+               this wastes some computation, but subsampling takes only a
+               very small fraction of the whole model's computation.
+            2. Typically, there are several convolution layers with
+               subsampling in the subsampling module, so it is tricky and
+               complicated to implement caching across convolution layers
+               with different subsampling rates.
+            3. Currently, nn.Sequential is used to stack all the convolution
+               layers in subsampling; we would need to rewrite it to make it
+               work with a cache, which is not preferred.
+        Args:
+            xs (paddle.Tensor): (1, max_len, dim)
+            decoding_chunk_size (int): decoding chunk size.
+            num_decoding_left_chunks (int): decoding with num left chunks.
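+        Returns:
+            paddle.Tensor: encoder output for the whole input, (1, T', D)
+            paddle.Tensor: fake mask of ones, (1, 1, T')
+
+        An illustrative streaming call (`encoder` and `feats` are
+        placeholders), assuming a model trained with a static chunk size
+        of 16:
+
+            ys, masks = encoder.forward_chunk_by_chunk(
+                feats, decoding_chunk_size=16, num_decoding_left_chunks=-1)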
+ """ + assert decoding_chunk_size > 0 + # The model is trained by static or dynamic chunk + assert self.static_chunk_size > 0 or self.use_dynamic_chunk + + # feature stride and window for `subsampling` module + subsampling = self.embed.subsampling_rate + context = self.embed.right_context + 1 # Add current frame + stride = subsampling * decoding_chunk_size + decoding_window = (decoding_chunk_size - 1) * subsampling + context + + num_frames = xs.size(1) + required_cache_size = decoding_chunk_size * num_decoding_left_chunks + subsampling_cache: Optional[paddle.Tensor] = None + elayers_output_cache: Optional[List[paddle.Tensor]] = None + conformer_cnn_cache: Optional[List[paddle.Tensor]] = None + outputs = [] + offset = 0 + # Feed forward overlap input step by step + for cur in range(0, num_frames - context + 1, stride): + end = min(cur + decoding_window, num_frames) + chunk_xs = xs[:, cur:end, :] + (y, subsampling_cache, elayers_output_cache, + conformer_cnn_cache) = self.forward_chunk( + chunk_xs, offset, required_cache_size, subsampling_cache, + elayers_output_cache, conformer_cnn_cache) + outputs.append(y) + offset += y.size(1) + ys = paddle.cat(outputs, 1) + # fake mask, just for jit script and compatibility with `forward` api + masks = paddle.ones([1, ys.size(1)], dtype=paddle.bool) + masks = masks.unsqueeze(1) + return ys, masks + + +class TransformerEncoder(BaseEncoder): + """Transformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_chunk: bool=False, + global_cmvn: nn.Layer=None, + use_dynamic_left_chunk: bool=False, ): + """ Construct TransformerEncoder + See Encoder for the meaning of each parameter. 
+ """ + assert check_argument_types() + super().__init__(input_size, output_size, attention_heads, linear_units, + num_blocks, dropout_rate, positional_dropout_rate, + attention_dropout_rate, input_layer, + pos_enc_layer_type, normalize_before, concat_after, + static_chunk_size, use_dynamic_chunk, global_cmvn, + use_dynamic_left_chunk) + self.encoders = nn.LayerList([ + TransformerEncoderLayer( + size=output_size, + self_attn=MultiHeadedAttention(attention_heads, output_size, + attention_dropout_rate), + feed_forward=PositionwiseFeedForward(output_size, linear_units, + dropout_rate), + dropout_rate=dropout_rate, + normalize_before=normalize_before, + concat_after=concat_after) for _ in range(num_blocks) + ]) + + +class ConformerEncoder(BaseEncoder): + """Conformer encoder module.""" + + def __init__( + self, + input_size: int, + output_size: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="rel_pos", + normalize_before: bool=True, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_chunk: bool=False, + global_cmvn: nn.Layer=None, + use_dynamic_left_chunk: bool=False, + positionwise_conv_kernel_size: int=1, + macaron_style: bool=True, + selfattention_layer_type: str="rel_selfattn", + activation_type: str="swish", + use_cnn_module: bool=True, + cnn_module_kernel: int=15, + causal: bool=False, + cnn_module_norm: str="batch_norm", ): + """Construct ConformerEncoder + Args: + input_size to use_dynamic_chunk, see in BaseEncoder + positionwise_conv_kernel_size (int): Kernel size of positionwise + conv1d layer. + macaron_style (bool): Whether to use macaron style for + positionwise layer. + selfattention_layer_type (str): Encoder attention layer type, + the parameter has no effect now, it's just for configure + compatibility. + activation_type (str): Encoder activation function type. + use_cnn_module (bool): Whether to use convolution module. + cnn_module_kernel (int): Kernel size of convolution module. + causal (bool): whether to use causal convolution or not. 
+            cnn_module_norm (str): cnn conv norm type, Optional['batch_norm', 'layer_norm']
+        """
+        assert check_argument_types()
+        super().__init__(input_size, output_size, attention_heads, linear_units,
+                         num_blocks, dropout_rate, positional_dropout_rate,
+                         attention_dropout_rate, input_layer,
+                         pos_enc_layer_type, normalize_before, concat_after,
+                         static_chunk_size, use_dynamic_chunk, global_cmvn,
+                         use_dynamic_left_chunk)
+        activation = get_activation(activation_type)
+
+        # self-attention module definition
+        encoder_selfattn_layer = RelPositionMultiHeadedAttention
+        encoder_selfattn_layer_args = (attention_heads, output_size,
+                                       attention_dropout_rate)
+        # feed-forward module definition
+        positionwise_layer = PositionwiseFeedForward
+        positionwise_layer_args = (output_size, linear_units, dropout_rate,
+                                   activation)
+        # convolution module definition
+        convolution_layer = ConvolutionModule
+        convolution_layer_args = (output_size, cnn_module_kernel, activation,
+                                  cnn_module_norm, causal)
+
+        self.encoders = nn.LayerList([
+            ConformerEncoderLayer(
+                size=output_size,
+                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                feed_forward=positionwise_layer(*positionwise_layer_args),
+                feed_forward_macaron=positionwise_layer(
+                    *positionwise_layer_args) if macaron_style else None,
+                conv_module=convolution_layer(*convolution_layer_args)
+                if use_cnn_module else None,
+                dropout_rate=dropout_rate,
+                normalize_before=normalize_before,
+                concat_after=concat_after) for _ in range(num_blocks)
+        ])
diff --git a/examples/transv1.8to2.x/deepspeech/modules/loss.py b/examples/transv1.8to2.x/deepspeech/modules/loss.py
new file mode 100644
index 00000000..8918ca66
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/modules/loss.py
@@ -0,0 +1,144 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ['CTCLoss', "LabelSmoothingLoss"]
+
+
+class CTCLoss(nn.Layer):
+    def __init__(self, blank=0, reduction='sum', batch_average=False):
+        super().__init__()
+        # last token id as blank id
+        self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
+        self.batch_average = batch_average
+
+    def forward(self, logits, ys_pad, hlens, ys_lens):
+        """Compute CTC loss.
+
+        Args:
+            logits ([paddle.Tensor]): [B, Tmax, D]
+            ys_pad ([paddle.Tensor]): [B, Tmax]
+            hlens ([paddle.Tensor]): [B]
+            ys_lens ([paddle.Tensor]): [B]
+
+        Returns:
+            [paddle.Tensor]: scalar. If reduction is 'none', the shape is
+            (N,), where N is the batch size.
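+
+        An illustrative call (tensor names and shapes are hypothetical):
+
+            ctc = CTCLoss(blank=0, reduction='sum', batch_average=True)
+            loss = ctc(logits, ys_pad, hlens, ys_lens)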
+ """ + B = paddle.shape(logits)[0] + # warp-ctc need logits, and do softmax on logits by itself + # warp-ctc need activation with shape [T, B, V + 1] + # logits: (B, L, D) -> (L, B, D) + logits = logits.transpose([1, 0, 2]) + # (TODO:Hui Zhang) ctc loss does not support int64 labels + ys_pad = ys_pad.astype(paddle.int32) + loss = self.loss( + logits, ys_pad, hlens, ys_lens, norm_by_times=self.batch_average) + if self.batch_average: + # Batch-size average + loss = loss / B + return loss + + +class LabelSmoothingLoss(nn.Layer): + """Label-smoothing loss. + In a standard CE loss, the label's data distribution is: + [0,1,2] -> + [ + [1.0, 0.0, 0.0], + [0.0, 1.0, 0.0], + [0.0, 0.0, 1.0], + ] + In the smoothing version CE Loss,some probabilities + are taken from the true label prob (1.0) and are divided + among other labels. + e.g. + smoothing=0.1 + [0,1,2] -> + [ + [0.9, 0.05, 0.05], + [0.05, 0.9, 0.05], + [0.05, 0.05, 0.9], + ] + + """ + + def __init__(self, + size: int, + padding_idx: int, + smoothing: float, + normalize_length: bool=False): + """Label-smoothing loss. + + Args: + size (int): the number of class + padding_idx (int): padding class id which will be ignored for loss + smoothing (float): smoothing rate (0.0 means the conventional CE) + normalize_length (bool): + True, normalize loss by sequence length; + False, normalize loss by batch size. + Defaults to False. + """ + super().__init__() + self.size = size + self.padding_idx = padding_idx + self.smoothing = smoothing + self.confidence = 1.0 - smoothing + self.normalize_length = normalize_length + self.criterion = nn.KLDivLoss(reduction="none") + + def forward(self, x: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor: + """Compute loss between x and target. + The model outputs and data labels tensors are flatten to + (batch*seqlen, class) shape and a mask is applied to the + padding part which should not be calculated for loss. + + Args: + x (paddle.Tensor): prediction (batch, seqlen, class) + target (paddle.Tensor): + target signal masked with self.padding_id (batch, seqlen) + Returns: + loss (paddle.Tensor) : The KL loss, scalar float value + """ + B, T, D = paddle.shape(x) + assert D == self.size + x = x.reshape((-1, self.size)) + target = target.reshape([-1]) + + # use zeros_like instead of torch.no_grad() for true_dist, + # since no_grad() can not be exported by JIT + true_dist = paddle.full_like(x, self.smoothing / (self.size - 1)) + ignore = target == self.padding_idx # (B,) + + # target = target * (1 - ignore) # avoid -1 index + target = target.masked_fill(ignore, 0) # avoid -1 index + # true_dist.scatter_(1, target.unsqueeze(1), self.confidence) + target_mask = F.one_hot(target, self.size) + true_dist *= (1 - target_mask) + true_dist += target_mask * self.confidence + + kl = self.criterion(F.log_softmax(x, axis=1), true_dist) + + #TODO(Hui Zhang): sum not support bool type + #total = len(target) - int(ignore.sum()) + total = len(target) - int(ignore.type_as(target).sum()) + denom = total if self.normalize_length else B + #numer = (kl * (1 - ignore)).sum() + numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum() + return numer / denom diff --git a/examples/transv1.8to2.x/deepspeech/modules/mask.py b/examples/transv1.8to2.x/deepspeech/modules/mask.py new file mode 100644 index 00000000..05e86eb3 --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/modules/mask.py @@ -0,0 +1,260 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. 
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = [
+    "make_pad_mask", "make_non_pad_mask", "subsequent_mask",
+    "subsequent_chunk_mask", "add_optional_chunk_mask", "mask_finished_scores",
+    "mask_finished_preds"
+]
+
+
+def make_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
+    """Make mask tensor containing indices of padded part.
+    See description of make_non_pad_mask.
+    Args:
+        lengths (paddle.Tensor): Batch of lengths (B,).
+    Returns:
+        paddle.Tensor: Mask tensor containing indices of padded part.
+    Examples:
+        >>> lengths = [5, 3, 2]
+        >>> make_pad_mask(lengths)
+        masks = [[0, 0, 0, 0, 0],
+                 [0, 0, 0, 1, 1],
+                 [0, 0, 1, 1, 1]]
+    """
+    # (TODO: Hui Zhang): jit not support Tensor.dim() and Tensor.ndim
+    # assert lengths.dim() == 1
+    batch_size = int(lengths.shape[0])
+    max_len = int(lengths.max())
+    seq_range = paddle.arange(0, max_len, dtype=paddle.int64)
+    seq_range_expand = seq_range.unsqueeze(0).expand([batch_size, max_len])
+    seq_length_expand = lengths.unsqueeze(-1)
+    mask = seq_range_expand >= seq_length_expand
+    return mask
+
+
+def make_non_pad_mask(lengths: paddle.Tensor) -> paddle.Tensor:
+    """Make mask tensor containing indices of the non-padded part.
+    The sequences in a batch may have different lengths. To enable batch
+    computing, padding is needed to make all sequences the same size. To
+    keep the padded steps from passing values into context-dependent blocks
+    such as attention or convolution, the padded part is masked.
+    This pad_mask is used in both the encoder and the decoder.
+    1 for the non-padded part and 0 for the padded part.
+    Args:
+        lengths (paddle.Tensor): Batch of lengths (B,).
+    Returns:
+        paddle.Tensor: mask tensor containing indices of the non-padded part.
+    Examples:
+        >>> lengths = [5, 3, 2]
+        >>> make_non_pad_mask(lengths)
+        masks = [[1, 1, 1, 1, 1],
+                 [1, 1, 1, 0, 0],
+                 [1, 1, 0, 0, 0]]
+    """
+    #TODO(Hui Zhang): return ~make_pad_mask(lengths), not support ~
+    return make_pad_mask(lengths).logical_not()
+
+
+def subsequent_mask(size: int) -> paddle.Tensor:
+    """Create mask for subsequent steps (size, size).
+    This mask is used only in the decoder, which works in an auto-regressive
+    mode. This means the current step may only attend to its left steps.
+    The encoder uses full attention when streaming is not necessary and the
+    sequence is not long. In that case, no attention mask is needed.
+    When streaming is needed, chunk-based attention is used in the encoder.
+    See subsequent_chunk_mask for the chunk-based attention mask.
+ Args: + size (int): size of mask + Returns: + paddle.Tensor: mask, [size, size] + Examples: + >>> subsequent_mask(3) + [[1, 0, 0], + [1, 1, 0], + [1, 1, 1]] + """ + ret = paddle.ones([size, size], dtype=paddle.bool) + #TODO(Hui Zhang): tril not support bool + #return paddle.tril(ret) + ret = ret.astype(paddle.float) + ret = paddle.tril(ret) + ret = ret.astype(paddle.bool) + return ret + + +def subsequent_chunk_mask( + size: int, + chunk_size: int, + num_left_chunks: int=-1, ) -> paddle.Tensor: + """Create mask for subsequent steps (size, size) with chunk size, + this is for streaming encoder + Args: + size (int): size of mask + chunk_size (int): size of chunk + num_left_chunks (int): number of left chunks + <0: use full chunk + >=0: use num_left_chunks + Returns: + paddle.Tensor: mask, [size, size] + Examples: + >>> subsequent_chunk_mask(4, 2) + [[1, 1, 0, 0], + [1, 1, 0, 0], + [1, 1, 1, 1], + [1, 1, 1, 1]] + """ + ret = paddle.zeros([size, size], dtype=paddle.bool) + for i in range(size): + if num_left_chunks < 0: + start = 0 + else: + start = max(0, (i // chunk_size - num_left_chunks) * chunk_size) + ending = min(size, (i // chunk_size + 1) * chunk_size) + ret[i, start:ending] = True + return ret + + +def add_optional_chunk_mask(xs: paddle.Tensor, + masks: paddle.Tensor, + use_dynamic_chunk: bool, + use_dynamic_left_chunk: bool, + decoding_chunk_size: int, + static_chunk_size: int, + num_decoding_left_chunks: int): + """ Apply optional mask for encoder. + Args: + xs (paddle.Tensor): padded input, (B, L, D), L for max length + mask (paddle.Tensor): mask for xs, (B, 1, L) + use_dynamic_chunk (bool): whether to use dynamic chunk or not + use_dynamic_left_chunk (bool): whether to use dynamic left chunk for + training. + decoding_chunk_size (int): decoding chunk size for dynamic chunk, it's + 0: default for training, use random dynamic chunk. + <0: for decoding, use full chunk. + >0: for decoding, use fixed chunk size as set. + static_chunk_size (int): chunk size for static chunk training/decoding + if it's greater than 0, if use_dynamic_chunk is true, + this parameter will be ignored + num_decoding_left_chunks (int): number of left chunks, this is for decoding, + the chunk size is decoding_chunk_size. + >=0: use num_decoding_left_chunks + <0: use all left chunks + Returns: + paddle.Tensor: chunk mask of the input xs. + """ + # Whether to use chunk mask or not + if use_dynamic_chunk: + max_len = xs.shape[1] + if decoding_chunk_size < 0: + chunk_size = max_len + num_left_chunks = -1 + elif decoding_chunk_size > 0: + chunk_size = decoding_chunk_size + num_left_chunks = num_decoding_left_chunks + else: + # chunk size is either [1, 25] or full context(max_len). + # Since we use 4 times subsampling and allow up to 1s(100 frames) + # delay, the maximum frame is 100 / 4 = 25. 
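+            # Assuming paddle.randint draws uniformly from [1, max_len),
+            # roughly half of the draws exceed max_len // 2 and select full
+            # context; the rest are folded into [1, 25] by the modulo below.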
+            chunk_size = int(paddle.randint(1, max_len, (1, )))
+            num_left_chunks = -1
+            if chunk_size > max_len // 2:
+                chunk_size = max_len
+            else:
+                chunk_size = chunk_size % 25 + 1
+                if use_dynamic_left_chunk:
+                    max_left_chunks = (max_len - 1) // chunk_size
+                    num_left_chunks = int(
+                        paddle.randint(0, max_left_chunks, (1, )))
+        chunk_masks = subsequent_chunk_mask(xs.shape[1], chunk_size,
+                                            num_left_chunks)  # (L, L)
+        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
+        # chunk_masks = masks & chunk_masks  # (B, L, L)
+        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+    elif static_chunk_size > 0:
+        num_left_chunks = num_decoding_left_chunks
+        chunk_masks = subsequent_chunk_mask(xs.shape[1], static_chunk_size,
+                                            num_left_chunks)  # (L, L)
+        chunk_masks = chunk_masks.unsqueeze(0)  # (1, L, L)
+        # chunk_masks = masks & chunk_masks  # (B, L, L)
+        chunk_masks = masks.logical_and(chunk_masks)  # (B, L, L)
+    else:
+        chunk_masks = masks
+    return chunk_masks
+
+
+def mask_finished_scores(score: paddle.Tensor,
+                         flag: paddle.Tensor) -> paddle.Tensor:
+    """
+    If a sequence is finished, we only allow one alive branch. This function
+    aims to give one branch a zero score and the rest -inf score.
+    Args:
+        score (paddle.Tensor): A real value array with shape
+            (batch_size * beam_size, beam_size).
+        flag (paddle.Tensor): A bool array with shape
+            (batch_size * beam_size, 1).
+    Returns:
+        paddle.Tensor: (batch_size * beam_size, beam_size).
+    Examples:
+        flag: tensor([[ True],
+                      [False]])
+        score: tensor([[-0.3666, -0.6664,  0.6019],
+                       [-1.1490, -0.2948,  0.7460]])
+        unfinished: tensor([[False,  True,  True],
+                            [False, False, False]])
+        finished: tensor([[ True, False, False],
+                          [False, False, False]])
+        return: tensor([[ 0.0000,    -inf,    -inf],
+                        [-1.1490, -0.2948,  0.7460]])
+    """
+    beam_size = score.shape[-1]
+    zero_mask = paddle.zeros_like(flag, dtype=paddle.bool)
+    if beam_size > 1:
+        unfinished = paddle.concat(
+            (zero_mask, flag.tile([1, beam_size - 1])), axis=1)
+        finished = paddle.concat(
+            (flag, zero_mask.tile([1, beam_size - 1])), axis=1)
+    else:
+        unfinished = zero_mask
+        finished = flag
+
+    # infs = paddle.ones_like(score) * -float('inf')
+    # score = paddle.where(unfinished, infs, score)
+    # score = paddle.where(finished, paddle.zeros_like(score), score)
+    score.masked_fill_(unfinished, -float('inf'))
+    score.masked_fill_(finished, 0)
+    return score
+
+
+def mask_finished_preds(pred: paddle.Tensor, flag: paddle.Tensor,
+                        eos: int) -> paddle.Tensor:
+    """
+    If a sequence is finished, all of its branches should be set to <eos>.
+    Args:
+        pred (paddle.Tensor): An int array with shape
+            (batch_size * beam_size, beam_size).
+        flag (paddle.Tensor): A bool array with shape
+            (batch_size * beam_size, 1).
+        eos (int): end-of-sentence token id.
+    Returns:
+        paddle.Tensor: (batch_size * beam_size).
+    """
+    beam_size = pred.shape[-1]
+    finished = flag.repeat(1, beam_size)
+    return pred.masked_fill_(finished, eos)
diff --git a/examples/transv1.8to2.x/deepspeech/utils/__init__.py b/examples/transv1.8to2.x/deepspeech/utils/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/transv1.8to2.x/deepspeech/utils/bleu_score.py b/examples/transv1.8to2.x/deepspeech/utils/bleu_score.py
new file mode 100644
index 00000000..09646133
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/bleu_score.py
@@ -0,0 +1,54 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This module provides functions to calculate BLEU scores at different
+levels, e.g. bleu for word level and char_bleu for char level.
+"""
+import sacrebleu
+
+__all__ = ['bleu', 'char_bleu']
+
+
+def bleu(hypothesis, reference):
+    """Calculate BLEU. BLEU compares reference text and
+    hypothesis text at word level using sacrebleu.
+
+    :param reference: The reference sentences.
+    :type reference: list[list[str]]
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: list[str]
+    :raises ValueError: If the reference length is zero.
+    """
+    return sacrebleu.corpus_bleu(hypothesis, reference)
+
+
+def char_bleu(hypothesis, reference):
+    """Calculate BLEU. BLEU compares reference text and
+    hypothesis text at char level using sacrebleu.
+
+    :param reference: The reference sentences.
+    :type reference: list[list[str]]
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: list[str]
+    :raises ValueError: If the reference number is zero.
+    """
+    hypothesis = [' '.join(list(hyp.replace(' ', ''))) for hyp in hypothesis]
+    reference = [[' '.join(list(ref_i.replace(' ', ''))) for ref_i in ref]
+                 for ref in reference]
+    return sacrebleu.corpus_bleu(hypothesis, reference)
diff --git a/examples/transv1.8to2.x/deepspeech/utils/checkpoint.py b/examples/transv1.8to2.x/deepspeech/utils/checkpoint.py
new file mode 100644
index 00000000..a59f8be7
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/checkpoint.py
@@ -0,0 +1,298 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
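+"""Checkpoint utilities: keep the k-best checkpoints by a metric and the
+latest-n checkpoints, each with a json sidecar file for training meta info."""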
+import glob
+import json
+import os
+import re
+from pathlib import Path
+from typing import Mapping
+from typing import Text
+from typing import Union
+
+import paddle
+from paddle import distributed as dist
+from paddle.optimizer import Optimizer
+
+from deepspeech.utils import mp_tools
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ["Checkpoint"]
+
+
+class Checkpoint():
+    def __init__(self, kbest_n: int=5, latest_n: int=1):
+        self.best_records: Mapping[Path, float] = {}
+        self.latest_records = []
+        self.kbest_n = kbest_n
+        self.latest_n = latest_n
+        self._save_all = (kbest_n == -1)
+
+    def add_checkpoint(self,
+                       checkpoint_dir,
+                       tag_or_iteration: Union[int, Text],
+                       model: paddle.nn.Layer,
+                       optimizer: Optimizer=None,
+                       infos: dict=None,
+                       metric_type="val_loss"):
+        """Save checkpoint in best_n and latest_n.
+
+        Args:
+            checkpoint_dir (str): the directory where checkpoint is saved.
+            tag_or_iteration (int or str): the latest iteration(step or epoch) number or tag.
+            model (Layer): model to be checkpointed.
+            optimizer (Optimizer, optional): optimizer to be checkpointed.
+            infos (dict or None): any info you want to save.
+            metric_type (str, optional): metric type. Defaults to "val_loss".
+        """
+        if (metric_type not in infos.keys()):
+            self._save_parameters(checkpoint_dir, tag_or_iteration, model,
+                                  optimizer, infos)
+            return
+
+        #save best
+        if self._should_save_best(infos[metric_type]):
+            self._save_best_checkpoint_and_update(
+                infos[metric_type], checkpoint_dir, tag_or_iteration, model,
+                optimizer, infos)
+        #save latest
+        self._save_latest_checkpoint_and_update(
+            checkpoint_dir, tag_or_iteration, model, optimizer, infos)
+
+        if isinstance(tag_or_iteration, int):
+            self._save_checkpoint_record(checkpoint_dir, tag_or_iteration)
+
+    def load_parameters(self,
+                        model,
+                        optimizer=None,
+                        checkpoint_dir=None,
+                        checkpoint_path=None,
+                        record_file="checkpoint_latest"):
+        """Load a model checkpoint from disk.
+        Args:
+            model (Layer): model to load parameters.
+            optimizer (Optimizer, optional): optimizer to load states if needed.
+                Defaults to None.
+            checkpoint_dir (str, optional): the directory where checkpoint is saved.
+            checkpoint_path (str, optional): if specified, load the checkpoint
+                stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
+                be ignored. Defaults to None.
+            record_file (str): "checkpoint_latest" or "checkpoint_best"
+        Returns:
+            configs (dict): epoch or step, lr and other meta info should be saved.
+        """
+        configs = {}
+
+        if checkpoint_path is not None:
+            pass
+        elif checkpoint_dir is not None and record_file is not None:
+            # load checkpoint from record file
+            checkpoint_record = os.path.join(checkpoint_dir, record_file)
+            iteration = self._load_checkpoint_idx(checkpoint_record)
+            if iteration == -1:
+                return configs
+            checkpoint_path = os.path.join(checkpoint_dir,
+                                           "{}".format(iteration))
+        else:
+            raise ValueError(
+                "At least one of 'checkpoint_path' or 'checkpoint_dir' should be specified!"
+            )
+
+        rank = dist.get_rank()
+
+        params_path = checkpoint_path + ".pdparams"
+        model_dict = paddle.load(params_path)
+        model.set_state_dict(model_dict)
+        logger.info("Rank {}: loaded model from {}".format(rank, params_path))
+
+        optimizer_path = checkpoint_path + ".pdopt"
+        if optimizer and os.path.isfile(optimizer_path):
+            optimizer_dict = paddle.load(optimizer_path)
+            optimizer.set_state_dict(optimizer_dict)
+            logger.info("Rank {}: loaded optimizer state from {}".format(
+                rank, optimizer_path))
+
+        info_path = re.sub('.pdparams$', '.json', params_path)
+        if os.path.exists(info_path):
+            with open(info_path, 'r') as fin:
+                configs = json.load(fin)
+        return configs
+
+    def load_latest_parameters(self,
+                               model,
+                               optimizer=None,
+                               checkpoint_dir=None,
+                               checkpoint_path=None):
+        """Load the latest model checkpoint from disk.
+        Args:
+            model (Layer): model to load parameters.
+            optimizer (Optimizer, optional): optimizer to load states if needed.
+                Defaults to None.
+            checkpoint_dir (str, optional): the directory where checkpoint is saved.
+            checkpoint_path (str, optional): if specified, load the checkpoint
+                stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
+                be ignored. Defaults to None.
+        Returns:
+            configs (dict): epoch or step, lr and other meta info should be saved.
+        """
+        return self.load_parameters(model, optimizer, checkpoint_dir,
+                                    checkpoint_path, "checkpoint_latest")
+
+    def load_best_parameters(self,
+                             model,
+                             optimizer=None,
+                             checkpoint_dir=None,
+                             checkpoint_path=None):
+        """Load the best model checkpoint from disk.
+        Args:
+            model (Layer): model to load parameters.
+            optimizer (Optimizer, optional): optimizer to load states if needed.
+                Defaults to None.
+            checkpoint_dir (str, optional): the directory where checkpoint is saved.
+            checkpoint_path (str, optional): if specified, load the checkpoint
+                stored in the checkpoint_path(prefix) and the argument 'checkpoint_dir' will
+                be ignored. Defaults to None.
+        Returns:
+            configs (dict): epoch or step, lr and other meta info should be saved.
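+
+        An illustrative call (the directory path is hypothetical):
+
+            ckpt = Checkpoint(kbest_n=5, latest_n=1)
+            infos = ckpt.load_best_parameters(
+                model, optimizer, checkpoint_dir='exp/checkpoints')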
+ """ + return self.load_parameters(model, optimizer, checkpoint_dir, + checkpoint_path, "checkpoint_best") + + def _should_save_best(self, metric: float) -> bool: + if not self._best_full(): + return True + + # already full + worst_record_path = max(self.best_records, key=self.best_records.get) + # worst_record_path = max(self.best_records.iteritems(), key=operator.itemgetter(1))[0] + worst_metric = self.best_records[worst_record_path] + return metric < worst_metric + + def _best_full(self): + return (not self._save_all) and len(self.best_records) == self.kbest_n + + def _latest_full(self): + return len(self.latest_records) == self.latest_n + + def _save_best_checkpoint_and_update(self, metric, checkpoint_dir, + tag_or_iteration, model, optimizer, + infos): + # remove the worst + if self._best_full(): + worst_record_path = max(self.best_records, + key=self.best_records.get) + self.best_records.pop(worst_record_path) + if (worst_record_path not in self.latest_records): + logger.info( + "remove the worst checkpoint: {}".format(worst_record_path)) + self._del_checkpoint(checkpoint_dir, worst_record_path) + + # add the new one + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) + self.best_records[tag_or_iteration] = metric + + def _save_latest_checkpoint_and_update( + self, checkpoint_dir, tag_or_iteration, model, optimizer, infos): + # remove the old + if self._latest_full(): + to_del_fn = self.latest_records.pop(0) + if (to_del_fn not in self.best_records.keys()): + logger.info( + "remove the latest checkpoint: {}".format(to_del_fn)) + self._del_checkpoint(checkpoint_dir, to_del_fn) + self.latest_records.append(tag_or_iteration) + + self._save_parameters(checkpoint_dir, tag_or_iteration, model, + optimizer, infos) + + def _del_checkpoint(self, checkpoint_dir, tag_or_iteration): + checkpoint_path = os.path.join(checkpoint_dir, + "{}".format(tag_or_iteration)) + for filename in glob.glob(checkpoint_path + ".*"): + os.remove(filename) + logger.info("delete file: {}".format(filename)) + + def _load_checkpoint_idx(self, checkpoint_record: str) -> int: + """Get the iteration number corresponding to the latest saved checkpoint. + Args: + checkpoint_path (str): the saved path of checkpoint. + Returns: + int: the latest iteration number. -1 for no checkpoint to load. + """ + if not os.path.isfile(checkpoint_record): + return -1 + + # Fetch the latest checkpoint index. + with open(checkpoint_record, "rt") as handle: + latest_checkpoint = handle.readlines()[-1].strip() + iteration = int(latest_checkpoint.split(":")[-1]) + return iteration + + def _save_checkpoint_record(self, checkpoint_dir: str, iteration: int): + """Save the iteration number of the latest model to be checkpoint record. + Args: + checkpoint_dir (str): the directory where checkpoint is saved. + iteration (int): the latest iteration number. 
+        Returns:
+            None
+        """
+        checkpoint_record_latest = os.path.join(checkpoint_dir,
+                                                "checkpoint_latest")
+        checkpoint_record_best = os.path.join(checkpoint_dir, "checkpoint_best")
+
+        with open(checkpoint_record_best, "w") as handle:
+            for i in self.best_records.keys():
+                handle.write("model_checkpoint_path:{}\n".format(i))
+        with open(checkpoint_record_latest, "w") as handle:
+            for i in self.latest_records:
+                handle.write("model_checkpoint_path:{}\n".format(i))
+
+    @mp_tools.rank_zero_only
+    def _save_parameters(self,
+                         checkpoint_dir: str,
+                         tag_or_iteration: Union[int, str],
+                         model: paddle.nn.Layer,
+                         optimizer: Optimizer=None,
+                         infos: dict=None):
+        """Checkpoint the latest trained model parameters.
+        Args:
+            checkpoint_dir (str): the directory where checkpoint is saved.
+            tag_or_iteration (int or str): the latest iteration(step or epoch) number.
+            model (Layer): model to be checkpointed.
+            optimizer (Optimizer, optional): optimizer to be checkpointed.
+                Defaults to None.
+            infos (dict or None): any info you want to save.
+        Returns:
+            None
+        """
+        checkpoint_path = os.path.join(checkpoint_dir,
+                                       "{}".format(tag_or_iteration))
+
+        model_dict = model.state_dict()
+        params_path = checkpoint_path + ".pdparams"
+        paddle.save(model_dict, params_path)
+        logger.info("Saved model to {}".format(params_path))
+
+        if optimizer:
+            opt_dict = optimizer.state_dict()
+            optimizer_path = checkpoint_path + ".pdopt"
+            paddle.save(opt_dict, optimizer_path)
+            logger.info("Saved optimizer state to {}".format(optimizer_path))
+
+        info_path = re.sub('.pdparams$', '.json', params_path)
+        infos = {} if infos is None else infos
+        with open(info_path, 'w') as fout:
+            data = json.dumps(infos)
+            fout.write(data)
diff --git a/examples/transv1.8to2.x/deepspeech/utils/ctc_utils.py b/examples/transv1.8to2.x/deepspeech/utils/ctc_utils.py
new file mode 100644
index 00000000..09543d48
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/ctc_utils.py
@@ -0,0 +1,134 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+
+from deepspeech.utils.log import Log
+
+logger = Log(__name__).getlog()
+
+__all__ = ["forced_align", "remove_duplicates_and_blank", "insert_blank"]
+
+
+def remove_duplicates_and_blank(hyp: List[int], blank_id=0) -> List[int]:
+    """ctc alignment to ctc label ids.
+
+    "abaa-acee-" -> "abaace"
+
+    Args:
+        hyp (List[int]): hypotheses ids, (L)
+        blank_id (int, optional): blank id. Defaults to 0.
+
+    Returns:
+        List[int]: ids with duplicates removed, then blank ids removed.
+    """
+    new_hyp: List[int] = []
+    cur = 0
+    while cur < len(hyp):
+        # add non-blank into new_hyp
+        if hyp[cur] != blank_id:
+            new_hyp.append(hyp[cur])
+        # skip repeat label
+        prev = cur
+        while cur < len(hyp) and hyp[cur] == hyp[prev]:
+            cur += 1
+    return new_hyp
+
+
+def insert_blank(label: np.ndarray, blank_id: int=0) -> np.ndarray:
+    """Insert a blank token between every two label tokens.
+ + "abcdefg" -> "-a-b-c-d-e-f-g-" + + Args: + label ([np.ndarray]): label ids, List[int], (L). + blank_id (int, optional): blank id. Defaults to 0. + + Returns: + [np.ndarray]: (2L+1). + """ + label = np.expand_dims(label, 1) #[L, 1] + blanks = np.zeros((label.shape[0], 1), dtype=np.int64) + blank_id + label = np.concatenate([blanks, label], axis=1) #[L, 2] + label = label.reshape(-1) #[2L], -l-l-l + label = np.append(label, label[0]) #[2L + 1], -l-l-l- + return label + + +def forced_align(ctc_probs: paddle.Tensor, y: paddle.Tensor, + blank_id=0) -> List[int]: + """ctc forced alignment. + + https://distill.pub/2017/ctc/ + + Args: + ctc_probs (paddle.Tensor): hidden state sequence, 2d tensor (T, D) + y (paddle.Tensor): label id sequence tensor, 1d tensor (L) + blank_id (int): blank symbol index + Returns: + List[int]: best alignment result, (T). + """ + y_insert_blank = insert_blank(y, blank_id) #(2L+1) + + log_alpha = paddle.zeros( + (ctc_probs.size(0), len(y_insert_blank))) #(T, 2L+1) + log_alpha = log_alpha - float('inf') # log of zero + # TODO(Hui Zhang): zeros not support paddle.int16 + state_path = (paddle.zeros( + (ctc_probs.size(0), len(y_insert_blank)), dtype=paddle.int32) - 1 + ) # state path, Tuple((T, 2L+1)) + + # init start state + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[0, 0] = ctc_probs[0][int(y_insert_blank[0])] # State-b, Sb + log_alpha[0, 1] = ctc_probs[0][int(y_insert_blank[1])] # State-nb, Snb + + for t in range(1, ctc_probs.size(0)): # T + for s in range(len(y_insert_blank)): # 2L+1 + if y_insert_blank[s] == blank_id or s < 2 or y_insert_blank[ + s] == y_insert_blank[s - 2]: + candidates = paddle.to_tensor( + [log_alpha[t - 1, s], log_alpha[t - 1, s - 1]]) + prev_state = [s, s - 1] + else: + candidates = paddle.to_tensor([ + log_alpha[t - 1, s], + log_alpha[t - 1, s - 1], + log_alpha[t - 1, s - 2], + ]) + prev_state = [s, s - 1, s - 2] + # TODO(Hui Zhang): VarBase.__getitem__() not support np.int64 + log_alpha[t, s] = paddle.max(candidates) + ctc_probs[t][int( + y_insert_blank[s])] + state_path[t, s] = prev_state[paddle.argmax(candidates)] + + # TODO(Hui Zhang): zeros not support paddle.int16 + state_seq = -1 * paddle.ones((ctc_probs.size(0), 1), dtype=paddle.int32) + + candidates = paddle.to_tensor([ + log_alpha[-1, len(y_insert_blank) - 1], # Sb + log_alpha[-1, len(y_insert_blank) - 2] # Snb + ]) + prev_state = [len(y_insert_blank) - 1, len(y_insert_blank) - 2] + state_seq[-1] = prev_state[paddle.argmax(candidates)] + for t in range(ctc_probs.size(0) - 2, -1, -1): + state_seq[t] = state_path[t + 1, state_seq[t + 1, 0]] + + output_alignment = [] + for t in range(0, ctc_probs.size(0)): + output_alignment.append(y_insert_blank[state_seq[t, 0]]) + + return output_alignment diff --git a/examples/transv1.8to2.x/deepspeech/utils/dynamic_import.py b/examples/transv1.8to2.x/deepspeech/utils/dynamic_import.py new file mode 100644 index 00000000..533f15ee --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/utils/dynamic_import.py @@ -0,0 +1,67 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import importlib
+import inspect
+from typing import Any
+from typing import Dict
+from typing import List
+from typing import Text
+
+from deepspeech.utils.log import Log
+from deepspeech.utils.tensor_utils import has_tensor
+
+logger = Log(__name__).getlog()
+
+__all__ = ["dynamic_import", "instance_class"]
+
+
+def dynamic_import(import_path, alias=dict()):
+    """dynamic import module and class
+
+    :param str import_path: syntax 'module_name:class_name'
+        e.g., 'deepspeech.models.u2:U2Model'
+    :param dict alias: shortcut for registered class
+    :return: imported class
+    """
+    if import_path not in alias and ":" not in import_path:
+        raise ValueError("import_path should be one of {} or "
+                         'include ":", e.g. "deepspeech.models.u2:U2Model" : '
+                         "{}".format(set(alias), import_path))
+    if ":" not in import_path:
+        import_path = alias[import_path]
+
+    module_name, objname = import_path.split(":")
+    m = importlib.import_module(module_name)
+    return getattr(m, objname)
+
+
+def filter_valid_args(args: Dict[Text, Any], valid_keys: List[Text]):
+    # keep only keys in `valid_keys` whose values are not None
+    new_args = {
+        key: val
+        for key, val in args.items() if (key in valid_keys and val is not None)
+    }
+    return new_args
+
+
+def filter_out_tensor(args: Dict[Text, Any]):
+    return {key: val for key, val in args.items() if not has_tensor(val)}
+
+
+def instance_class(module_class, args: Dict[Text, Any]):
+    valid_keys = inspect.signature(module_class).parameters.keys()
+    new_args = filter_valid_args(args, valid_keys)
+    logger.info(
+        f"Instance: {module_class.__name__} {filter_out_tensor(new_args)}.")
+    return module_class(**new_args)
diff --git a/examples/transv1.8to2.x/deepspeech/utils/error_rate.py b/examples/transv1.8to2.x/deepspeech/utils/error_rate.py
new file mode 100644
index 00000000..b6399bab
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/error_rate.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This module provides functions to calculate error rates at different
+levels, e.g. wer for word level, cer for char level.
+"""
+import numpy as np
+
+__all__ = ['word_errors', 'char_errors', 'wer', 'cer']
+
+
+def _levenshtein_distance(ref, hyp):
+    """Levenshtein distance is a string metric for measuring the difference
+    between two sequences. Informally, the Levenshtein distance is defined as
+    the minimum number of single-character edits (substitutions, insertions or
+    deletions) required to change one word into the other.
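+    The implementation below keeps only two rows of the standard dynamic
+    programming table (O(min(m, n)) space), with the recurrence
+    d[i][j] = d[i-1][j-1] if ref[i-1] == hyp[j-1], else
+    1 + min(d[i-1][j-1], d[i][j-1], d[i-1][j]).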
diff --git a/examples/transv1.8to2.x/deepspeech/utils/error_rate.py b/examples/transv1.8to2.x/deepspeech/utils/error_rate.py
new file mode 100644
index 00000000..b6399bab
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/error_rate.py
@@ -0,0 +1,206 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""This module provides functions to calculate error rates at different
+levels, e.g. WER at the word level and CER at the character level.
+"""
+import numpy as np
+
+__all__ = ['word_errors', 'char_errors', 'wer', 'cer']
+
+
+def _levenshtein_distance(ref, hyp):
+    """Levenshtein distance is a string metric for measuring the difference
+    between two sequences. Informally, the Levenshtein distance is defined as
+    the minimum number of single-character edits (substitutions, insertions or
+    deletions) required to change one word into the other. We can naturally
+    extend the edits to the word level when calculating the Levenshtein
+    distance for two sentences.
+    """
+    m = len(ref)
+    n = len(hyp)
+
+    # special cases
+    if ref == hyp:
+        return 0
+    if m == 0:
+        return n
+    if n == 0:
+        return m
+
+    if m < n:
+        ref, hyp = hyp, ref
+        m, n = n, m
+
+    # use O(min(m, n)) space
+    distance = np.zeros((2, n + 1), dtype=np.int32)
+
+    # initialize distance matrix
+    for j in range(n + 1):
+        distance[0][j] = j
+
+    # calculate levenshtein distance
+    for i in range(1, m + 1):
+        prev_row_idx = (i - 1) % 2
+        cur_row_idx = i % 2
+        distance[cur_row_idx][0] = i
+        for j in range(1, n + 1):
+            if ref[i - 1] == hyp[j - 1]:
+                distance[cur_row_idx][j] = distance[prev_row_idx][j - 1]
+            else:
+                s_num = distance[prev_row_idx][j - 1] + 1
+                i_num = distance[cur_row_idx][j - 1] + 1
+                d_num = distance[prev_row_idx][j] + 1
+                distance[cur_row_idx][j] = min(s_num, i_num, d_num)
+
+    return distance[m % 2][n]
+
+
+def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '):
+    """Compute the Levenshtein distance between reference sequence and
+    hypothesis sequence at the word level.
+
+    :param reference: The reference sentence.
+    :type reference: str
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: str
+    :param ignore_case: Whether to ignore case.
+    :type ignore_case: bool
+    :param delimiter: Delimiter of input sentences.
+    :type delimiter: char
+    :return: Levenshtein distance and word number of reference sentence.
+    :rtype: tuple
+    """
+    if ignore_case:
+        reference = reference.lower()
+        hypothesis = hypothesis.lower()
+
+    ref_words = list(filter(None, reference.split(delimiter)))
+    hyp_words = list(filter(None, hypothesis.split(delimiter)))
+
+    edit_distance = _levenshtein_distance(ref_words, hyp_words)
+    return float(edit_distance), len(ref_words)
+
+
+def char_errors(reference, hypothesis, ignore_case=False, remove_space=False):
+    """Compute the Levenshtein distance between reference sequence and
+    hypothesis sequence at the character level.
+
+    :param reference: The reference sentence.
+    :type reference: str
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: str
+    :param ignore_case: Whether to ignore case.
+    :type ignore_case: bool
+    :param remove_space: Whether to remove internal space characters.
+    :type remove_space: bool
+    :return: Levenshtein distance and length of reference sentence.
+    :rtype: tuple
+    """
+    if ignore_case:
+        reference = reference.lower()
+        hypothesis = hypothesis.lower()
+
+    join_char = ' '
+    if remove_space:
+        join_char = ''
+
+    reference = join_char.join(list(filter(None, reference.split(' '))))
+    hypothesis = join_char.join(list(filter(None, hypothesis.split(' '))))
+
+    edit_distance = _levenshtein_distance(reference, hypothesis)
+    return float(edit_distance), len(reference)
+
+
+def wer(reference, hypothesis, ignore_case=False, delimiter=' '):
+    """Calculate word error rate (WER). WER compares reference text and
+    hypothesis text at the word level. WER is defined as:
+
+    .. math::
+        WER = (Sw + Dw + Iw) / Nw
+
+    where
+
+    .. code-block:: text
+
+        Sw is the number of words substituted,
+        Dw is the number of words deleted,
+        Iw is the number of words inserted,
+        Nw is the number of words in the reference
+
+    WER is computed via the Levenshtein distance. Note that empty items are
+    removed when splitting sentences by the delimiter.
+
+    :param reference: The reference sentence.
+    :type reference: str
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: str
+    :param ignore_case: Whether to ignore case.
+    :type ignore_case: bool
+    :param delimiter: Delimiter of input sentences.
+    :type delimiter: char
+    :return: Word error rate.
+    :rtype: float
+    :raises ValueError: If word number of reference is zero.
+    """
+    edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
+                                         delimiter)
+
+    if ref_len == 0:
+        raise ValueError("Reference's word number should be greater than 0.")
+
+    wer = float(edit_distance) / ref_len
+    return wer
+
+
+def cer(reference, hypothesis, ignore_case=False, remove_space=False):
+    """Calculate character error rate (CER). CER compares reference text and
+    hypothesis text at the character level. CER is defined as:
+
+    .. math::
+        CER = (Sc + Dc + Ic) / Nc
+
+    where
+
+    .. code-block:: text
+
+        Sc is the number of characters substituted,
+        Dc is the number of characters deleted,
+        Ic is the number of characters inserted,
+        Nc is the number of characters in the reference
+
+    CER is computed via the Levenshtein distance. Chinese input should be
+    encoded as unicode. Note that leading and trailing space characters are
+    truncated, and multiple consecutive space characters within a sentence
+    are collapsed into one.
+
+    :param reference: The reference sentence.
+    :type reference: str
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: str
+    :param ignore_case: Whether to ignore case.
+    :type ignore_case: bool
+    :param remove_space: Whether to remove internal space characters.
+    :type remove_space: bool
+    :return: Character error rate.
+    :rtype: float
+    :raises ValueError: If the reference length is zero.
+    """
+    edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
+                                         remove_space)
+
+    if ref_len == 0:
+        raise ValueError("Length of reference should be greater than 0.")
+
+    cer = float(edit_distance) / ref_len
+    return cer
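+
+
+# A quick sanity-check sketch of the metrics above (made-up strings, not
+# project data):
+#
+#     >>> wer('hello world', 'hello word')  # 1 substitution over 2 ref words
+#     0.5
+#     >>> cer('hello', 'helo')              # 1 deletion over 5 ref chars
+#     0.2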
diff --git a/examples/transv1.8to2.x/deepspeech/utils/layer_tools.py b/examples/transv1.8to2.x/deepspeech/utils/layer_tools.py
new file mode 100644
index 00000000..fb076c0c
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/layer_tools.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+from paddle import nn
+
+__all__ = [
+    "summary", "gradient_norm", "freeze", "unfreeze", "print_grads",
+    "print_params"
+]
+
+
+def summary(layer: nn.Layer, print_func=print):
+    if print_func is None:
+        return
+    num_params = num_elements = 0
+    for name, param in layer.state_dict().items():
+        print_func(
+            "{} | {} | {}".format(name, param.shape, np.prod(param.shape)))
+        num_elements += np.prod(param.shape)
+        num_params += 1
+    num_elements = num_elements / 1024**2
+    print_func(
+        f"Total parameters: {num_params}, {num_elements:.2f}M elements.")
+
+
+def print_grads(model, print_func=print):
+    if print_func is None:
+        return
+    for n, p in model.named_parameters():
+        msg = f"param grad: {n}: shape: {p.shape} grad: {p.grad}"
+        print_func(msg)
+
+
+def print_params(model, print_func=print):
+    if print_func is None:
+        return
+    total = 0.0
+    num_params = 0
+    for n, p in model.named_parameters():
+        msg = f"{n} | {p.shape} | {np.prod(p.shape)} | {not p.stop_gradient}"
+        total += np.prod(p.shape)
+        num_params += 1
+        print_func(msg)
+    total = total / 1024**2
+    print_func(f"Total parameters: {num_params}, {total:.2f}M elements.")
+
+
+def gradient_norm(layer: nn.Layer):
+    grad_norm_dict = {}
+    for name, param in layer.state_dict().items():
+        if param.trainable:
+            grad = param.gradient()  # returns a numpy.ndarray
+            grad_norm_dict[name] = np.linalg.norm(grad) / grad.size
+    return grad_norm_dict
+
+
+def recursively_remove_weight_norm(layer: nn.Layer):
+    for sublayer in layer.sublayers():
+        try:
+            nn.utils.remove_weight_norm(sublayer)
+        except ValueError:
+            # there is no weight norm hook on this sublayer
+            pass
+
+
+def freeze(layer: nn.Layer):
+    for param in layer.parameters():
+        param.trainable = False
+
+
+def unfreeze(layer: nn.Layer):
+    for param in layer.parameters():
+        param.trainable = True
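+
+
+# A minimal sketch of how these helpers are typically combined (the layer
+# below is an arbitrary stand-in, not part of this module):
+#
+#     net = nn.Linear(161, 29)
+#     summary(net)    # one line per parameter, then a parameter/element total
+#     freeze(net)     # every parameter: trainable = False
+#     unfreeze(net)   # every parameter: trainable = True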
diff --git a/examples/transv1.8to2.x/deepspeech/utils/log.py b/examples/transv1.8to2.x/deepspeech/utils/log.py
new file mode 100644
index 00000000..3fd7d248
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/log.py
@@ -0,0 +1,182 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import getpass
+import logging
+import os
+import socket
+import sys
+
+from paddle import inference
+
+FORMAT_STR = '[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
+DATE_FMT_STR = '%Y/%m/%d %H:%M:%S'
+
+logging.basicConfig(
+    level=logging.DEBUG, format=FORMAT_STR, datefmt=DATE_FMT_STR)
+
+
+def find_log_dir(log_dir=None):
+    """Returns the most suitable directory to put log files into.
+    Args:
+        log_dir: str|None, if specified, the logfile(s) will be created in
+            that directory. Otherwise if the --log_dir command-line flag is
+            provided, the logfile will be created in that directory. Otherwise
+            the logfile will be created in a standard location.
+    Raises:
+        FileNotFoundError: raised when it cannot find a log directory.
+    """
+    # Get a list of possible log dirs (will try to use them in order).
+    if log_dir:
+        # log_dir was explicitly specified as an arg, so use it and it alone.
+        dirs = [log_dir]
+    else:
+        dirs = ['/tmp/', './']
+
+    # Find the first usable log dir.
+    for d in dirs:
+        if os.path.isdir(d) and os.access(d, os.W_OK):
+            return d
+    raise FileNotFoundError(
+        "Can't find a writable directory for logs, tried %s" % dirs)
+
+
+def find_log_dir_and_names(program_name=None, log_dir=None):
+    """Computes the directory and filename prefix for log file.
+    Args:
+        program_name: str|None, the filename part of the path to the program
+            that is running, without its extension. e.g: if your program is
+            called 'usr/bin/foobar.py' this method should probably be called
+            with program_name='foobar'. However, this is just a convention;
+            you can pass in any string you want, and it will be used as part
+            of the log filename. If you don't pass in anything, the program
+            name is derived from sys.argv[0] and prefixed with 'py_' so that
+            Python code gets log files distinct from any C++ counterpart.
+        log_dir: str|None, the desired log directory.
+    Returns:
+        (log_dir, file_prefix, symlink_prefix)
+    Raises:
+        FileNotFoundError: raised in Python 3 when it cannot find a log
+            directory.
+        OSError: raised in Python 2 when it cannot find a log directory.
+    """
+    if not program_name:
+        # Strip the extension (foobar.par becomes foobar, and
+        # fubar.py becomes fubar). We do this so that the log
+        # file names are similar to C++ log file names.
+        program_name = os.path.splitext(os.path.basename(sys.argv[0]))[0]
+
+        # Prepend py_ to files so that python code gets a unique file, and
+        # so that C++ libraries do not try to write to the same log files
+        # as us.
+        program_name = 'py_%s' % program_name
+
+    actual_log_dir = find_log_dir(log_dir=log_dir)
+
+    try:
+        username = getpass.getuser()
+    except KeyError:
+        # This can happen, e.g. when running under docker w/o passwd file.
+        if hasattr(os, 'getuid'):
+            # Windows doesn't have os.getuid
+            username = str(os.getuid())
+        else:
+            username = 'unknown'
+    hostname = socket.gethostname()
+    file_prefix = '%s.%s.%s.log' % (program_name, hostname, username)
+
+    return actual_log_dir, file_prefix, program_name
+
+
+class Log():
+
+    log_name = None
+
+    def __init__(self, logger=None):
+        self.logger = logging.getLogger(logger)
+        self.logger.setLevel(logging.DEBUG)
+
+        file_dir = os.getcwd() + '/log'
+        if not os.path.exists(file_dir):
+            os.mkdir(file_dir)
+        self.log_dir = file_dir
+
+        actual_log_dir, file_prefix, symlink_prefix = find_log_dir_and_names(
+            program_name=None, log_dir=self.log_dir)
+
+        basename = '%s.DEBUG.%d' % (file_prefix, os.getpid())
+        filename = os.path.join(actual_log_dir, basename)
+        if Log.log_name is None:
+            Log.log_name = filename
+
+        # Create a symlink to the log file with a canonical name.
+        symlink = os.path.join(actual_log_dir, symlink_prefix + '.DEBUG')
+        try:
+            if os.path.islink(symlink):
+                os.unlink(symlink)
+            os.symlink(os.path.basename(Log.log_name), symlink)
+        except EnvironmentError:
+            # If it fails, we're sad but it's no error.
+            # Commonly, this fails because the symlink was created by
+            # another user, so we cannot modify it.
+            pass
+
+        if not self.logger.hasHandlers():
+            formatter = logging.Formatter(fmt=FORMAT_STR, datefmt=DATE_FMT_STR)
+            fh = logging.FileHandler(Log.log_name)
+            fh.setLevel(logging.DEBUG)
+            fh.setFormatter(formatter)
+            self.logger.addHandler(fh)
+
+            ch = logging.StreamHandler()
+            ch.setLevel(logging.INFO)
+            ch.setFormatter(formatter)
+            self.logger.addHandler(ch)
+
+        # stop propagation, since propagating may print the same log
+        # record multiple times
+        self.logger.propagate = False
+
+    def getlog(self):
+        return self.logger
+
+
+class Autolog:
+    def __init__(self,
+                 batch_size,
+                 model_name="DeepSpeech",
+                 model_precision="fp32"):
+        import auto_log
+        pid = os.getpid()
+        # use .get() so a missing CUDA_VISIBLE_DEVICES falls back to CPU
+        # instead of raising KeyError
+        if os.environ.get('CUDA_VISIBLE_DEVICES', '').strip() != '':
+            gpu_id = int(os.environ['CUDA_VISIBLE_DEVICES'].split(',')[0])
+            infer_config = inference.Config()
+            infer_config.enable_use_gpu(100, gpu_id)
+        else:
+            gpu_id = None
+            infer_config = inference.Config()
+        autolog = auto_log.AutoLogger(
+            model_name=model_name,
+            model_precision=model_precision,
+            batch_size=batch_size,
+            data_shape="dynamic",
+            save_path="./output/auto_log.log",
+            inference_config=infer_config,
+            pids=pid,
+            process_name=None,
+            gpu_ids=gpu_id,
+            time_keys=[
+                'preprocess_time', 'inference_time', 'postprocess_time'
+            ],
+            warmup=0)
+        self.autolog = autolog
+
+    def getlog(self):
+        return self.autolog
diff --git a/examples/transv1.8to2.x/deepspeech/utils/mp_tools.py b/examples/transv1.8to2.x/deepspeech/utils/mp_tools.py
new file mode 100644
index 00000000..d3e25aab
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/mp_tools.py
@@ -0,0 +1,30 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from functools import wraps
+
+from paddle import distributed as dist
+
+__all__ = ["rank_zero_only"]
+
+
+def rank_zero_only(func):
+    @wraps(func)
+    def wrapper(*args, **kwargs):
+        rank = dist.get_rank()
+        if rank != 0:
+            return
+        result = func(*args, **kwargs)
+        return result
+
+    return wrapper
diff --git a/examples/transv1.8to2.x/deepspeech/utils/socket_server.py b/examples/transv1.8to2.x/deepspeech/utils/socket_server.py
new file mode 100644
index 00000000..45c659f6
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/socket_server.py
@@ -0,0 +1,112 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
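+
+# A minimal client-side sketch for the server below (host and port are
+# illustrative assumptions; `pcm_bytes` is raw 16 kHz, 16-bit mono audio,
+# matching the wav parameters written by AsrRequestHandler._write_to_file):
+#
+#     socket_send('127.0.0.1', 8086, pcm_bytes)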
+import os +import random +import socket +import socketserver +import struct +import time +import wave +from time import gmtime +from time import strftime + +from deepspeech.frontend.utility import read_manifest + +__all__ = ["socket_send", "warm_up_test", "AsrTCPServer", "AsrRequestHandler"] + + +def socket_send(server_ip: str, server_port: str, data: bytes): + # Connect to server and send data + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.connect((server_ip, server_port)) + sent = data + sock.sendall(struct.pack('>i', len(sent)) + sent) + print('Speech[length=%d] Sent.' % len(sent)) + # Receive data from the server and shut down + received = sock.recv(1024) + print("Recognition Results: {}".format(received.decode('utf8'))) + sock.close() + + +def warm_up_test(audio_process_handler, + manifest_path, + num_test_cases, + random_seed=0): + """Warming-up test.""" + manifest = read_manifest(manifest_path) + rng = random.Random(random_seed) + samples = rng.sample(manifest, num_test_cases) + for idx, sample in enumerate(samples): + print("Warm-up Test Case %d: %s" % (idx, sample['feat'])) + start_time = time.time() + transcript = audio_process_handler(sample['feat']) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + + +class AsrTCPServer(socketserver.TCPServer): + """The ASR TCP Server.""" + + def __init__(self, + server_address, + RequestHandlerClass, + speech_save_dir, + audio_process_handler, + bind_and_activate=True): + self.speech_save_dir = speech_save_dir + self.audio_process_handler = audio_process_handler + socketserver.TCPServer.__init__( + self, server_address, RequestHandlerClass, bind_and_activate=True) + + +class AsrRequestHandler(socketserver.BaseRequestHandler): + """The ASR request handler.""" + + def handle(self): + # receive data through TCP socket + chunk = self.request.recv(1024) + target_len = struct.unpack('>i', chunk[:4])[0] + data = chunk[4:] + while len(data) < target_len: + chunk = self.request.recv(1024) + data += chunk + # write to file + filename = self._write_to_file(data) + + print("Received utterance[length=%d] from %s, saved to %s." % + (len(data), self.client_address[0], filename)) + start_time = time.time() + transcript = self.server.audio_process_handler(filename) + finish_time = time.time() + print("Response Time: %f, Transcript: %s" % + (finish_time - start_time, transcript)) + self.request.sendall(transcript.encode('utf-8')) + + def _write_to_file(self, data): + # prepare save dir and filename + if not os.path.exists(self.server.speech_save_dir): + os.mkdir(self.server.speech_save_dir) + timestamp = strftime("%Y%m%d%H%M%S", gmtime()) + out_filename = os.path.join( + self.server.speech_save_dir, + timestamp + "_" + self.client_address[0] + ".wav") + # write to wav file + file = wave.open(out_filename, 'wb') + file.setnchannels(1) + file.setsampwidth(2) + file.setframerate(16000) + file.writeframes(data) + file.close() + return out_filename diff --git a/examples/transv1.8to2.x/deepspeech/utils/tensor_utils.py b/examples/transv1.8to2.x/deepspeech/utils/tensor_utils.py new file mode 100644 index 00000000..9bff6b0f --- /dev/null +++ b/examples/transv1.8to2.x/deepspeech/utils/tensor_utils.py @@ -0,0 +1,180 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Utility functions for Transformer."""
+from typing import List
+from typing import Tuple
+
+import paddle
+
+from deepspeech.utils.log import Log
+
+__all__ = ["pad_sequence", "add_sos_eos", "th_accuracy", "has_tensor"]
+
+logger = Log(__name__).getlog()
+
+
+def has_tensor(val):
+    if isinstance(val, (list, tuple)):
+        for item in val:
+            if has_tensor(item):
+                return True
+    elif isinstance(val, dict):
+        for k, v in val.items():
+            if has_tensor(v):
+                return True
+    else:
+        return paddle.is_tensor(val)
+
+
+def pad_sequence(sequences: List[paddle.Tensor],
+                 batch_first: bool=False,
+                 padding_value: float=0.0) -> paddle.Tensor:
+    r"""Pad a list of variable length Tensors with ``padding_value``
+
+    ``pad_sequence`` stacks a list of Tensors along a new dimension,
+    and pads them to equal length. For example, if the input is a list of
+    sequences with size ``L x *``, the output is of size ``B x T x *`` if
+    ``batch_first`` is True, and ``T x B x *`` otherwise.
+
+    `B` is batch size. It is equal to the number of elements in ``sequences``.
+    `T` is length of the longest sequence.
+    `L` is length of the sequence.
+    `*` is any number of trailing dimensions, including none.
+
+    Example:
+        >>> from paddle.nn.utils.rnn import pad_sequence
+        >>> a = paddle.ones(25, 300)
+        >>> b = paddle.ones(22, 300)
+        >>> c = paddle.ones(15, 300)
+        >>> pad_sequence([a, b, c]).size()
+        paddle.Tensor([25, 3, 300])
+
+    Note:
+        This function returns a Tensor of size ``T x B x *`` or ``B x T x *``
+        where `T` is the length of the longest sequence. This function assumes
+        trailing dimensions and type of all the Tensors in sequences are same.
+
+    Args:
+        sequences (list[Tensor]): list of variable length sequences.
+        batch_first (bool, optional): output will be in ``B x T x *`` if True,
+            or in ``T x B x *`` otherwise
+        padding_value (float, optional): value for padded elements. Default: 0.
+
+    Returns:
+        Tensor of size ``T x B x *`` if :attr:`batch_first` is ``False``.
+        Tensor of size ``B x T x *`` otherwise
+    """
+
+    # assuming trailing dimensions and type of all the Tensors
+    # in sequences are same and fetching those from sequences[0]
+    max_size = sequences[0].size()
+    # (TODO Hui Zhang): slice does not support `end == start`
+    # trailing_dims = max_size[1:]
+    trailing_dims = max_size[1:] if max_size.ndim >= 2 else ()
+    max_len = max([s.size(0) for s in sequences])
+    if batch_first:
+        out_dims = (len(sequences), max_len) + trailing_dims
+    else:
+        out_dims = (max_len, len(sequences)) + trailing_dims
+
+    out_tensor = sequences[0].new_full(out_dims, padding_value)
+    for i, tensor in enumerate(sequences):
+        length = tensor.size(0)
+        # use index notation to prevent duplicate references to the tensor
+        if batch_first:
+            out_tensor[i, :length, ...] = tensor
+        else:
+            out_tensor[:length, i, ...] = tensor
+
+    return out_tensor
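+
+
+# A shape-level sketch of pad_sequence (toy tensors; assumes the
+# paddle.Tensor helpers such as .size() and .new_full() patched in by
+# deepspeech/__init__.py are in effect):
+#
+#     >>> seqs = [paddle.ones([4, 2]), paddle.ones([2, 2])]
+#     >>> pad_sequence(seqs, batch_first=True).shape
+#     [2, 4, 2]    # B=2 sequences, padded to T=4 steps, feature dim 2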
+
+
+def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int,
+                ignore_id: int) -> Tuple[paddle.Tensor, paddle.Tensor]:
+    """Add <sos> and <eos> labels.
+    Args:
+        ys_pad (paddle.Tensor): batch of padded target sequences (B, Lmax)
+        sos (int): index of <sos>
+        eos (int): index of <eos>
+        ignore_id (int): index of padding
+    Returns:
+        ys_in (paddle.Tensor) : (B, Lmax + 1)
+        ys_out (paddle.Tensor) : (B, Lmax + 1)
+    Examples:
+        >>> sos_id = 10
+        >>> eos_id = 11
+        >>> ignore_id = -1
+        >>> ys_pad
+        tensor([[ 1,  2,  3,  4,  5],
+                [ 4,  5,  6, -1, -1],
+                [ 7,  8,  9, -1, -1]], dtype=paddle.int32)
+        >>> ys_in, ys_out = add_sos_eos(ys_pad, sos_id, eos_id, ignore_id)
+        >>> ys_in
+        tensor([[10,  1,  2,  3,  4,  5],
+                [10,  4,  5,  6, 11, 11],
+                [10,  7,  8,  9, 11, 11]])
+        >>> ys_out
+        tensor([[ 1,  2,  3,  4,  5, 11],
+                [ 4,  5,  6, 11, -1, -1],
+                [ 7,  8,  9, 11, -1, -1]])
+    """
+    # TODO(Hui Zhang): use the commented code below once supported
+    #_sos = paddle.to_tensor(
+    #    [sos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
+    #_eos = paddle.to_tensor(
+    #    [eos], dtype=paddle.long, stop_gradient=True, place=ys_pad.place)
+    #ys = [y[y != ignore_id] for y in ys_pad]  # parse padded ys
+    #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys]
+    #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys]
+    #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id)
+    B = ys_pad.size(0)
+    _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos
+    _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos
+    ys_in = paddle.cat([_sos, ys_pad], dim=1)
+    mask_pad = (ys_in == ignore_id)
+    ys_in = ys_in.masked_fill(mask_pad, eos)
+
+    ys_out = paddle.cat([ys_pad, _eos], dim=1)
+    ys_out = ys_out.masked_fill(mask_pad, eos)
+    mask_eos = (ys_out == ignore_id)
+    ys_out = ys_out.masked_fill(mask_eos, eos)
+    ys_out = ys_out.masked_fill(mask_pad, ignore_id)
+    return ys_in, ys_out
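+
+
+# A worked sketch matching the docstring above (toy ids; assumes the
+# paddle.Tensor patches from deepspeech/__init__.py, e.g. .size(),
+# .masked_fill() and paddle.cat, are in effect):
+#
+#     >>> ys = paddle.to_tensor([[1, 2, -1]])
+#     >>> ys_in, ys_out = add_sos_eos(ys, sos=10, eos=11, ignore_id=-1)
+#     >>> ys_in.numpy().tolist(), ys_out.numpy().tolist()
+#     ([[10, 1, 2, 11]], [[1, 2, 11, -1]])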
+
+
+def th_accuracy(pad_outputs: paddle.Tensor,
+                pad_targets: paddle.Tensor,
+                ignore_label: int) -> float:
+    """Calculate accuracy.
+    Args:
+        pad_outputs (Tensor): Prediction tensors (B * Lmax, D).
+        pad_targets (LongTensor): Target label tensors (B, Lmax, D).
+        ignore_label (int): Ignore label id.
+    Returns:
+        float: Accuracy value (0.0 - 1.0).
+    """
+    pad_pred = pad_outputs.view(
+        pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2)
+    mask = pad_targets != ignore_label
+    # TODO(Hui Zhang): sum does not support bool type
+    # numerator = paddle.sum(
+    #     pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
+    numerator = (
+        pad_pred.masked_select(mask) == pad_targets.masked_select(mask))
+    numerator = paddle.sum(numerator.type_as(pad_targets))
+    # TODO(Hui Zhang): sum does not support bool type
+    # denominator = paddle.sum(mask)
+    denominator = paddle.sum(mask.type_as(pad_targets))
+    return float(numerator) / float(denominator)
diff --git a/examples/transv1.8to2.x/deepspeech/utils/text_grid.py b/examples/transv1.8to2.x/deepspeech/utils/text_grid.py
new file mode 100644
index 00000000..3af58c9b
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/text_grid.py
@@ -0,0 +1,127 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import Dict
+from typing import List
+from typing import Text
+
+import textgrid
+
+
+def segment_alignment(alignment: List[int], blank_id=0) -> List[List[int]]:
+    """Segment a ctc alignment id sequence by continuous blanks and repeated
+    labels.
+
+    Args:
+        alignment (List[int]): ctc alignment id sequence.
+            e.g. [0, 0, 0, 1, 1, 1, 2, 0, 0, 3]
+        blank_id (int, optional): blank id. Defaults to 0.
+
+    Returns:
+        List[List[int]]: segmented alignment id sequences, one per token.
+            e.g. [[0, 0, 0, 1, 1, 1], [2], [0, 0, 3]]
+    """
+    # Convert the alignment into a Praat-friendly segmentation (Praat is a
+    # "doing phonetics by computer" tool that helps analyze alignments).
+    align_segs = []
+    # get frame-level duration for each token
+    start = 0
+    end = 0
+    while end < len(alignment):
+        while end < len(alignment) and alignment[end] == blank_id:  # blank
+            end += 1
+        if end == len(alignment):
+            align_segs[-1].extend(alignment[start:])
+            break
+        end += 1
+        while end < len(alignment) and alignment[end - 1] == alignment[
+                end]:  # repeated label
+            end += 1
+        align_segs.append(alignment[start:end])
+        start = end
+    return align_segs
+
+
+def align_to_tierformat(align_segs: List[List[int]],
+                        subsample: int,
+                        token_dict: Dict[int, Text],
+                        blank_id=0) -> List[Text]:
+    """Generate textgrid.Interval format from alignment segmentations.
+
+    Args:
+        align_segs (List[List[int]]): segmented ctc alignment ids.
+        subsample (int): encoder subsampling rate, i.e. input frames
+            (25ms frame_length, 10ms hop_length) per output step.
+        token_dict (Dict[int, Text]): int -> str map.
+
+    Returns:
+        List[Text]: list of textgrid.Interval text, str(start, end, text).
+    """
+    hop_length = 10  # ms
+    second_ms = 1000  # ms
+    frame_per_second = second_ms / hop_length  # 25ms frame_length, 10ms hop_length
+    second_per_frame = 1.0 / frame_per_second
+
+    begin = 0
+    duration = 0
+    tierformat = []
+
+    for idx, tokens in enumerate(align_segs):
+        token_len = len(tokens)
+        token = tokens[-1]
+        # time duration in seconds
+        duration = token_len * subsample * second_per_frame
+        if idx < len(align_segs) - 1:
+            print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}")
+            tierformat.append(
+                f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n")
+        else:
+            for i in tokens:
+                if i != blank_id:
+                    token = i
+                    break
+            print(f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}")
+            tierformat.append(
+                f"{begin:.2f} {begin + duration:.2f} {token_dict[token]}\n")
+        begin = begin + duration
+
+    return tierformat
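+
+
+# A small end-to-end sketch of the two helpers above (toy alignment ids;
+# the token_dict entries are made up, and each interval is also printed as
+# a side effect):
+#
+#     >>> segs = segment_alignment([0, 0, 1, 1, 0, 2], blank_id=0)
+#     >>> segs
+#     [[0, 0, 1, 1], [0, 2]]
+#     >>> align_to_tierformat(segs, subsample=4, token_dict={1: 'ni', 2: 'hao'})
+#     ['0.00 0.16 ni\n', '0.16 0.24 hao\n']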
+
+
+def generate_textgrid(maxtime: float,
+                      intervals: List[Text],
+                      output: Text,
+                      name: Text='ali') -> None:
+    """Create an alignment textgrid file.
+
+    Args:
+        maxtime (float): audio duration.
+        intervals (List[Text]): ctc output alignment. e.g. "start-time end-time word" per item.
+        output (Text): textgrid filepath.
+        name (Text, optional): tier or layer name. Defaults to 'ali'.
+    """
+    # Download Praat: https://www.fon.hum.uva.nl/praat/
+    avg_interval = maxtime / (len(intervals) + 1)
+    print(f"average second/token: {avg_interval}")
+    margin = 0.0001
+
+    tg = textgrid.TextGrid(maxTime=maxtime)
+    tier = textgrid.IntervalTier(name=name, maxTime=maxtime)
+
+    for dur in intervals:
+        s, e, text = dur.split()
+        tier.add(minTime=float(s) + margin, maxTime=float(e), mark=text)
+
+    tg.append(tier)
+
+    tg.write(output)
+    print("Successfully generated textgrid {}.".format(output))
diff --git a/examples/transv1.8to2.x/deepspeech/utils/utility.py b/examples/transv1.8to2.x/deepspeech/utils/utility.py
new file mode 100644
index 00000000..e18fc1f7
--- /dev/null
+++ b/examples/transv1.8to2.x/deepspeech/utils/utility.py
@@ -0,0 +1,110 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains common utility functions."""
+import distutils.util
+import math
+import os
+import random
+from typing import List
+
+import numpy as np
+import paddle
+
+__all__ = ["seed_all", 'print_arguments', 'add_arguments', "log_add"]
+
+
+def seed_all(seed: int=210329):
+    np.random.seed(seed)
+    random.seed(seed)
+    paddle.seed(seed)
+
+
+def print_arguments(args, info=None):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    filename = ""
+    if info:
+        filename = info["__file__"]
+    filename = os.path.basename(filename)
+    print(f"----------- {filename} Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("-----------------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_argument("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
+
+
+def log_add(args: List[float]) -> float:
+    """Stable log add.
+
+    Args:
+        args (List[float]): log scores
+
+    Returns:
+        float: log of the summed exponentials
+    """
+    if all(a == -float('inf') for a in args):
+        return -float('inf')
+    a_max = max(args)
+    lsp = math.log(sum(math.exp(a - a_max) for a in args))
+    return a_max + lsp
+
+
+def get_subsample(config):
+    """Subsample rate from config.
+
+    Args:
+        config (yacs.config.CfgNode): yaml config
+
+    Returns:
+        int: subsample rate.
+ """ + input_layer = config["model"]["encoder_conf"]["input_layer"] + assert input_layer in ["conv2d", "conv2d6", "conv2d8"] + if input_layer == "conv2d": + return 4 + elif input_layer == "conv2d6": + return 6 + elif input_layer == "conv2d8": + return 8 diff --git a/examples/transv1.8to2.x/example/aishell/run_data.sh b/examples/transv1.8to2.x/example/aishell/run_data.sh new file mode 100644 index 00000000..877381f0 --- /dev/null +++ b/examples/transv1.8to2.x/example/aishell/run_data.sh @@ -0,0 +1,42 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download data, generate manifests +PYTHONPATH=.:$PYTHONPATH python3 data/aishell/aishell.py \ +--manifest_prefix='data/aishell/manifest' \ +--target_dir='../dataset/aishell' + +if [ $? -ne 0 ]; then + echo "Prepare Aishell failed. Terminated." + exit 1 +fi + + +# build vocabulary +python3 tools/build_vocab.py \ +--count_threshold=0 \ +--vocab_path='data/aishell/vocab.txt' \ +--manifest_paths 'data/aishell/manifest.train' 'data/aishell/manifest.dev' + +if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 +fi + + +# compute mean and stddev for normalizer +python3 tools/compute_mean_std.py \ +--manifest_path='data/aishell/manifest.train' \ +--num_samples=2000 \ +--specgram_type='linear' \ +--output_path='data/aishell/mean_std.npz' + +if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." + exit 1 +fi + + +echo "Aishell data preparation done." +exit 0 diff --git a/examples/transv1.8to2.x/example/aishell/run_infer_golden.sh b/examples/transv1.8to2.x/example/aishell/run_infer_golden.sh new file mode 100644 index 00000000..f4255975 --- /dev/null +++ b/examples/transv1.8to2.x/example/aishell/run_infer_golden.sh @@ -0,0 +1,55 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download language model +cd models/lm > /dev/null +bash download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# download well-trained model +cd models/aishell > /dev/null +bash download_model.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# infer +CUDA_VISIBLE_DEVICES=0 \ +python3 -u infer2x.py \ +--num_samples=10 \ +--beam_size=300 \ +--feat_dim=161 \ +--num_proc_bsearch=8 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=1024 \ +--alpha=2.6 \ +--beta=5.0 \ +--cutoff_prob=0.99 \ +--cutoff_top_n=40 \ +--use_gru=True \ +--use_gpu=False \ +--share_rnn_weights=False \ +--infer_manifest='data/aishell/manifest.test' \ +--mean_std_path='models/aishell/mean_std.npz' \ +--vocab_path='models/aishell/vocab.txt' \ +--model_path='models/aishell/aishell_v1.8.pdparams' \ +--lang_model_path='models/lm/zh_giga.no_cna_cmn.prune01244.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='cer' \ +--specgram_type='linear' + +if [ $? -ne 0 ]; then + echo "Failed in inference!" + exit 1 +fi + + +exit 0 diff --git a/examples/transv1.8to2.x/example/aishell/run_test_golden.sh b/examples/transv1.8to2.x/example/aishell/run_test_golden.sh new file mode 100644 index 00000000..75eaf16e --- /dev/null +++ b/examples/transv1.8to2.x/example/aishell/run_test_golden.sh @@ -0,0 +1,54 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download language model +cd models/lm > /dev/null +bash download_lm_ch.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + +# download well-trained model +cd models/aishell > /dev/null +bash download_model.sh +if [ $? 
-ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# evaluate model +CUDA_VISIBLE_DEVICES=1 \ +python3 -u test2x.py \ +--batch_size=64 \ +--beam_size=300 \ +--feat_dim=161 \ +--num_proc_bsearch=8 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=1024 \ +--alpha=2.6 \ +--beta=5.0 \ +--cutoff_prob=0.99 \ +--cutoff_top_n=40 \ +--use_gru=True \ +--use_gpu=True \ +--share_rnn_weights=False \ +--test_manifest='data/aishell/manifest.test' \ +--mean_std_path='models/aishell/mean_std.npz' \ +--vocab_path='models/aishell/vocab.txt' \ +--model_path='models/aishell/aishell_v1.8.pdparams' \ +--lang_model_path='models/lm/zh_giga.no_cna_cmn.prune01244.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='cer' \ +--specgram_type='linear' + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/transv1.8to2.x/example/baidu_en8k/run_data.sh b/examples/transv1.8to2.x/example/baidu_en8k/run_data.sh new file mode 100644 index 00000000..487d2d22 --- /dev/null +++ b/examples/transv1.8to2.x/example/baidu_en8k/run_data.sh @@ -0,0 +1,45 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download data, generate manifests +PYTHONPATH=.:$PYTHONPATH python3 data/librispeech/librispeech.py \ +--manifest_prefix='data/librispeech/manifest' \ +--target_dir='../dataset/librispeech' \ +--full_download='True' + +if [ $? -ne 0 ]; then + echo "Prepare LibriSpeech failed. Terminated." + exit 1 +fi + +cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train + + +# build vocabulary +python3 tools/build_vocab.py \ +--count_threshold=0 \ +--vocab_path='data/librispeech/vocab.txt' \ +--manifest_paths='data/librispeech/manifest.train' + +if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 +fi + + +# compute mean and stddev for normalizer +python3 tools/compute_mean_std.py \ +--manifest_path='data/librispeech/manifest.train' \ +--num_samples=2000 \ +--specgram_type='linear' \ +--output_path='data/librispeech/mean_std.npz' + +if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." + exit 1 +fi + + +echo "LibriSpeech Data preparation done." +exit 0 diff --git a/examples/transv1.8to2.x/example/baidu_en8k/run_infer_golden.sh b/examples/transv1.8to2.x/example/baidu_en8k/run_infer_golden.sh new file mode 100644 index 00000000..a94c082f --- /dev/null +++ b/examples/transv1.8to2.x/example/baidu_en8k/run_infer_golden.sh @@ -0,0 +1,55 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download language model +cd models/lm > /dev/null +bash download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# download well-trained model +cd models/baidu_en8k > /dev/null +bash download_model.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# infer +CUDA_VISIBLE_DEVICES=0 \ +python3 -u infer2x.py \ +--num_samples=10 \ +--beam_size=500 \ +--feat_dim=161 \ +--num_proc_bsearch=5 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=1024 \ +--alpha=1.4 \ +--beta=0.35 \ +--cutoff_prob=1.0 \ +--cutoff_top_n=40 \ +--use_gru=True \ +--use_gpu=False \ +--share_rnn_weights=False \ +--infer_manifest='data/librispeech/manifest.test-clean' \ +--mean_std_path='models/baidu_en8k/mean_std.npz' \ +--vocab_path='models/baidu_en8k/vocab.txt' \ +--model_path='models/baidu_en8k/baidu_en8k_v1.8.pdparams' \ +--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' + +if [ $? 
-ne 0 ]; then + echo "Failed in inference!" + exit 1 +fi + + +exit 0 diff --git a/examples/transv1.8to2.x/example/baidu_en8k/run_test_golden.sh b/examples/transv1.8to2.x/example/baidu_en8k/run_test_golden.sh new file mode 100644 index 00000000..9746015e --- /dev/null +++ b/examples/transv1.8to2.x/example/baidu_en8k/run_test_golden.sh @@ -0,0 +1,55 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download language model +cd models/lm > /dev/null +bash download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + + +# download well-trained model +cd models/baidu_en8k > /dev/null +bash download_model.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# evaluate model +CUDA_VISIBLE_DEVICES=0 \ +python3 -u test2x.py \ +--batch_size=32 \ +--beam_size=500 \ +--feat_dim=161 \ +--num_proc_bsearch=8 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=1024 \ +--alpha=1.4 \ +--beta=0.35 \ +--cutoff_prob=1.0 \ +--cutoff_top_n=40 \ +--use_gru=True \ +--use_gpu=False \ +--share_rnn_weights=False \ +--test_manifest='data/librispeech/manifest.test-clean' \ +--mean_std_path='models/baidu_en8k/mean_std.npz' \ +--vocab_path='models/baidu_en8k/vocab.txt' \ +--model_path='models/baidu_en8k/baidu_en8k_v1.8.pdparams' \ +--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + +exit 0 diff --git a/examples/transv1.8to2.x/example/librispeech/run_data.sh b/examples/transv1.8to2.x/example/librispeech/run_data.sh new file mode 100644 index 00000000..487d2d22 --- /dev/null +++ b/examples/transv1.8to2.x/example/librispeech/run_data.sh @@ -0,0 +1,45 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download data, generate manifests +PYTHONPATH=.:$PYTHONPATH python3 data/librispeech/librispeech.py \ +--manifest_prefix='data/librispeech/manifest' \ +--target_dir='../dataset/librispeech' \ +--full_download='True' + +if [ $? -ne 0 ]; then + echo "Prepare LibriSpeech failed. Terminated." + exit 1 +fi + +cat data/librispeech/manifest.train-* | shuf > data/librispeech/manifest.train + + +# build vocabulary +python3 tools/build_vocab.py \ +--count_threshold=0 \ +--vocab_path='data/librispeech/vocab.txt' \ +--manifest_paths='data/librispeech/manifest.train' + +if [ $? -ne 0 ]; then + echo "Build vocabulary failed. Terminated." + exit 1 +fi + + +# compute mean and stddev for normalizer +python3 tools/compute_mean_std.py \ +--manifest_path='data/librispeech/manifest.train' \ +--num_samples=2000 \ +--specgram_type='linear' \ +--output_path='data/librispeech/mean_std.npz' + +if [ $? -ne 0 ]; then + echo "Compute mean and stddev failed. Terminated." + exit 1 +fi + + +echo "LibriSpeech Data preparation done." +exit 0 diff --git a/examples/transv1.8to2.x/example/librispeech/run_infer_golden.sh b/examples/transv1.8to2.x/example/librispeech/run_infer_golden.sh new file mode 100644 index 00000000..e860a8b2 --- /dev/null +++ b/examples/transv1.8to2.x/example/librispeech/run_infer_golden.sh @@ -0,0 +1,55 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download language model +cd models/lm > /dev/null +bash download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# download well-trained model +cd models/librispeech > /dev/null +bash download_model.sh +if [ $? 
-ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# infer +CUDA_VISIBLE_DEVICES=0 \ +python3 -u infer2x.py \ +--num_samples=10 \ +--beam_size=500 \ +--feat_dim=161 \ +--num_proc_bsearch=8 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=2.5 \ +--beta=0.3 \ +--cutoff_prob=1.0 \ +--cutoff_top_n=40 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--infer_manifest='data/librispeech/manifest.test-clean' \ +--mean_std_path='models/librispeech/mean_std.npz' \ +--vocab_path='models/librispeech/vocab.txt' \ +--model_path='models/librispeech/librispeech_v1.8.pdparams' \ +--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' + +if [ $? -ne 0 ]; then + echo "Failed in inference!" + exit 1 +fi + + +exit 0 diff --git a/examples/transv1.8to2.x/example/librispeech/run_test_golden.sh b/examples/transv1.8to2.x/example/librispeech/run_test_golden.sh new file mode 100644 index 00000000..d0335840 --- /dev/null +++ b/examples/transv1.8to2.x/example/librispeech/run_test_golden.sh @@ -0,0 +1,55 @@ +#! /usr/bin/env bash + +cd ../.. > /dev/null + +# download language model +cd models/lm > /dev/null +bash download_lm_en.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# download well-trained model +cd models/librispeech > /dev/null +bash download_model.sh +if [ $? -ne 0 ]; then + exit 1 +fi +cd - > /dev/null + + +# evaluate model +CUDA_VISIBLE_DEVICES=0 \ +python3 -u test2x.py \ +--batch_size=32 \ +--beam_size=500 \ +--feat_dim=161 \ +--num_proc_bsearch=8 \ +--num_conv_layers=2 \ +--num_rnn_layers=3 \ +--rnn_layer_size=2048 \ +--alpha=2.5 \ +--beta=0.3 \ +--cutoff_prob=1.0 \ +--cutoff_top_n=40 \ +--use_gru=False \ +--use_gpu=True \ +--share_rnn_weights=True \ +--test_manifest='data/librispeech/manifest.test-clean' \ +--mean_std_path='models/librispeech/mean_std.npz' \ +--vocab_path='models/librispeech/vocab.txt' \ +--model_path='models/librispeech/librispeech_v1.8.pdparams' \ +--lang_model_path='models/lm/common_crawl_00.prune01111.trie.klm' \ +--decoding_method='ctc_beam_search' \ +--error_rate_type='wer' \ +--specgram_type='linear' + +if [ $? -ne 0 ]; then + echo "Failed in evaluation!" + exit 1 +fi + + +exit 0 diff --git a/examples/transv1.8to2.x/infer2x.py b/examples/transv1.8to2.x/infer2x.py new file mode 100644 index 00000000..f461b05b --- /dev/null +++ b/examples/transv1.8to2.x/infer2x.py @@ -0,0 +1,163 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
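+# Typical invocation (a sketch; see example/*/run_infer_golden.sh for the
+# exact flag values used for each released model):
+#
+#     python3 -u infer2x.py --num_samples=10 --use_gpu=True \
+#         --model_path='models/librispeech/librispeech_v1.8.pdparams' \
+#         --infer_manifest='data/librispeech/manifest.test-clean' \
+#         --mean_std_path='models/librispeech/mean_std.npz' \
+#         --vocab_path='models/librispeech/vocab.txt'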
+"""Inferer for DeepSpeech2 model.""" +import argparse +import functools + +import numpy as np +import paddle +import paddle.fluid as fluid +from data_utils.data import DataGenerator +from model_utils.model_check import check_cuda +from model_utils.model_check import check_version + +from deepspeech.models.ds2 import DeepSpeech2Model as DS2 +from utils.error_rate import cer +from utils.error_rate import wer +from utils.utility import add_arguments +from utils.utility import print_arguments +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('num_samples', int, 10, "# of samples to infer.") +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('feat_dim', int, 161, "Feature dim.") +add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.") +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 2.5, "Coef of LM for beam search.") +add_arg('beta', float, 0.3, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.") +add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('share_rnn_weights', bool, True, "Share input-hidden weights across bi-directional RNNs. Not for GRU.") +add_arg('infer_manifest', str, + 'data/librispeech/manifest.dev-clean', + "Filepath of manifest to infer.") +add_arg('mean_std_path', str, + 'data/librispeech/mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'data/librispeech/vocab.txt', + "Filepath of vocabulary.") +add_arg('lang_model_path', str, + 'models/lm/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('model_path', str, + './checkpoints/libri/step_final', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('decoding_method', str, + 'ctc_beam_search', + "Decoding method. Options: ctc_beam_search, ctc_greedy", + choices=['ctc_beam_search', 'ctc_greedy']) +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. 
Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# yapf: enable
+args = parser.parse_args()
+
+
+def infer():
+    """Inference for DeepSpeech2."""
+
+    # check whether use_gpu=True was set with a cpu-only paddlepaddle build
+    check_cuda(args.use_gpu)
+    # check whether the paddlepaddle version is satisfied
+    check_version()
+
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+
+    data_generator = DataGenerator(
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
+        augmentation_config='{}',
+        specgram_type=args.specgram_type,
+        keep_transcription_text=True,
+        place=place,
+        is_training=False)
+    batch_reader = data_generator.batch_reader_creator(
+        manifest_path=args.infer_manifest,
+        batch_size=args.num_samples,
+        sortagrad=False,
+        shuffle_method=None)
+
+    # decoders only accept strings encoded in utf-8
+    vocab_list = [chars for chars in data_generator.vocab_list]
+    for i, char in enumerate(vocab_list):
+        if vocab_list[i] == '':
+            vocab_list[i] = " "
+
+    model = DS2(
+        feat_size=args.feat_dim,
+        dict_size=len(vocab_list),
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size,
+        use_gru=args.use_gru,
+        share_rnn_weights=args.share_rnn_weights,
+        blank_id=len(vocab_list) - 1
+    )
+    params_path = args.model_path
+    model_dict = paddle.load(params_path)
+    model.set_state_dict(model_dict)
+    model.eval()
+    error_rate_func = cer if args.error_rate_type == 'cer' else wer
+    print("start inference ...")
+    for infer_data in batch_reader():
+        audio, target_transcripts, audio_len, mask = infer_data
+        audio = np.transpose(audio, (0, 2, 1))
+        audio_len = audio_len.reshape(-1)
+        audio = paddle.to_tensor(audio)
+        audio_len = paddle.to_tensor(audio_len)
+
+        result_transcripts = model.decode(
+            audio=audio,
+            audio_len=audio_len,
+            lang_model_path=args.lang_model_path,
+            decoding_method=args.decoding_method,
+            beam_alpha=args.alpha,
+            beam_beta=args.beta,
+            beam_size=args.beam_size,
+            cutoff_prob=args.cutoff_prob,
+            cutoff_top_n=args.cutoff_top_n,
+            vocab_list=vocab_list,
+            num_processes=args.num_proc_bsearch
+        )
+        for target, result in zip(target_transcripts, result_transcripts):
+            print("\nTarget Transcription: %s\nOutput Transcription: %s" %
+                  (target, result))
+            print("Current error rate [%s] = %f" %
+                  (args.error_rate_type, error_rate_func(target, result)))
+
+    print("finish inference")
+
+
+def main():
+    print_arguments(args)
+    infer()
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/transv1.8to2.x/model_utils/__init__.py b/examples/transv1.8to2.x/model_utils/__init__.py
new file mode 100644
index 00000000..185a92b8
--- /dev/null
+++ b/examples/transv1.8to2.x/model_utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/examples/transv1.8to2.x/model_utils/model_check.py b/examples/transv1.8to2.x/model_utils/model_check.py new file mode 100644 index 00000000..f64299ee --- /dev/null +++ b/examples/transv1.8to2.x/model_utils/model_check.py @@ -0,0 +1,49 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys + +import paddle.fluid as fluid + + +def check_cuda( + use_cuda, + err="\nYou can not set use_cuda = True in the model because you are using paddlepaddle-cpu.\n \ + Please: 1. Install paddlepaddle-gpu to run your models on GPU or 2. Set use_cuda = False to run models on CPU.\n" +): + """ + Log error and exit when set use_gpu=true in paddlepaddle + cpu version. + """ + try: + if use_cuda is True and fluid.is_compiled_with_cuda() is False: + print(err) + sys.exit(1) + except Exception as e: + pass + + +def check_version(): + """ + Log error and exit when the installed version of paddlepaddle is + not satisfied. + """ + err = "PaddlePaddle version 1.6 or higher is required, " \ + "or a suitable develop version is satisfied as well. \n" \ + "Please make sure the version is good with your code." \ + + try: + fluid.require_version('1.6.0') + except Exception as e: + print(err) + sys.exit(1) diff --git a/examples/transv1.8to2.x/models/aishell/download_model.sh b/examples/transv1.8to2.x/models/aishell/download_model.sh new file mode 100644 index 00000000..939382a1 --- /dev/null +++ b/examples/transv1.8to2.x/models/aishell/download_model.sh @@ -0,0 +1,19 @@ +#! /usr/bin/env bash + +. ../../utils/utility.sh + +URL='https://deepspeech.bj.bcebos.com/mandarin_models/aishell_model_v1.8_to_v2.x.tar.gz' +MD5=4ade113c69ea291b8ce5ec6a03296659 +TARGET=./aishell_model_v1.8_to_v2.x.tar.gz + + +echo "Download Aishell model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download Aishell model!" + exit 1 +fi +tar -zxvf $TARGET + + +exit 0 diff --git a/examples/transv1.8to2.x/models/baidu_en8k/download_model.sh b/examples/transv1.8to2.x/models/baidu_en8k/download_model.sh new file mode 100644 index 00000000..e91bd39c --- /dev/null +++ b/examples/transv1.8to2.x/models/baidu_en8k/download_model.sh @@ -0,0 +1,19 @@ +#! /usr/bin/env bash + +. ../../utils/utility.sh + +URL='https://deepspeech.bj.bcebos.com/eng_models/baidu_en8k_v1.8_to_v2.x.tar.gz' +MD5=fdabeb6c96963ac85d9188f0275c6a1b +TARGET=./baidu_en8k_v1.8_to_v2.x.tar.gz + + +echo "Download BaiduEn8k model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download BaiduEn8k model!" + exit 1 +fi +tar -zxvf $TARGET + + +exit 0 diff --git a/examples/transv1.8to2.x/models/librispeech/download_model.sh b/examples/transv1.8to2.x/models/librispeech/download_model.sh new file mode 100644 index 00000000..e0940a7f --- /dev/null +++ b/examples/transv1.8to2.x/models/librispeech/download_model.sh @@ -0,0 +1,19 @@ +#! /usr/bin/env bash + +. 
../../utils/utility.sh + +URL='https://deepspeech.bj.bcebos.com/eng_models/librispeech_v1.8_to_v2.x.tar.gz' +MD5=7b0f582fe2f5a840b840e7ee52246bc5 +TARGET=./librispeech_v1.8_to_v2.x.tar.gz + + +echo "Download LibriSpeech model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download LibriSpeech model!" + exit 1 +fi +tar -zxvf $TARGET + + +exit 0 diff --git a/examples/transv1.8to2.x/models/lm/download_lm_ch.sh b/examples/transv1.8to2.x/models/lm/download_lm_ch.sh new file mode 100644 index 00000000..0e491526 --- /dev/null +++ b/examples/transv1.8to2.x/models/lm/download_lm_ch.sh @@ -0,0 +1,18 @@ +#! /usr/bin/env bash + +. ../../utils/utility.sh + +URL='https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm' +MD5="29e02312deb2e59b3c8686c7966d4fe3" +TARGET=./zh_giga.no_cna_cmn.prune01244.klm + + +echo "Download language model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download the language model!" + exit 1 +fi + + +exit 0 diff --git a/examples/transv1.8to2.x/models/lm/download_lm_en.sh b/examples/transv1.8to2.x/models/lm/download_lm_en.sh new file mode 100644 index 00000000..cc8d3203 --- /dev/null +++ b/examples/transv1.8to2.x/models/lm/download_lm_en.sh @@ -0,0 +1,18 @@ +#! /usr/bin/env bash + +. ../../utils/utility.sh + +URL=https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm +MD5="099a601759d467cd0a8523ff939819c5" +TARGET=./common_crawl_00.prune01111.trie.klm + + +echo "Download language model ..." +download $URL $MD5 $TARGET +if [ $? -ne 0 ]; then + echo "Fail to download the language model!" + exit 1 +fi + + +exit 0 diff --git a/examples/transv1.8to2.x/test2x.py b/examples/transv1.8to2.x/test2x.py new file mode 100644 index 00000000..85a71b54 --- /dev/null +++ b/examples/transv1.8to2.x/test2x.py @@ -0,0 +1,169 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
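+# Typical invocation (a sketch; see example/*/run_test_golden.sh for the
+# exact flag values used for each released model):
+#
+#     python3 -u test2x.py --batch_size=32 --use_gpu=True \
+#         --model_path='models/librispeech/librispeech_v1.8.pdparams' \
+#         --test_manifest='data/librispeech/manifest.test-clean' \
+#         --mean_std_path='models/librispeech/mean_std.npz' \
+#         --vocab_path='models/librispeech/vocab.txt'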
+"""Evaluation for DeepSpeech2 model.""" +import argparse +import functools + +import numpy as np +import paddle +import paddle.fluid as fluid +from data_utils.data import DataGenerator +from model_utils.model_check import check_cuda +from model_utils.model_check import check_version + +from deepspeech.models.ds2 import DeepSpeech2Model as DS2 +from utils.error_rate import char_errors +from utils.error_rate import word_errors +from utils.utility import add_arguments +from utils.utility import print_arguments +parser = argparse.ArgumentParser(description=__doc__) +add_arg = functools.partial(add_arguments, argparser=parser) +# yapf: disable +add_arg('batch_size', int, 128, "Minibatch size.") +add_arg('beam_size', int, 500, "Beam search width.") +add_arg('feat_dim', int, 161, "Feature dim.") +add_arg('num_proc_bsearch', int, 8, "# of CPUs for beam search.") +add_arg('num_conv_layers', int, 2, "# of convolution layers.") +add_arg('num_rnn_layers', int, 3, "# of recurrent layers.") +add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.") +add_arg('alpha', float, 2.5, "Coef of LM for beam search.") +add_arg('beta', float, 0.3, "Coef of WC for beam search.") +add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.") +add_arg('cutoff_top_n', int, 40, "Cutoff number for pruning.") +add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.") +add_arg('use_gpu', bool, True, "Use GPU or not.") +add_arg('share_rnn_weights', bool, True, "Share input-hidden weights across " + "bi-directional RNNs. Not for GRU.") +add_arg('test_manifest', str, + 'data/librispeech/manifest.test-clean', + "Filepath of manifest to evaluate.") +add_arg('mean_std_path', str, + 'data/librispeech/mean_std.npz', + "Filepath of normalizer's mean & std.") +add_arg('vocab_path', str, + 'data/librispeech/vocab.txt', + "Filepath of vocabulary.") +add_arg('model_path', str, + './checkpoints/libri/step_final', + "If None, the training starts from scratch, " + "otherwise, it resumes from the pre-trained model.") +add_arg('lang_model_path', str, + 'models/lm/common_crawl_00.prune01111.trie.klm', + "Filepath for language model.") +add_arg('decoding_method', str, + 'ctc_beam_search', + "Decoding method. Options: ctc_beam_search, ctc_greedy", + choices=['ctc_beam_search', 'ctc_greedy']) +add_arg('error_rate_type', str, + 'wer', + "Error rate type for evaluation.", + choices=['wer', 'cer']) +add_arg('specgram_type', str, + 'linear', + "Audio feature type. 
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+# yapf: enable
+args = parser.parse_args()
+
+def evaluate():
+    """Evaluate the DeepSpeech2 model on the whole test set."""
+
+    # check that use_gpu=True is not set with a CPU-only PaddlePaddle build
+    check_cuda(args.use_gpu)
+    # check that the installed PaddlePaddle version satisfies the requirement
+    check_version()
+
+    if args.use_gpu:
+        place = fluid.CUDAPlace(0)
+    else:
+        place = fluid.CPUPlace()
+
+    data_generator = DataGenerator(
+        vocab_filepath=args.vocab_path,
+        mean_std_filepath=args.mean_std_path,
+        augmentation_config='{}',
+        specgram_type=args.specgram_type,
+        keep_transcription_text=True,
+        place=place,
+        is_training=False)
+    batch_reader = data_generator.batch_reader_creator(
+        manifest_path=args.test_manifest,
+        batch_size=args.batch_size,
+        sortagrad=False,
+        shuffle_method=None)
+
+
+    # decoders only accept strings encoded in utf-8;
+    # map the special empty token back to a real space character
+    vocab_list = [chars for chars in data_generator.vocab_list]
+    for i, char in enumerate(vocab_list):
+        if vocab_list[i] == '':
+            vocab_list[i] = " "
+
+    model = DS2(
+        feat_size=args.feat_dim,
+        dict_size=len(vocab_list),
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size,
+        use_gru=args.use_gru,
+        share_rnn_weights=args.share_rnn_weights,
+        blank_id=len(vocab_list) - 1
+    )
+
+    # load the transformed v1.8 parameters into the 2.x model
+    params_path = args.model_path
+    model_dict = paddle.load(params_path)
+    model.set_state_dict(model_dict)
+    model.eval()
+    errors_func = char_errors if args.error_rate_type == 'cer' else word_errors
+    errors_sum, len_refs, num_ins = 0.0, 0, 0
+
+    print("start evaluation ...")
+    for infer_data in batch_reader():
+        audio, target_transcripts, audio_len, mask = infer_data
+        audio = np.transpose(audio, (0, 2, 1))
+        audio_len = audio_len.reshape(-1)
+        audio = paddle.to_tensor(audio)
+        audio_len = paddle.to_tensor(audio_len)
+        result_transcripts = model.decode(
+            audio=audio,
+            audio_len=audio_len,
+            lang_model_path=args.lang_model_path,
+            decoding_method=args.decoding_method,
+            beam_alpha=args.alpha,
+            beam_beta=args.beta,
+            beam_size=args.beam_size,
+            cutoff_prob=args.cutoff_prob,
+            cutoff_top_n=args.cutoff_top_n,
+            vocab_list=vocab_list,
+            num_processes=args.num_proc_bsearch
+        )
+        for target, result in zip(target_transcripts, result_transcripts):
+            errors, len_ref = errors_func(target, result)
+            errors_sum += errors
+            len_refs += len_ref
+            num_ins += 1
+        print("Error rate [%s] (%d/?) = %f" %
+              (args.error_rate_type, num_ins, errors_sum / len_refs))
+    print("Final error rate [%s] (%d/%d) = %f" %
+          (args.error_rate_type, num_ins, num_ins, errors_sum / len_refs))
+
+    print("finish evaluation")
+
+def main():
+    print_arguments(args)
+    evaluate()
+
+
+if __name__ == '__main__':
+    main()
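The pivotal step in test2x.py is the checkpoint hand-off: paddle.load reads the parameter file converted from the v1.8 checkpoint, and set_state_dict maps its entries onto the 2.x DeepSpeech2Model by parameter name. A minimal sketch of that step in isolation (the path and layer sizes mirror the script defaults above; the vocabulary size is an assumed placeholder, not a fixed value):

    import paddle
    from deepspeech.models.ds2 import DeepSpeech2Model as DS2

    vocab_size = 29  # assumed: len(vocab_list) built from data/librispeech/vocab.txt
    model = DS2(
        feat_size=161,  # linear-spectrogram feature bins (the feat_dim default)
        dict_size=vocab_size,
        num_conv_layers=2,
        num_rnn_layers=3,
        rnn_size=2048,
        use_gru=False,
        share_rnn_weights=True,
        blank_id=vocab_size - 1)

    # paddle.load returns the saved parameter dict; set_state_dict matches
    # its entries to the model's layers by name, so the 2.x topology must
    # agree with the exported v1.8 network.
    model.set_state_dict(paddle.load('./checkpoints/libri/step_final'))
    model.eval()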
diff --git a/examples/transv1.8to2.x/tools/_init_paths.py b/examples/transv1.8to2.x/tools/_init_paths.py
new file mode 100644
index 00000000..2f7a5dbd
--- /dev/null
+++ b/examples/transv1.8to2.x/tools/_init_paths.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Set up paths for DS2."""
+import os.path
+import sys
+
+
+def add_path(path):
+    if path not in sys.path:
+        sys.path.insert(0, path)
+
+
+this_dir = os.path.dirname(__file__)
+# Add project path to PYTHONPATH
+proj_path = os.path.join(this_dir, '..')
+add_path(proj_path)
+
+
+# Called by importing scripts so the _init_paths import (and its
+# sys.path side effect) is not flagged as unused and removed.
+def do_nothing():
+    pass
diff --git a/examples/transv1.8to2.x/tools/build_vocab.py b/examples/transv1.8to2.x/tools/build_vocab.py
new file mode 100644
index 00000000..4ecf9bcc
--- /dev/null
+++ b/examples/transv1.8to2.x/tools/build_vocab.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Build vocabulary from manifest files.
+
+Each item in the vocabulary file is a character.
+"""
+import argparse
+import codecs
+import functools
+from collections import Counter
+
+import _init_paths
+from data_utils.utility import read_manifest
+
+from utils.utility import add_arguments
+from utils.utility import print_arguments
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('count_threshold', int, 0, "Truncation threshold for character counts.")
+add_arg('vocab_path', str,
+        'data/librispeech/vocab.txt',
+        "Filepath to write the vocabulary.")
+add_arg('manifest_paths', str,
+        None,
+        "Filepaths of manifests for building vocabulary. "
+        "You can provide multiple manifest files.",
+        nargs='+',
+        required=True)
+# yapf: enable
+args = parser.parse_args()
+
+_init_paths.do_nothing()
+
+def count_manifest(counter, manifest_path):
+    manifest_jsons = read_manifest(manifest_path)
+    for line_json in manifest_jsons:
+        for char in line_json['text']:
+            counter.update(char)
+
+
+def main():
+    print_arguments(args)
+
+    counter = Counter()
+    for manifest_path in args.manifest_paths:
+        count_manifest(counter, manifest_path)
+
+    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
+    with codecs.open(args.vocab_path, 'w', 'utf-8') as fout:
+        for char, count in count_sorted:
+            if count < args.count_threshold:
+                break
+            fout.write(char + '\n')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/transv1.8to2.x/tools/compute_mean_std.py b/examples/transv1.8to2.x/tools/compute_mean_std.py
new file mode 100644
index 00000000..cd402817
--- /dev/null
+++ b/examples/transv1.8to2.x/tools/compute_mean_std.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Compute mean and std for feature normalizer, and save to file."""
+import argparse
+import functools
+
+import _init_paths
+from data_utils.augmentor.augmentation import AugmentationPipeline
+from data_utils.featurizer.audio_featurizer import AudioFeaturizer
+from data_utils.normalizer import FeatureNormalizer
+
+from utils.utility import add_arguments
+from utils.utility import print_arguments
+
+parser = argparse.ArgumentParser(description=__doc__)
+add_arg = functools.partial(add_arguments, argparser=parser)
+# yapf: disable
+add_arg('num_samples', int, 2000, "# of samples used to compute the statistics.")
+add_arg('specgram_type', str,
+        'linear',
+        "Audio feature type. Options: linear, mfcc.",
+        choices=['linear', 'mfcc'])
+add_arg('manifest_path', str,
+        'data/librispeech/manifest.train',
+        "Filepath of manifest to compute normalizer's mean and stddev.")
+add_arg('output_path', str,
+        'data/librispeech/mean_std.npz',
+        "Filepath to write the mean and stddev to (.npz).")
+# yapf: enable
+args = parser.parse_args()
+
+_init_paths.do_nothing()

+def main():
+    print_arguments(args)
+
+    augmentation_pipeline = AugmentationPipeline('{}')
+    audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type)
+
+    def augment_and_featurize(audio_segment):
+        augmentation_pipeline.transform_audio(audio_segment)
+        return audio_featurizer.featurize(audio_segment)
+
+    normalizer = FeatureNormalizer(
+        mean_std_filepath=None,
+        manifest_path=args.manifest_path,
+        featurize_func=augment_and_featurize,
+        num_samples=args.num_samples)
+    normalizer.write_to_file(args.output_path)
+
+
+if __name__ == '__main__':
+    main()
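compute_mean_std.py saves the normalizer statistics as a NumPy .npz archive, which test2x.py later consumes through --mean_std_path. A quick way to sanity-check the output (a sketch; the 'mean' and 'std' key names are an assumption, confirm them against data_utils/normalizer.py):

    import numpy as np

    stats = np.load('data/librispeech/mean_std.npz')
    print(stats.files)  # names of the arrays stored in the archive
    # expect one mean and one std entry per feature bin
    # (161 bins for the default linear spectrogram)
    print(stats['mean'].shape, stats['std'].shape)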
+""" +import numpy as np + + +def _levenshtein_distance(ref, hyp): + """Levenshtein distance is a string metric for measuring the difference + between two sequences. Informally, the levenshtein disctance is defined as + the minimum number of single-character edits (substitutions, insertions or + deletions) required to change one word into the other. We can naturally + extend the edits to word level when calculate levenshtein disctance for + two sentences. + """ + m = len(ref) + n = len(hyp) + + # special case + if ref == hyp: + return 0 + if m == 0: + return n + if n == 0: + return m + + if m < n: + ref, hyp = hyp, ref + m, n = n, m + + # use O(min(m, n)) space + distance = np.zeros((2, n + 1), dtype=np.int32) + + # initialize distance matrix + for j in range(n + 1): + distance[0][j] = j + + # calculate levenshtein distance + for i in range(1, m + 1): + prev_row_idx = (i - 1) % 2 + cur_row_idx = i % 2 + distance[cur_row_idx][0] = i + for j in range(1, n + 1): + if ref[i - 1] == hyp[j - 1]: + distance[cur_row_idx][j] = distance[prev_row_idx][j - 1] + else: + s_num = distance[prev_row_idx][j - 1] + 1 + i_num = distance[cur_row_idx][j - 1] + 1 + d_num = distance[prev_row_idx][j] + 1 + distance[cur_row_idx][j] = min(s_num, i_num, d_num) + + return distance[m % 2][n] + + +def word_errors(reference, hypothesis, ignore_case=False, delimiter=' '): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in word-level. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param delimiter: Delimiter of input sentences. + :type delimiter: char + :return: Levenshtein distance and word number of reference sentence. + :rtype: list + """ + if ignore_case is True: + reference = reference.lower() + hypothesis = hypothesis.lower() + + ref_words = list(filter(None, reference.split(delimiter))) + hyp_words = list(filter(None, hypothesis.split(delimiter))) + + edit_distance = _levenshtein_distance(ref_words, hyp_words) + return float(edit_distance), len(ref_words) + + +def char_errors(reference, hypothesis, ignore_case=False, remove_space=False): + """Compute the levenshtein distance between reference sequence and + hypothesis sequence in char-level. + + :param reference: The reference sentence. + :type reference: str + :param hypothesis: The hypothesis sentence. + :type hypothesis: str + :param ignore_case: Whether case-sensitive or not. + :type ignore_case: bool + :param remove_space: Whether remove internal space characters + :type remove_space: bool + :return: Levenshtein distance and length of reference sentence. + :rtype: list + """ + if ignore_case is True: + reference = reference.lower() + hypothesis = hypothesis.lower() + + join_char = ' ' + if remove_space is True: + join_char = '' + + reference = join_char.join(list(filter(None, reference.split(' ')))) + hypothesis = join_char.join(list(filter(None, hypothesis.split(' ')))) + + edit_distance = _levenshtein_distance(reference, hypothesis) + return float(edit_distance), len(reference) + + +def wer(reference, hypothesis, ignore_case=False, delimiter=' '): + """Calculate word error rate (WER). WER compares reference text and + hypothesis text in word-level. WER is defined as: + + .. math:: + WER = (Sw + Dw + Iw) / Nw + + where + + .. 
+    .. code-block:: text
+
+        Sw is the number of words substituted,
+        Dw is the number of words deleted,
+        Iw is the number of words inserted,
+        Nw is the number of words in the reference
+
+    We use the Levenshtein distance to calculate WER. Note that empty items
+    are removed when splitting sentences by the delimiter.
+
+    :param reference: The reference sentence.
+    :type reference: str
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: str
+    :param ignore_case: Whether to ignore case.
+    :type ignore_case: bool
+    :param delimiter: Delimiter of input sentences.
+    :type delimiter: char
+    :return: Word error rate.
+    :rtype: float
+    :raises ValueError: If the word number of the reference is zero.
+    """
+    edit_distance, ref_len = word_errors(reference, hypothesis, ignore_case,
+                                         delimiter)
+
+    if ref_len == 0:
+        raise ValueError("Reference's word number should be greater than 0.")
+
+    wer = float(edit_distance) / ref_len
+    return wer
+
+
+def cer(reference, hypothesis, ignore_case=False, remove_space=False):
+    """Calculate character error rate (CER). CER compares the reference text
+    and the hypothesis text at the character level. CER is defined as:
+
+    .. math::
+        CER = (Sc + Dc + Ic) / Nc
+
+    where
+
+    .. code-block:: text
+
+        Sc is the number of characters substituted,
+        Dc is the number of characters deleted,
+        Ic is the number of characters inserted,
+        Nc is the number of characters in the reference
+
+    We use the Levenshtein distance to calculate CER. Chinese input should be
+    encoded as unicode. Note that leading and trailing space characters are
+    truncated, and multiple consecutive space characters in a sentence are
+    replaced by a single space character.
+
+    :param reference: The reference sentence.
+    :type reference: str
+    :param hypothesis: The hypothesis sentence.
+    :type hypothesis: str
+    :param ignore_case: Whether to ignore case.
+    :type ignore_case: bool
+    :param remove_space: Whether to remove internal space characters.
+    :type remove_space: bool
+    :return: Character error rate.
+    :rtype: float
+    :raises ValueError: If the reference length is zero.
+    """
+    edit_distance, ref_len = char_errors(reference, hypothesis, ignore_case,
+                                         remove_space)
+
+    if ref_len == 0:
+        raise ValueError("Length of reference should be greater than 0.")
+
+    cer = float(edit_distance) / ref_len
+    return cer
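As a quick illustration of the two public helpers before the unit tests, the expected values below mirror test_wer_1, test_cer_1 and test_cer_2 in the test file that follows:

    from utils.error_rate import wer, cer

    ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night'
    hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last night'
    print(wer(ref, hyp))  # ~0.769231: 10 word edits over 13 reference words

    # remove_space changes what counts as an edit at the character level:
    print(cer('werewolf', 'weae wolf'))                     # 0.25  (2 edits / 8 ref chars)
    print(cer('werewolf', 'weae wolf', remove_space=True))  # 0.125 (1 edit / 8 ref chars)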
+"""Test error rate.""" +import unittest + +from utils import error_rate + + +class TestParse(unittest.TestCase): + def test_wer_1(self): + ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' + hyp = 'i GOT IT TO the FULLEST i LOVE TO portable FROM OF STORES last '\ + 'night' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.769230769231) < 1e-6) + + def test_wer_2(self): + ref = 'as any in england i would say said gamewell proudly that is '\ + 'in his day' + hyp = 'as any in england i would say said came well proudly that is '\ + 'in his day' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.1333333) < 1e-6) + + def test_wer_3(self): + ref = 'the lieutenant governor lilburn w boggs afterward governor '\ + 'was a pronounced mormon hater and throughout the period of '\ + 'the troubles he manifested sympathy with the persecutors' + hyp = 'the lieutenant governor little bit how bags afterward '\ + 'governor was a pronounced warman hater and throughout the '\ + 'period of th troubles he manifests sympathy with the '\ + 'persecutors' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.2692307692) < 1e-6) + + def test_wer_4(self): + ref = 'the wood flamed up splendidly under the large brewing copper '\ + 'and it sighed so deeply' + hyp = 'the wood flame do splendidly under the large brewing copper '\ + 'and its side so deeply' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.2666666667) < 1e-6) + + def test_wer_5(self): + ref = 'all the morning they trudged up the mountain path and at noon '\ + 'unc and ojo sat on a fallen tree trunk and ate the last of '\ + 'the bread which the old munchkin had placed in his pocket' + hyp = 'all the morning they trudged up the mountain path and at noon '\ + 'unc in ojo sat on a fallen tree trunk and ate the last of '\ + 'the bread which the old munchkin had placed in his pocket' + word_error_rate = error_rate.wer(ref, hyp) + self.assertTrue(abs(word_error_rate - 0.027027027) < 1e-6) + + def test_wer_6(self): + ref = 'i UM the PHONE IS i LEFT THE portable PHONE UPSTAIRS last night' + word_error_rate = error_rate.wer(ref, ref) + self.assertEqual(word_error_rate, 0.0) + + def test_wer_7(self): + ref = ' ' + hyp = 'Hypothesis sentence' + with self.assertRaises(ValueError): + word_error_rate = error_rate.wer(ref, hyp) + + def test_cer_1(self): + ref = 'werewolf' + hyp = 'weae wolf' + char_error_rate = error_rate.cer(ref, hyp) + self.assertTrue(abs(char_error_rate - 0.25) < 1e-6) + + def test_cer_2(self): + ref = 'werewolf' + hyp = 'weae wolf' + char_error_rate = error_rate.cer(ref, hyp, remove_space=True) + self.assertTrue(abs(char_error_rate - 0.125) < 1e-6) + + def test_cer_3(self): + ref = 'were wolf' + hyp = 'were wolf' + char_error_rate = error_rate.cer(ref, hyp) + self.assertTrue(abs(char_error_rate - 0.0) < 1e-6) + + def test_cer_4(self): + ref = 'werewolf' + char_error_rate = error_rate.cer(ref, ref) + self.assertEqual(char_error_rate, 0.0) + + def test_cer_5(self): + ref = u'我是中国人' + hyp = u'我是 美洲人' + char_error_rate = error_rate.cer(ref, hyp) + self.assertTrue(abs(char_error_rate - 0.6) < 1e-6) + + def test_cer_6(self): + ref = u'我 是 中 国 人' + hyp = u'我 是 美 洲 人' + char_error_rate = error_rate.cer(ref, hyp, remove_space=True) + self.assertTrue(abs(char_error_rate - 0.4) < 1e-6) + + def test_cer_7(self): + ref = u'我是中国人' + char_error_rate = error_rate.cer(ref, ref) + 
+        self.assertEqual(char_error_rate, 0.0)
+
+    def test_cer_8(self):
+        ref = ''
+        hyp = 'Hypothesis'
+        with self.assertRaises(ValueError):
+            char_error_rate = error_rate.cer(ref, hyp)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/examples/transv1.8to2.x/utils/utility.py b/examples/transv1.8to2.x/utils/utility.py
new file mode 100644
index 00000000..f9790843
--- /dev/null
+++ b/examples/transv1.8to2.x/utils/utility.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Contains common utility functions."""
+import distutils.util
+
+
+def print_arguments(args):
+    """Print argparse's arguments.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        parser.add_argument("name", default="John", type=str, help="User name.")
+        args = parser.parse_args()
+        print_arguments(args)
+
+    :param args: Input argparse.Namespace for printing.
+    :type args: argparse.Namespace
+    """
+    print("----------- Configuration Arguments -----------")
+    for arg, value in sorted(vars(args).items()):
+        print("%s: %s" % (arg, value))
+    print("------------------------------------------------")
+
+
+def add_arguments(argname, type, default, help, argparser, **kwargs):
+    """Add argparse's argument.
+
+    Usage:
+
+    .. code-block:: python
+
+        parser = argparse.ArgumentParser()
+        add_arguments("name", str, "John", "User name.", parser)
+        args = parser.parse_args()
+    """
+    type = distutils.util.strtobool if type == bool else type
+    argparser.add_argument(
+        "--" + argname,
+        default=default,
+        type=type,
+        help=help + ' Default: %(default)s.',
+        **kwargs)
diff --git a/examples/transv1.8to2.x/utils/utility.sh b/examples/transv1.8to2.x/utils/utility.sh
new file mode 100644
index 00000000..baae0474
--- /dev/null
+++ b/examples/transv1.8to2.x/utils/utility.sh
@@ -0,0 +1,23 @@
+download() {
+    URL=$1
+    MD5=$2
+    TARGET=$3
+
+    if [ -e "$TARGET" ]; then
+        md5_result=`md5sum "$TARGET" | awk '{print $1}'`
+        if [ "$MD5" == "$md5_result" ]; then
+            echo "$TARGET already exists, download skipped."
+            return 0
+        fi
+    fi
+
+    wget -c $URL -O "$TARGET"
+    if [ $? -ne 0 ]; then
+        return 1
+    fi
+
+    md5_result=`md5sum "$TARGET" | awk '{print $1}'`
+    if [ "$MD5" != "$md5_result" ]; then
+        return 1
+    fi
+}
-- 
GitLab