{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "medieval-monday", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/workspace/DeepSpeech-2.x\n" ] }, { "data": { "text/plain": [ "'/workspace/DeepSpeech-2.x'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "%cd ..\n", "%pwd" ] }, { "cell_type": "code", "execution_count": 2, "id": "emerging-meter", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " def convert_to_list(value, n, name, dtype=np.int):\n" ] } ], "source": [ "import math\n", "import random\n", "import tarfile\n", "import logging\n", "import numpy as np\n", "from collections import namedtuple\n", "from functools import partial\n", "\n", "import paddle\n", "from paddle.io import Dataset\n", "from paddle.io import DataLoader\n", "from paddle.io import BatchSampler\n", "from paddle.io import DistributedBatchSampler\n", "from paddle import distributed as dist\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "id": "excessive-american", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": 3, "id": "naval-brave", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: 
DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n", "WARNING:root:register user softmax to paddle, remove this when fixed!\n", "WARNING:root:register user log_softmax to paddle, remove this when fixed!\n", "WARNING:root:register user sigmoid to paddle, remove this when fixed!\n", "WARNING:root:register user log_sigmoid to paddle, remove this when fixed!\n", "WARNING:root:register user relu to paddle, remove this when fixed!\n", "WARNING:root:override cat of paddle if exists or register, remove this when fixed!\n", "WARNING:root:override item of paddle.Tensor if exists or register, remove this when fixed!\n", "WARNING:root:override long of paddle.Tensor if exists or register, remove this when fixed!\n", "WARNING:root:override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", "WARNING:root:override eq of paddle.Tensor if exists or register, remove this when fixed!\n", "WARNING:root:override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", "WARNING:root:override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", "WARNING:root:register user view to paddle.Tensor, remove this when fixed!\n", "WARNING:root:register user view_as to paddle.Tensor, remove this when fixed!\n", "WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n", "WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", "WARNING:root:register user fill_ to paddle.Tensor, remove this when fixed!\n", "WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n", "WARNING:root:register user softmax to paddle.Tensor, remove this when fixed!\n", 
import sys
import argparse
import functools
from deepspeech.utils.utility import add_arguments, print_arguments

# Configuration for the data-loader checks in this notebook.
# `add_arg` is the project helper `add_arguments` with `argparser` bound to
# `parser`; every call below passes (name, type, default, help) plus optional
# keyword arguments such as `choices`.
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples',      int,    5,      "# of samples to infer.")
add_arg('beam_size',        int,    500,    "Beam search width.")
add_arg('num_proc_bsearch', int,    8,      "# of CPUs for beam search.")
add_arg('num_conv_layers',  int,    2,      "# of convolution layers.")
add_arg('num_rnn_layers',   int,    3,      "# of recurrent layers.")
add_arg('rnn_layer_size',   int,    2048,   "# of recurrent cells per layer.")
add_arg('alpha',            float,  2.5,    "Coef of LM for beam search.")
add_arg('beta',             float,  0.3,    "Coef of WC for beam search.")
add_arg('cutoff_prob',      float,  1.0,    "Cutoff probability for pruning.")
add_arg('cutoff_top_n',     int,    40,     "Cutoff number for pruning.")
add_arg('use_gru',          bool,   False,  "Use GRUs instead of simple RNNs.")
add_arg('use_gpu',          bool,   True,   "Use GPU or not.")
add_arg('share_rnn_weights',bool,   True,   "Share input-hidden weights across "
                                            "bi-directional RNNs. Not for GRU.")
add_arg('unit_type',        str,
        'char',
        "Options: char, word, spm.",
        choices=['char', 'word', 'spm'])
add_arg('spm_model_prefix', str,
        'examples/tiny/s1/data/spm_bpe',
        "spm model prefix.",)
add_arg('infer_manifest',   str,
        'examples/tiny/s1/data/manifest.tiny',
        "Filepath of manifest to infer.")
add_arg('mean_std_path',    str,
        'examples/tiny/s1/data/mean_std.npz',
        "Filepath of normalizer's mean & std.")
add_arg('vocab_path',       str,
        'examples/tiny/s1/data/vocab.txt',
        "Filepath of vocabulary.")
add_arg('lang_model_path',  str,
        'models/lm/common_crawl_00.prune01111.trie.klm',
        "Filepath for language model.")
add_arg('model_path',       str,
        'examples/tiny/s1/checkpoints/step_final',
        "If None, the training starts from scratch, "
        "otherwise, it resumes from the pre-trained model.")
add_arg('decoding_method',  str,
        'ctc_beam_search',
        "Decoding method. Options: ctc_beam_search, ctc_greedy",
        choices = ['ctc_beam_search', 'ctc_greedy'])
add_arg('error_rate_type',  str,
        'wer',
        "Error rate type for evaluation.",
        choices=['wer', 'cer'])
# 'fbank' is the default, so it has to be a legal choice as well. The default
# itself was never checked against `choices` (it shows up unchanged in the
# printed args), so the mismatch only rejected an explicit 'fbank' value.
add_arg('specgram_type',    str,
        'fbank',
        "Audio feature type. Options: linear, mfcc, fbank.",
        choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim',         int,    80,     "mfcc or fbank feat dim.")
add_arg('delta_delta',      bool,   False,  "delta delta")
# yapf: enable

# Parse an explicit empty argv: the notebook kernel's own command line is not
# consulted and every option takes the default declared above.
args = parser.parse_args([])
print(vars(args))
Instead of using dual, use the functions directly from numpy or scipy.\n", " from numpy.dual import register_func\n", "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n", "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " long_ = _make_signed(np.long)\n", "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. 
from deepspeech.frontend.utility import read_manifest
from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
from deepspeech.frontend.speech import SpeechSegment
from deepspeech.frontend.normalizer import FeatureNormalizer


from deepspeech.io.collator import SpeechCollator
from deepspeech.io.dataset import ManifestDataset
from deepspeech.io.sampler import SortagradDistributedBatchSampler
from deepspeech.io.sampler import SortagradBatchSampler
from deepspeech.io import create_dataloader

# Keyword options for the loader that keeps the transcription text.
# They are collected in one mapping so the settings can be read as a table;
# the values are exactly the ones handed to `create_dataloader` below.
raw_text_loader_options = dict(
    # Input files, taken from the parsed notebook arguments.
    manifest_path=args.infer_manifest,
    unit_type=args.unit_type,
    vocab_filepath=args.vocab_path,
    mean_std_filepath=args.mean_std_path,
    spm_model_prefix=args.spm_model_prefix,
    # '{}' is an empty JSON object — presumably "no augmentation"; confirm
    # against the augmentation pipeline.
    augmentation_config='{}',
    # Length / ratio bounds.
    max_input_len=27.0,
    min_input_len=0.0,
    max_output_len=float('inf'),
    min_output_len=0.0,
    max_output_input_ratio=float('inf'),
    min_output_input_ratio=0.0,
    # Feature extraction settings.
    stride_ms=10.0,
    window_ms=20.0,
    max_freq=None,
    specgram_type=args.specgram_type,
    feat_dim=args.feat_dim,
    delta_delta=args.delta_delta,
    use_dB_normalization=True,
    random_seed=0,
    # With this flag on, the recorded output of the next cell shows labels
    # that decode with chr() into the transcript characters.
    keep_transcription_text=True,
    # Batching behaviour.
    is_training=False,
    batch_size=args.num_samples,
    num_workers=0,
    sortagrad=True,
    shuffle_method=None,
    dist=False,
)
batch_reader = create_dataloader(**raw_text_loader_options)
"classified-melissa", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n", "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py:354: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " if arr.dtype == np.object:\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n", " [[116, 104, 101, 32, 116, 119, 101, 110, 116, 105, 101, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n", " [119, 104, 101, 114, 101, 32, 105, 115, 32, 116, 104, 97, 116, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n", " [116, 101, 110, 32, 115, 101, 99, 111, 110, 100, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n", " [104, 101, 32, 100, 111, 101, 115, 110, 39, 116, 32, 119, 111, 114, 107, 32, 97, 116, 32, 97, 108, 108, -1 ],\n", " [119, 104, 101, 114, 101, 32, 105, 115, 32, 109, 121, 32, 98, 114, 111, 116, 104, 101, 114, 32, 110, 111, 119]])\n", "test raw: the twenties\n", "test raw: where is my brother now\n", "audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n", " [163, 173, 184, 190, 203])\n", "test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", " [12, 13, 11, 22, 
23])\n", "audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", " [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n", " [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n", " [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", " [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n", " [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n", " [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", " [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n", " [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n", " [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", " [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n", " [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n", " [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. 
# Peek at the first batch only: print the padded label tensor, the first and
# last transcripts decoded character by character, and the audio features.
for idx, batch in enumerate(batch_reader()):
    audio, audio_len, text, text_len = batch
    # Each label row is padded (with -1 in the recorded output), so only the
    # leading `text_len[k]` entries of row k are decoded.
    first_row = text[0][:int(text_len[0])]
    last_row = text[-1][:int(text_len[-1])]
    print('test:', text)
    print("test raw:", ''.join(map(chr, first_row)))
    print("test raw:", ''.join(map(chr, last_row)))
    print('audio len:', audio_len)
    print('test len:', text_len)
    print('audio:', audio)
    break
Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", " [12, 13, 11, 22, 23])\n", "audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n", " [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n", " [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n", " [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", " [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n", " [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n", " [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", " [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n", " [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n", " [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", "\n", " [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n", " [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n", " [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n", " ...,\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", " [ 0. , 0. , 0. , ..., 0. , 0. , 0. 
keep_transcription_text=False
batch_reader = create_dataloader(
    manifest_path=args.infer_manifest,
    unit_type=args.unit_type,
    vocab_filepath=args.vocab_path,
    mean_std_filepath=args.mean_std_path,
    spm_model_prefix=args.spm_model_prefix,
    augmentation_config='{}',
    max_input_len=27.0,
    min_input_len=0.0,
    max_output_len=float('inf'),
    min_output_len=0.0,
    max_output_input_ratio=float('inf'),
    min_output_input_ratio=0.0,
    stride_ms=10.0,
    window_ms=20.0,
    max_freq=None,
    specgram_type=args.specgram_type,
    feat_dim=args.feat_dim,
    delta_delta=args.delta_delta,
    use_dB_normalization=True,
    random_seed=0,
    keep_transcription_text=keep_transcription_text,
    is_training=False,
    batch_size=args.num_samples,
    num_workers=0,
    sortagrad=True,
    shuffle_method=None,
    dist=False)


def load_vocab(vocab_filepath):
    """Read the vocabulary file into a list indexed by token id.

    Assumes one token per line with the line number (from 0) being the id
    that the loader emits — TODO confirm against the text featurizer.

    Args:
        vocab_filepath: path of the vocabulary text file.

    Returns:
        list of str: the tokens, in file order.
    """
    with open(vocab_filepath, 'r', encoding='utf-8') as vocab_file:
        # Strip only the newline so a token that is itself a blank survives.
        return [line.rstrip('\n') for line in vocab_file]


def decode_transcript(token_ids, length, id2token=None):
    """Turn one padded label row back into a printable string.

    Args:
        token_ids: one row of the label batch.
        length: number of valid (non-padding) entries at the start of the row.
        id2token: token list indexed by id, or None when the row already
            holds character code points (keep_transcription_text=True).

    Returns:
        str: the decoded transcript.

    Raises:
        IndexError: if an id is not covered by `id2token`.
    """
    ids = [int(i) for i in token_ids[:int(length)]]
    if id2token is None:
        return ''.join(chr(i) for i in ids)
    return ''.join(id2token[i] for i in ids)


# With keep_transcription_text=False the labels are not code points: passing
# them to chr() printed control-character garbage for the same utterances
# that decode cleanly when the text is kept. Look the ids up in the
# vocabulary instead.
id2token = None if keep_transcription_text else load_vocab(args.vocab_path)

for idx, (audio, audio_len, text, text_len) in enumerate(batch_reader()):
    print('test:', text)
    print("test raw:", decode_transcript(text[0], text_len[0], id2token))
    print("test raw:", decode_transcript(text[-1], text_len[-1], id2token))
    print('test len:', text_len)
    print('audio:', audio)
    print('audio len:', audio_len)
    break