diff --git a/.notebook/compute_cmvn_loader_test.ipynb b/.notebook/compute_cmvn_loader_test.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..91f1480fb4aeb17e0f669dac76b5d8d36493ec6f --- /dev/null +++ b/.notebook/compute_cmvn_loader_test.ipynb @@ -0,0 +1,409 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "purple-consequence", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x\n" + ] + }, + { + "data": { + "text/plain": [ + "'/workspace/DeepSpeech-2.x'" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "%cd ..\n", + "%pwd" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "defensive-mason", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "patient-convention", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. 
If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " def convert_to_list(value, n, name, dtype=np.int):\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:93] register user softmax to paddle, remove this when fixed!\n", + "2021-04-16 15:30:29,345 - WARNING - register user softmax to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n", + "2021-04-16 15:30:29,346 - WARNING - register user log_softmax to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n", + "2021-04-16 15:30:29,347 - WARNING - register user sigmoid to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n", + "2021-04-16 15:30:29,348 - WARNING - register user log_sigmoid to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:109] register user relu to paddle, remove this when fixed!\n", + "2021-04-16 15:30:29,349 - WARNING - register user relu to paddle, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n", + "2021-04-16 15:30:29,349 - WARNING - override cat of paddle if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n", + "2021-04-16 15:30:29,350 - WARNING - override item of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n", + "2021-04-16 15:30:29,351 - 
WARNING - override long of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", + "2021-04-16 15:30:29,352 - WARNING - override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n", + "2021-04-16 15:30:29,353 - WARNING - override eq of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n", + "2021-04-16 15:30:29,354 - WARNING - override eq of paddle if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", + "2021-04-16 15:30:29,355 - WARNING - override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", + "2021-04-16 15:30:29,356 - WARNING - override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,357 - WARNING - register user view to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,361 - WARNING - register user view_as to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:259] register user masked_fill to 
paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,362 - WARNING - register user masked_fill to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,363 - WARNING - register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,364 - WARNING - register user fill_ to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,365 - WARNING - register user repeat to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,366 - WARNING - register user softmax to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,366 - WARNING - register user sigmoid to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,367 - WARNING - register user relu to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,368 - WARNING - register user type_as to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,369 - WARNING - register user to to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:346] register user 
float to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,370 - WARNING - register user float to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n", + "2021-04-16 15:30:29,370 - WARNING - register user tolist to paddle.Tensor, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n", + "2021-04-16 15:30:29,371 - WARNING - register user glu to paddle.nn.functional, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", + "2021-04-16 15:30:29,372 - WARNING - override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n", + "2021-04-16 15:30:29,377 - WARNING - register user Module to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n", + "2021-04-16 15:30:29,378 - WARNING - register user ModuleList to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n", + "2021-04-16 15:30:29,379 - WARNING - register user GLU to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n", + "2021-04-16 15:30:29,380 - WARNING - register user ConstantPad2d to paddle.nn, remove this when fixed!\n", + "[WARNING 2021/04/16 15:30:29 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n", + "2021-04-16 15:30:29,381 - WARNING - register user export to paddle.jit, remove this when fixed!\n", + 
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/fftpack/__init__.py:103: DeprecationWarning: The module numpy.dual is deprecated. Instead of using dual, use the functions directly from numpy or scipy.\n", + " from numpy.dual import register_func\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/scipy/special/orthogonal.py:81: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " from numpy import (exp, inf, pi, sqrt, floor, sin, cos, around, int,\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:108: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " long_ = _make_signed(np.long)\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/numba/core/types/__init__.py:109: DeprecationWarning: `np.long` is a deprecated alias for `np.compat.long`. 
To silence this warning, use `np.compat.long` by itself. In the likely event your code does not need to work on Python 2 you can use the builtin `int` for which `np.compat.long` is itself an alias. Doing this will not modify any behaviour and is safe. When replacing `np.long`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", + "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", + " ulong = _make_unsigned(np.long)\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Namespace(delta_delta=False, feat_dim=13, manifest_path='examples/aishell/s1/data/manifest.train.raw', num_samples=-1, num_workers=1, output_path='data/librispeech/mean_std.npz', sample_rate=16000, specgram_type='linear', stride_ms=10.0, window_ms=20.0)\n" + ] + } + ], + "source": [ + "import argparse\n", + "import functools\n", + "\n", + "from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n", + "from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n", + "from deepspeech.frontend.normalizer import FeatureNormalizer\n", + "from deepspeech.utils.utility import add_arguments\n", + "from deepspeech.utils.utility import print_arguments\n", + "\n", + "parser = argparse.ArgumentParser(description=__doc__)\n", + "add_arg = functools.partial(add_arguments, argparser=parser)\n", + "# yapf: disable\n", + "add_arg('num_samples', int, -1, \"# of samples to use for statistics.\")\n", + "add_arg('specgram_type', str,\n", + " 'linear',\n", + " \"Audio feature type. 
Options: linear, mfcc, fbank.\",\n", + " choices=['linear', 'mfcc', 'fbank'])\n", + "add_arg('feat_dim', int, 13, \"Audio feature dim.\")\n", + "add_arg('delta_delta', bool,\n", + " False,\n", + " \"Audio feature with delta delta.\")\n", + "add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n", + "add_arg('window_ms', float, 20.0, \"window length in ms.\")\n", + "add_arg('sample_rate', int, 16000, \"target sample rate.\")\n", + "add_arg('manifest_path', str,\n", + " 'examples/aishell/s1/data/manifest.train.raw',\n", + " \"Filepath of manifest to compute normalizer's mean and stddev.\")\n", + "add_arg('num_workers',\n", + " default=1,\n", + " type=int,\n", + " help='num of subprocess workers for processing')\n", + "add_arg('output_path', str,\n", + " 'data/librispeech/mean_std.npz',\n", + " \"Filepath to write mean and stddev to (.npz).\")\n", + "# yapf: enable\n", + "args = parser.parse_args([])\n", + "print(args)" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "enormous-currency", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. 
Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", + " and should_run_async(code)\n" + ] + } + ], + "source": [ + "import random\n", + "\n", + "import numpy as np\n", + "import paddle\n", + "from paddle.io import DataLoader\n", + "from paddle.io import Dataset\n", + "\n", + "from deepspeech.frontend.audio import AudioSegment\n", + "from deepspeech.frontend.utility import load_cmvn\n", + "from deepspeech.frontend.utility import read_manifest\n", + "\n", + "class CollateFunc(object):\n", + " ''' Collate function for AudioDataset\n", + " '''\n", + " def __init__(self):\n", + " pass\n", + " \n", + " def __call__(self, batch):\n", + " mean_stat = None\n", + " var_stat = None\n", + " number = 0\n", + " for feat in batch:\n", + " sums = np.sum(feat, axis=1)\n", + " if mean_stat is None:\n", + " mean_stat = sums\n", + " else:\n", + " mean_stat += sums\n", + "\n", + " square_sums = np.sum(np.square(feat), axis=1)\n", + " if var_stat is None:\n", + " var_stat = square_sums\n", + " else:\n", + " var_stat += square_sums\n", + "\n", + " number += feat.shape[1]\n", + " return paddle.to_tensor(number), paddle.to_tensor(mean_stat), paddle.to_tensor(var_stat)\n", + "\n", + "\n", + "class AudioDataset(Dataset):\n", + " def __init__(self, manifest_path, feature_func, num_samples=-1, rng=None):\n", + " self.feature_func = feature_func\n", + " self._rng = rng if rng is not None else random # rng=None falls back to the stdlib module, which also provides sample()\n", + " manifest = read_manifest(manifest_path)\n", + " if num_samples == -1:\n", + " sampled_manifest = manifest\n", + " else:\n", + " sampled_manifest = self._rng.sample(manifest, num_samples)\n", + " self.items = sampled_manifest\n", + "\n", + " def __len__(self):\n", + " return len(self.items)\n", + "\n", + " def __getitem__(self, idx):\n", + " key = self.items[idx]['feat']\n", + " audioseg = AudioSegment.from_file(key)\n", + " feat = self.feature_func(audioseg) #(D, T)\n", + " return feat" + ] + }, + { + 
"cell_type": "code", + "execution_count": 7, + "id": "armed-semester", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception ignored in: \n", + "Traceback (most recent call last):\n", + " File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 763, in __del__\n", + " self._try_shutdown_all()\n", + " File \"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\", line 590, in _try_shutdown_all\n", + " w.join()\n", + " File \"/usr/local/lib/python3.7/multiprocessing/process.py\", line 140, in join\n", + " res = self._popen.wait(timeout)\n", + " File \"/usr/local/lib/python3.7/multiprocessing/popen_fork.py\", line 48, in wait\n", + " return self.poll(os.WNOHANG if timeout == 0.0 else 0)\n", + " File \"/usr/local/lib/python3.7/multiprocessing/popen_fork.py\", line 28, in poll\n", + " pid, sts = os.waitpid(self.pid, flag)\n", + "KeyboardInterrupt: \n", + "2021-04-16 15:44:43,413 - ERROR - DataLoader reader thread raised an exception!\n" + ] + }, + { + "ename": "SystemError", + "evalue": "(Fatal) Blocking queue is killed because the data reader raises an exception.\n [Hint: Expected killed_ != true, but received killed_:1 == true:1.] 
(at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mSystemError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[1;32m 38\u001b[0m \u001b[0mwav_number\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 39\u001b[0m \u001b[0;31m# for i, batch in enumerate(data_loader()):\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 40\u001b[0;31m \u001b[0;32mfor\u001b[0m \u001b[0mbatch\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mdata_loader\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 41\u001b[0m \u001b[0mnumber\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmean_stat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvar_stat\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mbatch\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 42\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mi\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py\u001b[0m in \u001b[0;36m__next__\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 777\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 778\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0min_dygraph_mode\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 779\u001b[0;31m \u001b[0mdata\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_reader\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mread_next_var_list\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 780\u001b[0m 
\u001b[0;32melse\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 781\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_return_list\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mSystemError\u001b[0m: (Fatal) Blocking queue is killed because the data reader raises an exception.\n [Hint: Expected killed_ != true, but received killed_:1 == true:1.] (at /paddle/paddle/fluid/operators/reader/blocking_queue.h:158)\n" + ] + } + ], + "source": [ + "\n", + "augmentation_pipeline = AugmentationPipeline('{}')\n", + "audio_featurizer = AudioFeaturizer(\n", + " specgram_type=args.specgram_type,\n", + " feat_dim=args.feat_dim,\n", + " delta_delta=args.delta_delta,\n", + " stride_ms=args.stride_ms,\n", + " window_ms=args.window_ms,\n", + " n_fft=None,\n", + " max_freq=None,\n", + " target_sample_rate=args.sample_rate,\n", + " use_dB_normalization=True,\n", + " target_dB=-20)\n", + "\n", + "def augment_and_featurize(audio_segment):\n", + " augmentation_pipeline.transform_audio(audio_segment)\n", + " return audio_featurizer.featurize(audio_segment)\n", + "\n", + "\n", + "collate_func = CollateFunc()\n", + "\n", + "dataset = AudioDataset(\n", + " args.manifest_path,\n", + " augment_and_featurize, \n", + " args.num_samples)\n", + "\n", + "batch_size = 20\n", + "data_loader = DataLoader(\n", + " dataset,\n", + " batch_size=batch_size,\n", + " shuffle=False,\n", + " num_workers=args.num_workers,\n", + " collate_fn=collate_func)\n", + "\n", + "with paddle.no_grad():\n", + " all_mean_stat = None\n", + " all_var_stat = None\n", + " all_number = 0\n", + " wav_number = 0\n", + " # for i, batch in enumerate(data_loader()):\n", + " for batch in data_loader():\n", + " number, mean_stat, var_stat = batch\n", + " if all_mean_stat is None: # first batch seeds the running sums; the loop no longer defines i\n", + " all_mean_stat = mean_stat\n", + " all_var_stat = var_stat\n", + " else:\n", + " all_mean_stat += mean_stat\n", + " all_var_stat += var_stat\n", + " all_number += 
number\n", + " wav_number += batch_size\n", + "\n", + " if wav_number % 1000 == 0:\n", + " print('process {} wavs,{} frames'.format(wav_number,\n", + " all_number))\n", + "\n", + "cmvn_info = {\n", + " 'mean_stat': list(all_mean_stat.tolist()),\n", + " 'var_stat': list(all_var_stat.tolist()),\n", + " 'frame_num': all_number\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "danish-executive", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "accurate-terminal", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dominant-abuse", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/.notebook/u2_model.ipynb b/.notebook/u2_model.ipynb index c3ba4fd6d15d2d9b8fe83ccecfd0b93364987842..f9e7c1eecd879b75f231dba3fed860879361c8ed 100644 --- a/.notebook/u2_model.ipynb +++ b/.notebook/u2_model.ipynb @@ -3,7 +3,7 @@ { "cell_type": "code", "execution_count": 1, - "id": "future-wesley", + "id": "choice-grade", "metadata": {}, "outputs": [ { @@ -32,7 +32,7 @@ { "cell_type": "code", "execution_count": 2, - "id": "eleven-istanbul", + "id": "broke-broad", "metadata": {}, "outputs": [ { @@ -42,39 +42,39 @@ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/layers/utils.py:26: DeprecationWarning: `np.int` is a deprecated alias for the builtin `int`. To silence this warning, use `int` by itself. Doing this will not modify any behavior and is safe. 
When replacing `np.int`, you may wish to use e.g. `np.int64` or `np.int32` to specify the precision. If you wish to review your current use, check the release note link for additional information.\n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " def convert_to_list(value, n, name, dtype=np.int):\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:93] register user softmax to paddle, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:109] register user relu to paddle, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api 
dependent on it), remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:434] register user ModuleList to 
paddle.nn, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n", - "[WARNING 2021/04/16 10:35:27 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n" + "register user softmax to paddle, remove this when fixed!\n", + "register user log_softmax to paddle, remove this when fixed!\n", + "register user sigmoid to paddle, remove this when fixed!\n", + "register user log_sigmoid to paddle, remove this when fixed!\n", + "register user relu to paddle, remove this when fixed!\n", + "override cat of paddle if exists or register, remove this when fixed!\n", + "override item of paddle.Tensor if exists or register, remove this when fixed!\n", + "override long of paddle.Tensor if exists or register, remove this when fixed!\n", + "override new_full of paddle.Tensor if exists or register, remove this when fixed!\n", + "override eq of paddle.Tensor if exists or register, remove this when fixed!\n", + "override eq of paddle if exists or register, remove this when fixed!\n", + "override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n", + "override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n", + "register user view to paddle.Tensor, remove this when fixed!\n", + "register user view_as to paddle.Tensor, remove this when fixed!\n", + "register user masked_fill to paddle.Tensor, remove this when fixed!\n", + "register user masked_fill_ to paddle.Tensor, remove this when fixed!\n", + "register user fill_ to paddle.Tensor, remove this when fixed!\n", + "register user repeat to paddle.Tensor, remove this when fixed!\n", + "register user softmax to paddle.Tensor, remove this when fixed!\n", + "register user sigmoid to paddle.Tensor, remove this when 
fixed!\n", + "register user relu to paddle.Tensor, remove this when fixed!\n", + "register user type_as to paddle.Tensor, remove this when fixed!\n", + "register user to to paddle.Tensor, remove this when fixed!\n", + "register user float to paddle.Tensor, remove this when fixed!\n", + "register user tolist to paddle.Tensor, remove this when fixed!\n", + "register user glu to paddle.nn.functional, remove this when fixed!\n", + "override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n", + "register user Module to paddle.nn, remove this when fixed!\n", + "register user ModuleList to paddle.nn, remove this when fixed!\n", + "register user GLU to paddle.nn, remove this when fixed!\n", + "register user ConstantPad2d to paddle.nn, remove this when fixed!\n", + "register user export to paddle.jit, remove this when fixed!\n" ] } ], @@ -91,7 +91,7 @@ { "cell_type": "code", "execution_count": 3, - "id": "provincial-mexico", + "id": "permanent-summary", "metadata": {}, "outputs": [ { @@ -100,8 +100,7 @@ "text": [ "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. 
Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n", " and should_run_async(code)\n", - "[INFO 2021/04/16 10:35:28 u2.py:834] U2 Encoder type: conformer\n", - "[INFO 2021/04/16 10:35:28 u2.py:834] U2 Encoder type: conformer\n" + "[INFO 2021/04/19 06:57:01 u2.py:834] U2 Encoder type: conformer\n" ] }, { @@ -112,8 +111,8 @@ "encoder.embed.conv.0.bias | [256] | 256 | True\n", "encoder.embed.conv.2.weight | [256, 256, 3, 3] | 589824 | True\n", "encoder.embed.conv.2.bias | [256] | 256 | True\n", - "encoder.embed.linear.weight | [4864, 256] | 1245184 | True\n", - "encoder.embed.linear.bias | [256] | 256 | True\n", + "encoder.embed.out.0.weight | [4864, 256] | 1245184 | True\n", + "encoder.embed.out.0.bias | [256] | 256 | True\n", "encoder.after_norm.weight | [256] | 256 | True\n", "encoder.after_norm.bias | [256] | 256 | True\n", "encoder.encoders.0.self_attn.pos_bias_u | [4, 64] | 256 | True\n", @@ -815,7 +814,7 @@ { "cell_type": "code", "execution_count": 4, - "id": "choice-psychology", + "id": "sapphire-agent", "metadata": {}, "outputs": [ { @@ -828,8 +827,8 @@ "encoder.embed.conv.0.bias | [256] | 256\n", "encoder.embed.conv.2.weight | [256, 256, 3, 3] | 589824\n", "encoder.embed.conv.2.bias | [256] | 256\n", - "encoder.embed.linear.weight | [4864, 256] | 1245184\n", - "encoder.embed.linear.bias | [256] | 256\n", + "encoder.embed.out.0.weight | [4864, 256] | 1245184\n", + "encoder.embed.out.0.bias | [256] | 256\n", "encoder.after_norm.weight | [256] | 256\n", "encoder.after_norm.bias | [256] | 256\n", "encoder.encoders.0.self_attn.pos_bias_u | [4, 64] | 256\n", @@ -1427,13 +1426,7 @@ "decoder.decoders.3.self_attn.linear_v.bias | [256] | 256\n", "decoder.decoders.3.self_attn.linear_out.weight | [256, 256] | 65536\n", "decoder.decoders.3.self_attn.linear_out.bias | [256] | 256\n", - "decoder.decoders.3.src_attn.linear_q.weight | [256, 256] | 65536\n" - ] - }, - 
{ - "name": "stdout", - "output_type": "stream", - "text": [ + "decoder.decoders.3.src_attn.linear_q.weight | [256, 256] | 65536\n", "decoder.decoders.3.src_attn.linear_q.bias | [256] | 256\n", "decoder.decoders.3.src_attn.linear_k.weight | [256, 256] | 65536\n", "decoder.decoders.3.src_attn.linear_k.bias | [256] | 256\n", @@ -1446,7 +1439,13 @@ "decoder.decoders.3.feed_forward.w_2.weight | [2048, 256] | 524288\n", "decoder.decoders.3.feed_forward.w_2.bias | [256] | 256\n", "decoder.decoders.3.norm1.weight | [256] | 256\n", - "decoder.decoders.3.norm1.bias | [256] | 256\n", + "decoder.decoders.3.norm1.bias | [256] | 256\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "decoder.decoders.3.norm2.weight | [256] | 256\n", "decoder.decoders.3.norm2.bias | [256] | 256\n", "decoder.decoders.3.norm3.weight | [256] | 256\n", @@ -1527,20 +1526,681 @@ }, { "cell_type": "code", - "execution_count": 5, - "id": "enabling-botswana", + "execution_count": 6, + "id": "ruled-invitation", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "U2Model(\n", + " (encoder): ConformerEncoder(\n", + " (global_cmvn): GlobalCMVN()\n", + " (embed): Conv2dSubsampling4(\n", + " (pos_enc): RelPositionalEncoding(\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " )\n", + " (conv): Sequential(\n", + " (0): Conv2D(1, 256, kernel_size=[3, 3], stride=[2, 2], data_format=NCHW)\n", + " (1): ReLU()\n", + " (2): Conv2D(256, 256, kernel_size=[3, 3], stride=[2, 2], data_format=NCHW)\n", + " (3): ReLU()\n", + " )\n", + " (out): Sequential(\n", + " (0): Linear(in_features=4864, out_features=256, dtype=float32)\n", + " )\n", + " )\n", + " (after_norm): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (encoders): LayerList(\n", + " (0): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): 
Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (1): ConformerEncoderLayer(\n", + " (self_attn): 
RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, 
out_features=256, dtype=float32)\n", + " )\n", + " (2): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " 
(dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (3): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): 
LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (4): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], 
epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (5): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): 
LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (6): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): 
Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (7): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, 
data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (8): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], 
data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (9): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", 
+ " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (10): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, 
mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (11): ConformerEncoderLayer(\n", + " (self_attn): RelPositionMultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " (linear_pos): Linear(in_features=256, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (feed_forward_macaron): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, 
out_features=2048, dtype=float32)\n", + " (activation): Swish()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (conv_module): ConvolutionModule(\n", + " (pointwise_conv1): Conv1D(256, 512, kernel_size=[1], data_format=NCL)\n", + " (depthwise_conv): Conv1D(256, 256, kernel_size=[15], padding=7, groups=256, data_format=NCL)\n", + " (norm): BatchNorm1D(num_features=256, momentum=0.9, epsilon=1e-05)\n", + " (pointwise_conv2): Conv1D(256, 256, kernel_size=[1], data_format=NCL)\n", + " (activation): Swish()\n", + " )\n", + " (norm_ff): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_mha): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_ff_macaron): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_conv): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm_final): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " )\n", + " )\n", + " (decoder): TransformerDecoder(\n", + " (embed): Sequential(\n", + " (0): Embedding(4233, 256, sparse=False)\n", + " (1): PositionalEncoding(\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " )\n", + " )\n", + " (after_norm): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (output_layer): Linear(in_features=256, out_features=4233, dtype=float32)\n", + " (decoders): LayerList(\n", + " (0): DecoderLayer(\n", + " (self_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, 
axis=None, mode=upscale_in_train)\n", + " )\n", + " (src_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): ReLU()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (norm1): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm2): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm3): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear1): Linear(in_features=512, out_features=256, dtype=float32)\n", + " (concat_linear2): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (1): DecoderLayer(\n", + " (self_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (src_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, 
out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): ReLU()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (norm1): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm2): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm3): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear1): Linear(in_features=512, out_features=256, dtype=float32)\n", + " (concat_linear2): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (2): DecoderLayer(\n", + " (self_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (src_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): ReLU()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): 
Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (norm1): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm2): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm3): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear1): Linear(in_features=512, out_features=256, dtype=float32)\n", + " (concat_linear2): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (3): DecoderLayer(\n", + " (self_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (src_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): ReLU()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (norm1): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm2): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm3): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear1): 
Linear(in_features=512, out_features=256, dtype=float32)\n", + " (concat_linear2): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (4): DecoderLayer(\n", + " (self_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (src_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): ReLU()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (norm1): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm2): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm3): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear1): Linear(in_features=512, out_features=256, dtype=float32)\n", + " (concat_linear2): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " (5): DecoderLayer(\n", + " (self_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, 
dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (src_attn): MultiHeadedAttention(\n", + " (linear_q): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_k): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_v): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (linear_out): Linear(in_features=256, out_features=256, dtype=float32)\n", + " (dropout): Dropout(p=0.0, axis=None, mode=upscale_in_train)\n", + " )\n", + " (feed_forward): PositionwiseFeedForward(\n", + " (w_1): Linear(in_features=256, out_features=2048, dtype=float32)\n", + " (activation): ReLU()\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (w_2): Linear(in_features=2048, out_features=256, dtype=float32)\n", + " )\n", + " (norm1): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm2): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (norm3): LayerNorm(normalized_shape=[256], epsilon=1e-12)\n", + " (dropout): Dropout(p=0.1, axis=None, mode=upscale_in_train)\n", + " (concat_linear1): Linear(in_features=512, out_features=256, dtype=float32)\n", + " (concat_linear2): Linear(in_features=512, out_features=256, dtype=float32)\n", + " )\n", + " )\n", + " )\n", + " (ctc): CTCDecoder(\n", + " (ctc_lo): Linear(in_features=256, out_features=4233, dtype=float32)\n", + " (criterion): CTCLoss(\n", + " (loss): CTCLoss()\n", + " )\n", + " )\n", + " (criterion_att): LabelSmoothingLoss(\n", + " (criterion): KLDivLoss()\n", + " )\n", + ")\n" + ] + } + ], + "source": [ + "print(model)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "fleet-despite", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "compute_cmvn_loader_test.ipynb 
jit_infer.ipynb\r\n", "dataloader.ipynb mask_and_masked_fill_test.ipynb\r\n", "dataloader_with_tokens_tokenids.ipynb model.npz\r\n", "data.npz python_test.ipynb\r\n", - "encoder.npz train_test.ipynb\r\n", - "hack_api_test.ipynb u2_model.ipynb\r\n", - "jit_infer.ipynb\r\n" + "decoder.npz train_test.ipynb\r\n", + "encoder.npz u2_model.ipynb\r\n", + "hack_api_test.ipynb\r\n" ] } ], @@ -1550,8 +2210,8 @@ }, { "cell_type": "code", - "execution_count": 6, - "id": "acute-hunter", + "execution_count": 8, + "id": "abroad-oracle", "metadata": {}, "outputs": [], "source": [ @@ -1565,8 +2225,8 @@ }, { "cell_type": "code", - "execution_count": 7, - "id": "impossible-mount", + "execution_count": 9, + "id": "false-instrument", "metadata": {}, "outputs": [ { @@ -1661,58 +2321,58 @@ }, { "cell_type": "code", - "execution_count": 8, - "id": "dying-ideal", + "execution_count": 10, + "id": "arctic-proxy", "metadata": {}, "outputs": [], "source": [ "# ['BAC009S0739W0246', 'BAC009S0727W0424', 'BAC009S0753W0412', 'BAC009S0756W0206', 'BAC009S0740W0414', 'BAC009S0728W0426', 'BAC009S0739W0214', 'BAC009S0753W0423', 'BAC009S0734W0201', 'BAC009S0740W0427', 'BAC009S0730W0423', 'BAC009S0728W0367', 'BAC009S0730W0418', 'BAC009S0727W0157', 'BAC009S0749W0409', 'BAC009S0727W0418']\n", "# torch.Size([16, 207, 80])\n", - "# tensor([[[ 8.9935, 9.5377, 9.1939, ..., 10.4592, 9.5223, 8.2839],\n", - "# [ 9.7994, 10.4060, 9.2669, ..., 10.2340, 9.5668, 8.7706],\n", - "# [10.6888, 10.3949, 8.0560, ..., 9.9335, 10.1175, 8.1560],\n", + "# tensor([[[ 8.9946, 9.5383, 9.1916, ..., 10.5074, 9.5633, 8.2564],\n", + "# [ 9.7988, 10.4052, 9.2651, ..., 10.2512, 9.5440, 8.8738],\n", + "# [10.6891, 10.3955, 8.0535, ..., 9.9067, 10.0649, 8.0509],\n", "# ...,\n", - "# [ 9.2174, 9.6504, 8.5052, ..., 9.6707, 8.7834, 8.0564],\n", - "# [10.1287, 9.9347, 9.3788, ..., 9.5698, 9.8277, 8.9262],\n", - "# [ 9.0959, 7.1305, 9.4666, ..., 9.5228, 8.9921, 7.4808]],\n", + "# [ 9.2180, 9.6507, 8.5053, ..., 9.6872, 8.7425, 7.9865],\n", + "# 
[10.1291, 9.9352, 9.3798, ..., 9.5639, 9.8260, 8.9795],\n", + "# [ 9.0955, 7.1338, 9.4680, ..., 9.4727, 9.0212, 7.4479]],\n", "\n", - "# [[11.4309, 10.6716, 6.0973, ..., 9.3820, 8.7208, 7.6153],\n", - "# [ 9.7314, 7.8097, 7.5711, ..., 10.0005, 9.2962, 7.5479],\n", - "# [10.6502, 10.6007, 8.4671, ..., 9.2416, 9.2412, 8.1083],\n", + "# [[11.4310, 10.6719, 6.0841, ..., 9.3827, 8.7297, 7.5316],\n", + "# [ 9.7317, 7.8105, 7.5715, ..., 10.0430, 9.2436, 7.3541],\n", + "# [10.6502, 10.6006, 8.4678, ..., 9.2814, 9.1869, 8.0703],\n", "# ...,\n", - "# [ 9.0977, 9.2650, 8.0763, ..., 8.3842, 8.4285, 8.0505],\n", - "# [10.4615, 10.1473, 6.7677, ..., 9.8455, 9.6548, 8.2006],\n", - "# [ 7.7949, 5.6219, 7.9746, ..., 9.9617, 9.8019, 8.0486]],\n", + "# [ 9.0970, 9.2637, 8.0753, ..., 8.4318, 8.3705, 8.0029],\n", + "# [10.4617, 10.1478, 6.7693, ..., 9.7794, 9.5775, 8.0807],\n", + "# [ 7.7944, 5.6211, 7.9751, ..., 9.9972, 9.8497, 8.0313]],\n", "\n", - "# [[ 7.3481, 7.8987, 7.5786, ..., 11.6611, 10.4626, 9.0665],\n", - "# [ 8.6274, 8.4604, 7.4981, ..., 12.4233, 11.0101, 8.9767],\n", - "# [ 9.8315, 10.2812, 8.9717, ..., 12.1325, 10.4014, 9.0196],\n", + "# [[ 7.3456, 7.8964, 7.5796, ..., 11.6310, 10.4513, 9.1236],\n", + "# [ 8.6287, 8.4631, 7.4992, ..., 12.4160, 10.9757, 8.9426],\n", + "# [ 9.8314, 10.2813, 8.9724, ..., 12.1387, 10.4017, 9.0055],\n", "# ...,\n", - "# [ 7.0872, 7.4009, 6.8090, ..., 9.3759, 9.2273, 8.1752],\n", + "# [ 7.0896, 7.4055, 6.8143, ..., 9.3252, 9.2732, 8.3534],\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],\n", "\n", "# ...,\n", "\n", - "# [[10.9333, 10.4647, 7.7200, ..., 10.3486, 9.2818, 7.2852],\n", - "# [10.4503, 9.9080, 9.0299, ..., 9.9633, 9.4876, 7.6330],\n", - "# [10.4877, 9.8130, 9.8961, ..., 9.6017, 9.3175, 7.6303],\n", + "# [[10.9332, 10.4644, 7.7203, ..., 10.3488, 9.3023, 7.1553],\n", + "# [10.4499, 9.9070, 9.0293, ..., 9.9525, 9.4141, 7.5593],\n", + "# [10.4877, 9.8126, 
9.8952, ..., 9.5866, 9.3413, 7.7849],\n", "# ...,\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],\n", "\n", - "# [[ 9.9448, 9.5868, 8.2200, ..., 11.6113, 11.0576, 8.7598],\n", - "# [ 7.6800, 8.3231, 7.5294, ..., 11.0965, 10.5442, 9.3556],\n", - "# [ 8.6248, 9.6746, 9.8406, ..., 11.4058, 10.9484, 8.9749],\n", + "# [[ 9.9444, 9.5859, 8.2203, ..., 11.5886, 11.0450, 8.8171],\n", + "# [ 7.6784, 8.3224, 7.5330, ..., 11.0551, 10.5357, 9.2746],\n", + "# [ 8.6262, 9.6759, 9.8410, ..., 11.3788, 10.9221, 8.9914],\n", "# ...,\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000]],\n", "\n", - "# [[ 8.1097, 7.7619, 6.7079, ..., 12.6548, 11.4666, 11.0747],\n", - "# [11.3805, 11.2223, 8.6587, ..., 12.7926, 12.2433, 11.7217],\n", - "# [10.6778, 9.9210, 8.0447, ..., 13.5741, 12.5711, 11.1356],\n", + "# [[ 8.1079, 7.7590, 6.7103, ..., 12.6506, 11.4662, 11.0615],\n", + "# [11.3803, 11.2220, 8.6589, ..., 12.8106, 12.2222, 11.6893],\n", + "# [10.6777, 9.9206, 8.0461, ..., 13.5729, 12.5624, 11.1550],\n", "# ...,\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", "# [ 0.0000, 0.0000, 0.0000, ..., 0.0000, 0.0000, 0.0000],\n", @@ -1740,20 +2400,21 @@ }, { "cell_type": "code", - "execution_count": 9, - "id": "pleased-isaac", + "execution_count": 12, + "id": "defined-brooks", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "compute_cmvn_loader_test.ipynb\t jit_infer.ipynb\r\n", "dataloader.ipynb\t\t mask_and_masked_fill_test.ipynb\r\n", "dataloader_with_tokens_tokenids.ipynb model.npz\r\n", "data.npz\t\t\t python_test.ipynb\r\n", - "encoder.npz\t\t\t train_test.ipynb\r\n", - "hack_api_test.ipynb\t\t u2_model.ipynb\r\n", - "jit_infer.ipynb\r\n" + "decoder.npz\t\t\t 
train_test.ipynb\r\n", + "encoder.npz\t\t\t u2_model.ipynb\r\n", + "hack_api_test.ipynb\r\n" ] } ], @@ -1777,22 +2438,22 @@ { "cell_type": "code", "execution_count": null, - "id": "appreciated-carpet", + "id": "exempt-viewer", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 10, - "id": "suitable-railway", + "execution_count": 13, + "id": "confident-piano", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/framework.py:686: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/framework.py:687: DeprecationWarning: `np.bool` is a deprecated alias for the builtin `bool`. To silence this warning, use `bool` by itself. Doing this will not modify any behavior and is safe. 
If you specifically wanted the numpy scalar type, use `np.bool_` here.\n", "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n", " elif dtype == np.bool:\n" ] @@ -1801,110 +2462,18 @@ "name": "stdout", "output_type": "stream", "text": [ - "Tensor(shape=[16, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [[[-0.98870903, -0.07539634, 0.03839889, ..., 0.28173494, 0.36495337, -0.03607073],\n", - " [-0.36884263, -0.22921059, -0.09850989, ..., 1.17554832, 0.49281687, 1.22981191],\n", - " [-0.42831492, 0.14306782, -0.40504572, ..., 1.19258320, 0.28560629, 0.84126252],\n", - " ...,\n", - " [-1.14067113, -0.90225518, 0.08112312, ..., 0.22529972, 0.98848087, 1.42083788],\n", - " [-1.34911966, 0.18967032, 0.27775878, ..., 0.31862095, 0.63177413, 0.15082565],\n", - " [-0.95137477, -0.03690310, -0.21094164, ..., 0.99404806, 0.53174424, 1.83114266]],\n", - "\n", - " [[-0.40670884, 0.22098994, -0.52978617, ..., -0.16111313, 0.73881495, 0.01380203],\n", - " [-0.51442140, -0.45173034, -0.45147005, ..., 1.22010005, 1.24763870, 0.03303454],\n", - " [-0.65140647, 0.16316377, -0.43823493, ..., 1.64499593, 0.57617754, 0.28116497],\n", - " ...,\n", - " [-0.53139108, -0.20081151, 0.54881495, ..., 0.31859449, 1.30965185, 1.90029418],\n", - " [-1.31833756, 0.42574614, -0.10103188, ..., 0.32908860, -0.09044939, -0.02275553],\n", - " [-0.90923810, 0.04415442, 0.16781625, ..., 1.19873142, 0.70491177, 1.67834747]],\n", - "\n", - " [[-0.53979987, 0.18136497, -0.01803534, ..., 0.19695832, 1.25342798, -0.06128683],\n", - " [ 0.55232340, -0.64118379, -0.37508020, ..., 1.14505792, 1.61396039, 0.87614059],\n", - " [-1.02553070, -0.25136885, 0.34500298, ..., 1.65974748, 0.41719219, 0.66209674],\n", - " ...,\n", - " [-1.29586899, -0.31949744, 0.15714335, ..., 0.75515050, 0.94777793, 2.14865851],\n", - " [-1.39566910, 0.06694880, 0.34747776, ..., 0.71009159, 0.68929648, -0.16454494],\n", - " [-0.95307189, 
-0.09190658, -0.10012256, ..., 1.55584967, 0.73311400, 1.79356611]],\n", - "\n", - " ...,\n", - "\n", - " [[-0.49390051, 0.14108033, -0.53815168, ..., 0.66417909, -0.43801153, 0.06367429],\n", - " [-0.13990593, -0.18394402, -0.51444548, ..., 1.64648640, 0.75647151, 0.73829728],\n", - " [-0.54492640, 0.11887605, 0.00587618, ..., 1.19514525, -0.07906327, 0.48107162],\n", - " ...,\n", - " [-1.33633518, -0.44442374, 0.00936849, ..., 0.91423398, 0.98535562, 0.98347098],\n", - " [-1.19861710, 0.70938700, 0.33154529, ..., 0.16847876, 0.02984418, -0.16296168],\n", - " [-0.89762348, 0.13328603, 0.37963712, ..., 1.21883786, 0.40238193, 1.44023502]],\n", - "\n", - " [[-0.08951648, 0.31010029, 0.40794152, ..., -0.10481174, 0.06963947, -0.45780548],\n", - " [ 0.62238014, -0.20880134, -0.22700992, ..., 1.21718991, 1.12063444, 0.40797234],\n", - " [-0.36213374, -0.26551899, 0.57684356, ..., 1.14578938, 0.28899658, 0.24930142],\n", - " ...,\n", - " [-0.88929099, -0.24094193, 0.38044125, ..., -0.01533419, 1.05152702, 0.98240042],\n", - " [-1.06873631, 0.38082325, 0.74675465, ..., -0.03644872, 0.26738623, -0.43120855],\n", - " [-0.94091892, -0.32104436, 0.47966722, ..., 0.61019003, 0.43108502, 1.11352766]],\n", - "\n", - " [[-0.03323537, 0.22007366, -0.03000726, ..., 0.36668554, 0.08975718, -0.25875339],\n", - " [ 0.40793720, -0.16809593, -0.73204160, ..., 1.41993105, 1.22917044, 0.72486037],\n", - " [-0.50788718, -0.43409127, 0.48296678, ..., 1.11637628, 0.16383135, 0.40282369],\n", - " ...,\n", - " [-0.74193639, -0.63939446, 0.55139303, ..., -0.00370563, 0.73491311, 1.21351111],\n", - " [-1.04918861, 0.59047806, 0.64082241, ..., 0.29343244, 0.25179449, -0.50433135],\n", - " [-0.86854327, -0.45206326, 0.32531947, ..., 0.38761431, 0.32762241, 1.13863206]]])\n", - "Tensor(shape=[16, 51, 4233], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [[[ 0.01700813, 0.56431651, 0.45506364, ..., -0.90041381, 0.26185888, 0.68741971],\n", - " [-0.14328328, 0.16787037, 1.60204566, 
..., -0.52197266, -0.27033603, 0.44314486],\n", - " [-0.22867197, 0.23935843, 1.40139520, ..., -0.58817720, 0.36277789, 0.60821676],\n", - " ...,\n", - " [-0.08569217, 0.53737843, 0.74085897, ..., -0.88298100, 0.06646422, 0.98183125],\n", - " [-0.33066741, 0.65147656, 0.50528461, ..., -0.88622850, 0.37098962, 1.03324938],\n", - " [ 0.39562812, 0.51454604, 0.33244559, ..., -0.73552674, -0.23745571, 0.55406201]],\n", - "\n", - " [[-0.38542494, 0.65172035, 0.47112849, ..., -0.60375690, 0.56403750, 0.86565256],\n", - " [-0.16968662, 0.32454279, 1.09088314, ..., -0.22235930, 0.33991110, 0.58421040],\n", - " [-0.32392421, 0.61689788, 0.94623339, ..., -0.51428318, 0.46278131, 0.49175799],\n", - " ...,\n", - " [ 0.18902412, 0.70370960, 0.22131878, ..., -0.49284744, -0.19460268, 0.56502676],\n", - " [-0.62619895, 1.07694829, 0.36491036, ..., -0.60827464, 0.18799752, 1.20347393],\n", - " [ 0.36972979, 0.69460171, 0.32603034, ..., -0.49348083, -0.15541299, 0.73012495]],\n", - "\n", - " [[-0.65226990, 0.72903591, -0.02955327, ..., -0.62513059, 0.78257781, 1.06949353],\n", - " [-0.31972992, 0.24137607, 1.32179105, ..., -0.31378460, 0.47126365, 0.50631112],\n", - " [-0.27153456, 0.61149585, 1.36779737, ..., -0.41040954, 0.19214611, 0.66955560],\n", - " ...,\n", - " [ 0.70239127, 0.59776336, 0.41315046, ..., -0.63964498, -0.05725104, 0.11523478],\n", - " [-0.61306721, 0.93517447, 0.13917899, ..., -1.07090628, 0.08259787, 1.05669415],\n", - " [ 0.30364236, 0.70674980, 0.27861559, ..., -0.45961899, -0.49536246, 0.42410135]],\n", - "\n", - " ...,\n", - "\n", - " [[-0.64492232, 1.16129804, 0.41210422, ..., -0.19025707, 0.62510222, 0.93904167],\n", - " [-0.21679890, 0.01541298, 1.22289670, ..., 0.38956094, 0.33127683, 0.55291802],\n", - " [-0.52435982, 0.53476179, 1.36162400, ..., -0.24845126, 0.30851704, 0.73026729],\n", - " ...,\n", - " [ 0.33089486, 1.21250021, 0.50133944, ..., -0.23968413, -0.05249966, 0.33221221],\n", - " [-0.85425609, 0.91674101, 0.37947315, ..., -0.54663503, 
0.32272232, 0.91941363],\n", - " [ 0.73812121, 1.22125304, 0.54933113, ..., -0.34835899, -0.45703983, 0.10094876]],\n", - "\n", - " [[-0.14543423, 0.59343618, 0.48727173, ..., -0.48721361, 0.23470370, 1.04386616],\n", - " [-0.37399894, 0.05687386, 0.98770601, ..., 0.20608327, 0.28952795, 0.69849032],\n", - " [-0.52618062, 0.19394255, 1.08136940, ..., -0.51677036, 0.21367601, 0.81429225],\n", - " ...,\n", - " [ 0.91529322, 0.82572049, 0.56763554, ..., -0.48792118, -0.20669226, 0.10400648],\n", - " [-0.65565026, 0.82217371, 0.45654771, ..., -0.70658189, -0.00154681, 1.01031244],\n", - " [ 0.85112470, 0.92439699, 0.51105708, ..., -0.57625800, -0.60960227, -0.02037612]],\n", - "\n", - " [[-0.51818568, 0.87956434, 0.36026087, ..., -0.60333908, 0.30989277, 0.92859864],\n", - " [-0.36991373, -0.02736802, 1.04911196, ..., 0.23815414, 0.36916631, 0.56326580],\n", - " [-0.58471107, 0.27818793, 1.23031902, ..., -0.47299296, -0.03227636, 0.80790430],\n", - " ...,\n", - " [ 0.74327284, 1.02660847, 0.59810358, ..., -0.35650834, -0.50914389, 0.08961441],\n", - " [-0.64146334, 0.82072812, 0.35041004, ..., -0.80564159, -0.01707828, 0.84261787],\n", - " [ 0.85794026, 1.18059790, 0.34535947, ..., -0.57844251, -0.85070610, 0.06602620]]])\n", "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [139.62181091]) Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [36.32815552]) Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [380.64031982])\n" + " [142.48880005]) Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [41.84146118]) Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [377.33258057])\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py:238: UserWarning: The dtype of left and right variables are not 
the same, left dtype is VarType.FP32, but right dtype is VarType.INT32, the right dtype will convert to VarType.FP32\n", + " format(lhs_dtype, rhs_dtype, lhs_dtype))\n" ] } ], @@ -1912,7 +2481,7 @@ "import paddle\n", "feat=paddle.to_tensor(feat)\n", "feat_len=paddle.to_tensor(feat_len, dtype='int64')\n", - "text=paddle.to_tensor(text)\n", + "text=paddle.to_tensor(text, dtype='int64')\n", "text_len=paddle.to_tensor(text_len, dtype='int64')\n", "\n", "model.eval()\n", @@ -1924,25 +2493,25 @@ { "cell_type": "code", "execution_count": 11, - "id": "afraid-translation", + "id": "better-senator", "metadata": {}, "outputs": [], "source": [ - "# tensor(142.4635, grad_fn=) tensor(41.8416, grad_fn=) tensor(377.2479, grad_fn=)" + "# tensor(142.4858, device='cuda:0', grad_fn=) tensor(41.8416, device='cuda:0', grad_fn=) tensor(377.3222, device='cuda:0', grad_fn=)" ] }, { "cell_type": "code", "execution_count": null, - "id": "answering-slide", + "id": "related-banking", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 12, - "id": "undefined-glenn", + "execution_count": 14, + "id": "olympic-problem", "metadata": {}, "outputs": [ { @@ -1952,13 +2521,13 @@ "[16, 51, 256]\n", "[16, 1, 51]\n", "Tensor(shape=[51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [[-0.98870903, -0.07539634, 0.03839889, ..., 0.28173494, 0.36495337, -0.03607073],\n", - " [-0.36884263, -0.22921059, -0.09850989, ..., 1.17554832, 0.49281687, 1.22981191],\n", - " [-0.42831492, 0.14306782, -0.40504572, ..., 1.19258320, 0.28560629, 0.84126252],\n", + " [[-0.70194179, 0.56254166, 0.68803459, ..., 1.12373221, 0.78039235, 1.13693869],\n", + " [-0.77877808, 0.39126658, 0.71887815, ..., 1.25188220, 0.88616788, 1.31734526],\n", + " [-0.95908946, 0.63460249, 0.87671334, ..., 0.98183727, 0.74401081, 1.29032660],\n", " ...,\n", - " [-1.14067113, -0.90225518, 0.08112312, ..., 0.22529972, 0.98848087, 1.42083788],\n", - " [-1.34911966, 0.18967032, 0.27775878, 
..., 0.31862095, 0.63177413, 0.15082565],\n", - " [-0.95137477, -0.03690310, -0.21094164, ..., 0.99404806, 0.53174424, 1.83114266]])\n" + " [-1.07322502, 0.67236906, 0.92303109, ..., 0.90754563, 0.81767166, 1.32396567],\n", + " [-1.16541159, 0.68199694, 0.69394493, ..., 1.22383487, 0.80282891, 1.45065081],\n", + " [-1.27320945, 0.71458030, 0.75819558, ..., 0.94154912, 0.87748396, 1.26230514]])\n" ] } ], @@ -1972,15 +2541,15 @@ { "cell_type": "code", "execution_count": null, - "id": "twenty-funds", + "id": "cubic-values", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 13, - "id": "threatened-phase", + "execution_count": 15, + "id": "shaped-alaska", "metadata": {}, "outputs": [ { @@ -2002,8 +2571,8 @@ }, { "cell_type": "code", - "execution_count": 14, - "id": "ordered-denver", + "execution_count": 16, + "id": "federal-rover", "metadata": {}, "outputs": [ { @@ -2020,8 +2589,8 @@ }, { "cell_type": "code", - "execution_count": 15, - "id": "above-investigator", + "execution_count": 17, + "id": "regulated-interstate", "metadata": {}, "outputs": [ { @@ -2029,19 +2598,19 @@ "output_type": "stream", "text": [ "False\n", - "[[-0.70815086 0.5656927 0.709813 ... 1.0982457 0.7758755\n", - " 1.1307045 ]\n", - " [-0.78350693 0.39481696 0.74499094 ... 1.2273936 0.8813775\n", - " 1.3142622 ]\n", - " [-0.9625825 0.63913065 0.90481734 ... 0.9587627 0.73829174\n", - " 1.2868171 ]\n", + "[[-0.7019424 0.56254166 0.6880345 ... 1.1237322 0.78039217\n", + " 1.1369387 ]\n", + " [-0.778778 0.39126638 0.7188779 ... 1.2518823 0.8861681\n", + " 1.3173454 ]\n", + " [-0.9590891 0.6346026 0.87671363 ... 0.9818373 0.74401116\n", + " 1.2903274 ]\n", " ...\n", - " [-1.089918 0.6853822 0.9498568 ... 0.8842667 0.81529033\n", - " 1.325533 ]\n", - " [-1.1811031 0.6971649 0.7225241 ... 1.200684 0.8006199\n", - " 1.4533575 ]\n", - " [-1.2878689 0.72914284 0.7896784 ... 0.916238 0.87275296\n", - " 1.2629912 ]]\n" + " [-1.0732253 0.6723689 0.9230311 ... 
0.9075457 0.8176713\n", + " 1.3239657 ]\n", + " [-1.165412 0.6819976 0.69394535 ... 1.2238353 0.80282927\n", + " 1.4506509 ]\n", + " [-1.2732087 0.71458083 0.7581961 ... 0.9415482 0.877484\n", + " 1.2623053 ]]\n" ] } ], @@ -2052,119 +2621,32 @@ }, { "cell_type": "code", - "execution_count": 33, - "id": "dimensional-introduction", + "execution_count": 18, + "id": "proof-scheduling", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Tensor(shape=[16, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", - " [[[-0.70815086, 0.56569272, 0.70981300, ..., 1.09824574, 0.77587551, 1.13070452],\n", - " [-0.78350693, 0.39481696, 0.74499094, ..., 1.22739363, 0.88137752, 1.31426215],\n", - " [-0.96258253, 0.63913065, 0.90481734, ..., 0.95876271, 0.73829174, 1.28681707],\n", - " ...,\n", - " [-1.08991802, 0.68538219, 0.94985682, ..., 0.88426667, 0.81529033, 1.32553303],\n", - " [-1.18110311, 0.69716489, 0.72252411, ..., 1.20068395, 0.80061990, 1.45335746],\n", - " [-1.28786886, 0.72914284, 0.78967839, ..., 0.91623801, 0.87275296, 1.26299119]],\n", - "\n", - " [[-0.92869806, 0.66449726, 0.50940996, ..., 0.67377377, 0.75721473, 1.44601321],\n", - " [-0.58323175, 0.39969942, 0.46701184, ..., 0.76123071, 0.82149148, 1.53387356],\n", - " [-0.66912395, 0.42107889, 0.53314692, ..., 0.77352434, 0.73588967, 1.55955291],\n", - " ...,\n", - " [-0.91979462, 0.78827965, 0.51364565, ..., 0.92784536, 0.88741118, 1.55079234],\n", - " [-0.90603584, 0.75470775, 0.51157582, ..., 0.99914151, 0.87281585, 1.49555171],\n", - " [-0.94746929, 0.86679929, 0.65138626, ..., 0.94967902, 0.74416542, 1.38868642]],\n", - "\n", - " [[-0.48889187, 0.40629929, -0.03985359, ..., 0.96110481, 0.72562295, 1.63959312],\n", - " [-0.23216049, 0.47649717, 0.06432461, ..., 1.12634289, 0.79304028, 1.72600341],\n", - " [-0.35049179, 0.45091787, 0.23251781, ..., 1.18179774, 0.77048856, 1.67785954],\n", - " ...,\n", - " [-0.95444369, 0.62471539, 0.32779199, ..., 1.38101709, 
1.06079900, 1.50111783],\n", - " [-0.92489260, 0.75614768, 0.24058929, ..., 1.37509775, 1.08733690, 1.56775463],\n", - " [-0.87840986, 0.66779983, 0.13002315, ..., 1.30724812, 1.16084790, 1.31587541]],\n", - "\n", - " ...,\n", - "\n", - " [[-0.49060634, 0.58141297, 0.56432068, ..., 1.01875186, 0.29791155, 1.37021196],\n", - " [-0.68209082, 0.68713498, 0.37579197, ..., 1.03290558, 0.31765509, 1.85167778],\n", - " [-0.59835142, 0.70429099, 0.52930498, ..., 1.10545111, 0.27167040, 1.79945505],\n", - " ...,\n", - " [-0.92881185, 0.90744990, 0.30645573, ..., 1.21084821, 0.45378613, 1.54552865],\n", - " [-0.93471462, 0.98222488, 0.33421245, ..., 1.20006037, 0.48279485, 1.54707932],\n", - " [-0.88121003, 0.97045374, 0.41706085, ..., 1.17172730, 0.44657633, 1.51080203]],\n", - "\n", - " [[-0.75599599, -0.00976199, 0.22203811, ..., 0.83421057, 0.32212344, 1.65036464],\n", - " [-0.82587808, -0.04545709, 0.31506237, ..., 1.26919305, 0.44509020, 1.73162079],\n", - " [-0.76584357, -0.23916586, 0.41122752, ..., 1.08345842, 0.35172719, 1.59721172],\n", - " ...,\n", - " [-1.20936334, 0.74367058, 0.41594249, ..., 1.40040612, 0.81670052, 1.13627040],\n", - " [-1.21405351, 0.80623198, 0.41914314, ..., 1.40204942, 0.80985707, 1.16537964],\n", - " [-1.19519651, 0.79087526, 0.48453161, ..., 1.36768281, 0.76330566, 1.13404262]],\n", - "\n", - " [[-0.74483055, -0.33014604, 0.24039182, ..., 0.02945682, 0.71929377, 1.91275668],\n", - " [-0.56035036, -0.41564703, 0.36313012, ..., 0.41183007, 0.90209144, 1.80845654],\n", - " [-0.69359547, -0.13844451, 0.30018413, ..., 0.49444827, 0.56794512, 1.67332709],\n", - " ...,\n", - " [-0.99715513, 1.01512778, 0.43277434, ..., 1.09037900, 0.86760134, 1.29863596],\n", - " [-0.99962872, 1.07428896, 0.44226229, ..., 1.09051895, 0.88753319, 1.33773279],\n", - " [-0.98149830, 1.05249369, 0.51830143, ..., 1.04208529, 0.84298122, 1.31557417]]])\n", - "Tensor(shape=[16, 51, 4233], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [[[-0.38691020, 
1.10840213, -0.36066169, ..., -0.74562210, 0.38264662, 0.09510683],\n", - " [-0.40077940, 1.08683729, -0.28951788, ..., -0.79604387, 0.23650460, 0.21539813],\n", - " [-0.48349607, 1.13704205, -0.34528807, ..., -0.74176753, 0.15859264, 0.08665741],\n", - " ...,\n", - " [-0.33308679, 1.06052911, -0.28531107, ..., -0.56407875, 0.06546993, 0.34067774],\n", - " [-0.31819728, 1.02948821, -0.34244826, ..., -0.59871835, 0.13086139, 0.23477128],\n", - " [-0.46234924, 1.05966771, -0.25739416, ..., -0.73751336, 0.15748897, 0.26660469]],\n", - "\n", - " [[-0.31971461, 1.24201715, -0.42921415, ..., -1.03340065, 0.12717772, -0.02929212],\n", - " [-0.29465508, 1.20464718, -0.42703199, ..., -0.81277102, 0.03500172, 0.06429010],\n", - " [-0.23355499, 1.22438145, -0.37154198, ..., -0.80892444, -0.04463244, 0.14419895],\n", - " ...,\n", - " [-0.30919531, 1.24750948, -0.43951514, ..., -0.78709352, 0.09086802, 0.22021589],\n", - " [-0.33325344, 1.25496054, -0.43700716, ..., -0.88238114, 0.15829682, 0.27076158],\n", - " [-0.32431445, 1.20970893, -0.44767022, ..., -0.85771841, 0.15963244, 0.26043096]],\n", - "\n", - " [[-0.35240874, 1.21549594, -0.53064364, ..., -0.64634734, 0.05578946, -0.11943770],\n", - " [-0.57545149, 1.34280396, -0.46211162, ..., -0.65927368, 0.20014796, -0.09852441],\n", - " [-0.44432947, 1.32504761, -0.42148980, ..., -0.74191439, 0.19582249, -0.07732908],\n", - " ...,\n", - " [-0.44347334, 1.20052171, -0.54884982, ..., -0.68667632, 0.21917908, 0.24867907],\n", - " [-0.45897871, 1.27240980, -0.38485754, ..., -0.65063947, 0.28696120, 0.12868038],\n", - " [-0.39181367, 1.26035547, -0.59301054, ..., -0.83063912, 0.30225450, 0.48679376]],\n", - "\n", - " ...,\n", - "\n", - " [[-0.40601602, 1.20973194, -0.49095234, ..., -0.85096097, 0.20683658, 0.04339755],\n", - " [-0.40158153, 1.20891953, -0.45978341, ..., -0.98469758, 0.15744446, 0.03174295],\n", - " [-0.31740543, 1.22869027, -0.51616722, ..., -1.15985453, 0.11959577, -0.09386670],\n", - " ...,\n", - " [-0.44220674, 
1.22656810, -0.59912074, ..., -1.26186705, 0.33093452, 0.08640137],\n", - " [-0.44747517, 1.23198783, -0.61370420, ..., -1.28686309, 0.32481337, 0.06313021],\n", - " [-0.45813197, 1.19577587, -0.57291198, ..., -1.30331659, 0.31380397, 0.09586264]],\n", - "\n", - " [[-0.34510323, 1.13676333, -0.41883209, ..., -0.19890606, -0.03747968, 0.15454675],\n", - " [-0.40049601, 1.07489455, -0.20783955, ..., -0.38220686, -0.01861078, 0.21973050],\n", - " [-0.27853724, 1.00104034, -0.15550351, ..., -0.38109386, -0.04351424, 0.20367554],\n", - " ...,\n", - " [-0.51515359, 1.21439159, -0.54381990, ..., -0.88646334, 0.26562017, 0.44584516],\n", - " [-0.52407527, 1.21481705, -0.54217672, ..., -0.92878431, 0.23799631, 0.44936055],\n", - " [-0.53740996, 1.18220830, -0.50675553, ..., -0.93877101, 0.24513872, 0.46150753]],\n", - "\n", - " [[-0.32741299, 0.97497153, -0.00948974, ..., -0.39587873, 0.03406802, 0.24171287],\n", - " [-0.43713254, 0.97446007, -0.12497631, ..., -0.57407486, 0.05668554, 0.24453926],\n", - " [-0.21812496, 0.95889568, -0.10461410, ..., -0.71747971, -0.03854717, 0.17685428],\n", - " ...,\n", - " [-0.50795484, 1.18626249, -0.55178732, ..., -1.05484831, 0.28090888, 0.26255831],\n", - " [-0.51629281, 1.18509519, -0.54967672, ..., -1.09254313, 0.25126994, 0.26916048],\n", - " [-0.52659053, 1.15537941, -0.52105296, ..., -1.09778607, 0.25193223, 0.28108835]]])\n", "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [377.24789429])\n", + " [377.32220459])\n", "[1.]\n", - "[-9.1570435e+00 4.7310561e-02 -1.8856564e-01 ... 8.5132439e-03\n", - " 2.1997439e-02 2.7489617e-02]\n" + "[[ 3.1708076e+00 -1.5184805e-02 4.9524564e-02 ... -2.4678309e-03\n", + " -5.9236852e-03 -7.2192554e-03]\n", + " [-1.7474542e+00 7.7654729e-03 -4.5106117e-02 ... 9.8463835e-04\n", + " 2.4569160e-03 2.2863639e-03]\n", + " [-2.3707268e+00 1.3136451e-02 -2.6281785e-02 ... 
2.2738585e-03\n", + " 5.7726162e-03 7.4628354e-03]\n", + " ...\n", + " [-4.4350743e+00 2.4916438e-02 -9.0385124e-02 ... 4.4534383e-03\n", + " 1.1696636e-02 1.4515720e-02]\n", + " [-3.3899918e+00 1.7287316e-02 -6.3514955e-02 ... 3.2612216e-03\n", + " 8.5411733e-03 1.0692922e-02]\n", + " [-6.6964636e+00 3.5097409e-02 -1.2437013e-01 ... 6.3515711e-03\n", + " 1.6078018e-02 2.0318989e-02]]\n", + "[-4.4341431e+00 2.3347888e-02 -9.3501516e-02 ... 4.2512305e-03\n", + " 1.0928102e-02 1.3750527e-02]\n" ] } ], @@ -2193,38 +2675,41 @@ "encoder_out_lens = torch_mask_t.squeeze(1).sum(1)\n", "loss_ctc = model.ctc(paddle.to_tensor(torch_encoder_out), encoder_out_lens, text, text_len)\n", "print(loss_ctc)\n", - "# ctc tensor(377.2479, device='cuda:0', grad_fn=)\n", "loss_ctc.backward()\n", "print(loss_ctc.grad)\n", "print(model.ctc.ctc_lo.weight.grad)\n", "print(model.ctc.ctc_lo.bias.grad)\n", - "# tensor([[ 3.2806e+00, -1.8297e+00, -2.5472e+00, ..., -4.4421e+00,\n", - "# -3.4516e+00, -6.8526e+00],\n", - "# [-1.5462e-02, 8.0163e-03, 1.3837e-02, ..., 2.4541e-02,\n", - "# 1.7295e-02, 3.5211e-02],\n", - "# [ 5.0349e-02, -4.5908e-02, -2.8797e-02, ..., -8.8659e-02,\n", - "# -6.3412e-02, -1.2411e-01],\n", + "# tensor(377.3222, device='cuda:0', grad_fn=)\n", + "# None\n", + "# tensor([[ 3.1708e+00, -1.7475e+00, -2.3708e+00, ..., -4.4351e+00,\n", + "# -3.3900e+00, -6.6965e+00],\n", + "# [-1.5185e-02, 7.7655e-03, 1.3137e-02, ..., 2.4917e-02,\n", + "# 1.7287e-02, 3.5098e-02],\n", + "# [ 4.9522e-02, -4.5104e-02, -2.6280e-02, ..., -9.0381e-02,\n", + "# -6.3512e-02, -1.2436e-01],\n", "# ...,\n", - "# [-2.4901e-03, 1.0179e-03, 2.3745e-03, ..., 4.3330e-03,\n", - "# 3.2267e-03, 6.2963e-03],\n", - "# [-6.0131e-03, 2.5570e-03, 6.0628e-03, ..., 1.1443e-02,\n", - "# 8.4951e-03, 1.6021e-02],\n", - "# [-7.2826e-03, 2.3929e-03, 7.7757e-03, ..., 1.4101e-02,\n", - "# 1.0566e-02, 2.0105e-02]], device='cuda:0')" + "# [-2.4678e-03, 9.8464e-04, 2.2739e-03, ..., 4.4535e-03,\n", + "# 3.2612e-03, 6.3516e-03],\n", 
+ "# [-5.9237e-03, 2.4569e-03, 5.7726e-03, ..., 1.1697e-02,\n", + "# 8.5412e-03, 1.6078e-02],\n", + "# [-7.2193e-03, 2.2864e-03, 7.4629e-03, ..., 1.4516e-02,\n", + "# 1.0693e-02, 2.0319e-02]], device='cuda:0')\n", + "# tensor([-4.4342e+00, 2.3348e-02, -9.3497e-02, ..., 4.2513e-03,\n", + "# 1.0928e-02, 1.3751e-02], device='cuda:0')" ] }, { "cell_type": "code", "execution_count": null, - "id": "basic-basement", + "id": "enclosed-consolidation", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 17, - "id": "decreased-automation", + "execution_count": 19, + "id": "synthetic-hungarian", "metadata": {}, "outputs": [ { @@ -2232,7 +2717,7 @@ "output_type": "stream", "text": [ "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [36.31002045]) 0.0\n" + " [41.84160995]) 0.0\n" ] } ], @@ -2245,29 +2730,50 @@ }, { "cell_type": "code", - "execution_count": 18, - "id": "marine-middle", + "execution_count": 20, + "id": "indian-sweden", "metadata": {}, "outputs": [], "source": [ - "# encoder, decoder + att_loss 不对齐" + "# encoder, decoder不对齐" ] }, { "cell_type": "code", - "execution_count": 20, - "id": "young-reserve", + "execution_count": 21, + "id": "marine-cuisine", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[[-3.7638968e-01 -8.2272053e-01 7.4276292e-01 ... 3.4200522e-01\n", + " 1.5034772e-02 4.0337229e-01]\n", + " [-8.7386459e-01 -3.1389427e-01 4.1987866e-01 ... 3.7723729e-01\n", + " -1.4352810e-01 -1.0023664e+00]\n", + " [-4.3505096e-01 3.4504786e-02 -2.8710306e-01 ... 7.7274129e-02\n", + " -1.1672243e+00 -2.6848501e-01]\n", + " ...\n", + " [ 4.2471480e-01 5.8885634e-01 2.0203922e-02 ... 3.7405500e-01\n", + " 4.5470044e-02 -3.7139410e-01]\n", + " [-3.7978446e-01 -8.1084180e-01 7.5725085e-01 ... 2.6038891e-01\n", + " -7.9347193e-04 4.2537671e-01]\n", + " [-3.8279903e-01 -8.1206715e-01 7.4943429e-01 ... 
2.6173013e-01\n", + " -1.0499060e-03 4.2678756e-01]]\n" + ] + } + ], "source": [ "data = np.load(\".notebook/decoder.npz\", allow_pickle=True)\n", - "torch_decoder_out = data['decoder_out']" + "torch_decoder_out = data['decoder_out']\n", + "print(torch_decoder_out[0])" ] }, { "cell_type": "code", - "execution_count": 47, - "id": "differential-mileage", + "execution_count": 22, + "id": "several-result", "metadata": {}, "outputs": [], "source": [ @@ -2327,15 +2833,15 @@ }, { "cell_type": "code", - "execution_count": 48, - "id": "industrial-server", + "execution_count": 23, + "id": "possible-bulgaria", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Tensor(shape=[16, 7], dtype=int32, place=CUDAPlace(0), stop_gradient=True,\n", + "Tensor(shape=[16, 7], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", " [[4232, 2995, 3116, 1209, 565 , 4232, 4232],\n", " [4232, 236 , 1176, 331 , 66 , 3925, 4077],\n", " [4232, 2693, 524 , 234 , 1145, 366 , 4232],\n", @@ -2352,7 +2858,7 @@ " [4232, 70 , 1741, 702 , 1666, 4232, 4232],\n", " [4232, 703 , 1778, 1030, 849 , 4232, 4232],\n", " [4232, 814 , 1674, 115 , 3827, 4232, 4232]])\n", - "Tensor(shape=[16, 7], dtype=int32, place=CUDAPlace(0), stop_gradient=True,\n", + "Tensor(shape=[16, 7], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n", " [[2995, 3116, 1209, 565, 4232, -1 , -1 ],\n", " [ 236, 1176, 331, 66 , 3925, 4077, 4232],\n", " [2693, 524, 234, 1145, 366, 4232, -1 ],\n", @@ -2384,8 +2890,8 @@ }, { "cell_type": "code", - "execution_count": 49, - "id": "noticed-soviet", + "execution_count": 24, + "id": "north-walter", "metadata": {}, "outputs": [ { @@ -2394,13 +2900,13 @@ "text": [ "[16, 7, 4233]\n", "Tensor(shape=[7, 4233], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", - " [[-0.37553221, -0.83114165, 0.70238966, ..., 0.30866742, 0.03037567, 0.43291825],\n", - " [-0.87047130, -0.32394654, 0.37882078, ..., 0.34444264, -0.12801090, -0.97179270],\n", - " [-0.43517584, 
0.02496703, -0.32672805, ..., 0.04601809, -1.15214014, -0.23627253],\n", + " [[-0.37638962, -0.82272029, 0.74276292, ..., 0.34200522, 0.01503509, 0.40337229],\n", + " [-0.87386417, -0.31389427, 0.41987872, ..., 0.37723723, -0.14352795, -1.00236630],\n", + " [-0.43505096, 0.03450463, -0.28710306, ..., 0.07727426, -1.16722453, -0.26848495],\n", " ...,\n", - " [ 0.42706215, 0.58341736, -0.01791662, ..., 0.34311637, 0.06014483, -0.34610766],\n", - " [-0.37887222, -0.81906295, 0.71680295, ..., 0.22679621, 0.01455487, 0.45493346],\n", - " [-0.38187075, -0.82030386, 0.70901453, ..., 0.22812662, 0.01431785, 0.45638454]])\n", + " [ 0.42471474, 0.58885634, 0.02020410, ..., 0.37405482, 0.04546990, -0.37139422],\n", + " [-0.37978464, -0.81084198, 0.75725073, ..., 0.26038912, -0.00079346, 0.42537683],\n", + " [-0.38279879, -0.81206709, 0.74943423, ..., 0.26172996, -0.00104988, 0.42678767]])\n", "False\n" ] } @@ -2415,75 +2921,1153 @@ }, { "cell_type": "code", - "execution_count": 50, - "id": "clinical-matter", + "execution_count": null, + "id": "armed-cowboy", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fifty-earth", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "proud-commonwealth", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "assisted-fortune", + "metadata": {}, + "outputs": [], + "source": [ + "from paddle import nn\n", + "import paddle\n", + "class LabelSmoothingLoss(nn.Layer):\n", + "\n", + " def __init__(self,\n", + " size: int,\n", + " padding_idx: int,\n", + " smoothing: float,\n", + " normalize_length: bool=False):\n", + " super().__init__()\n", + " self.size = size\n", + " self.padding_idx = padding_idx\n", + " self.smoothing = smoothing\n", + " self.confidence = 1.0 - smoothing\n", + " self.normalize_length = normalize_length\n", + " self.criterion = 
nn.KLDivLoss(reduction=\"none\")\n", + "\n", + " def forward(self, x: paddle.Tensor, target: paddle.Tensor) -> paddle.Tensor:\n", + " \"\"\"Compute loss between x and target.\n", + " The model outputs and data labels tensors are flatten to\n", + " (batch*seqlen, class) shape and a mask is applied to the\n", + " padding part which should not be calculated for loss.\n", + " \n", + " Args:\n", + " x (paddle.Tensor): prediction (batch, seqlen, class)\n", + " target (paddle.Tensor):\n", + " target signal masked with self.padding_id (batch, seqlen)\n", + " Returns:\n", + " loss (paddle.Tensor) : The KL loss, scalar float value\n", + " \"\"\"\n", + " B, T, D = paddle.shape(x)\n", + " assert D == self.size\n", + " x = x.reshape((-1, self.size))\n", + " target = target.reshape([-1])\n", + "\n", + " # use zeros_like instead of torch.no_grad() for true_dist,\n", + " # since no_grad() can not be exported by JIT\n", + " true_dist = paddle.full_like(x, self.smoothing / (self.size - 1))\n", + " ignore = target == self.padding_idx # (B,)\n", + " print(self.smoothing / (self.size - 1))\n", + " print(true_dist)\n", + "\n", + " #target = target * (1 - ignore) # avoid -1 index\n", + " target = target.masked_fill(ignore, 0) # avoid -1 index\n", + " \n", + " \n", + " #true_dist += F.one_hot(target, self.size) * self.confidence\n", + " target_mask = F.one_hot(target, self.size)\n", + " true_dist *= (1 - target_mask)\n", + " true_dist += target_mask * self.confidence\n", + " \n", + "\n", + " kl = self.criterion(F.log_softmax(x, axis=1), true_dist)\n", + " \n", + " #TODO(Hui Zhang): sum not support bool type\n", + " #total = len(target) - int(ignore.sum())\n", + " total = len(target) - int(ignore.type_as(target).sum())\n", + " denom = total if self.normalize_length else B\n", + "\n", + " #numer = (kl * (1 - ignore)).sum()\n", + " numer = kl.masked_fill(ignore.unsqueeze(1), 0).sum()\n", + " return numer / denom\n" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": 
"weighted-delight", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ + "2.3629489603024576e-05\n", + "Tensor(shape=[112, 4233], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363],\n", + " [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363],\n", + " [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363],\n", + " ...,\n", + " [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363],\n", + " [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363],\n", + " [0.00002363, 0.00002363, 0.00002363, ..., 0.00002363, 0.00002363, 0.00002363]])\n", "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", - " [41.84283447])\n", - "Tensor(shape=[16, 7], dtype=int32, place=CUDAPlace(0), stop_gradient=True,\n", - " [[2995, 3116, 1209, 565, 4232, -1 , -1 ],\n", - " [ 236, 1176, 331, 66 , 3925, 4077, 4232],\n", - " [2693, 524, 234, 1145, 366, 4232, -1 ],\n", - " [3875, 4211, 3062, 700, 4232, -1 , -1 ],\n", - " [ 272, 987, 1134, 494, 2959, 4232, -1 ],\n", - " [1936, 3715, 120, 2553, 2695, 2710, 4232],\n", - " [ 25 , 1149, 3930, 4232, -1 , -1 , -1 ],\n", - " [1753, 1778, 1237, 482, 3925, 110, 4232],\n", - " [3703, 2 , 565, 3827, 4232, -1 , -1 ],\n", - " [1150, 2734, 10 , 2478, 3490, 4232, -1 ],\n", - " [ 426, 811, 95 , 489, 144, 4232, -1 ],\n", - " [2313, 2006, 489, 975, 4232, -1 , -1 ],\n", - " [3702, 3414, 205, 1488, 2966, 1347, 4232],\n", - " [ 70 , 1741, 702, 1666, 4232, -1 , -1 ],\n", - " [ 703, 1778, 1030, 849, 4232, -1 , -1 ],\n", - " [ 814, 1674, 115, 3827, 4232, -1 , -1 ]])\n" + " [41.84160995])\n", + "VarType.INT64\n" ] } ], "source": [ - "loss_att = model.criterion_att(paddle.to_tensor(torch_decoder_out), ys_out_pad)\n", + "criteron = LabelSmoothingLoss(4233, -1, 0.1, False)\n", + "loss_att = criteron(paddle.to_tensor(torch_decoder_out), 
ys_out_pad.astype('int64'))\n", "print(loss_att)\n", - "print(ys_out_pad)\n", + "print(ys_out_pad.dtype)\n", "# tensor(41.8416, device='cuda:0', grad_fn=)" ] }, { "cell_type": "code", - "execution_count": 51, - "id": "checked-picking", + "execution_count": 27, + "id": "dress-shelter", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [41.84160995])\n", + "Tensor(shape=[1], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [41.84146118])\n", + "4233\n", + "-1\n", + "0.1\n", + "False\n" + ] + } + ], + "source": [ + "decoder_out, _ = model.decoder(encoder_out, encoder_mask, ys_in_pad,\n", + " ys_in_lens)\n", + "\n", + "loss_att = model.criterion_att(paddle.to_tensor(torch_decoder_out), ys_out_pad)\n", + "print(loss_att)\n", + "\n", + "loss_att = model.criterion_att(decoder_out, ys_out_pad)\n", + "print(loss_att)\n", + "\n", + "print(model.criterion_att.size)\n", + "print(model.criterion_att.padding_idx)\n", + "print(model.criterion_att.smoothing)\n", + "print(model.criterion_att.normalize_length)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "growing-tooth", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", - "execution_count": 52, - "id": "normal-airfare", + "execution_count": 28, + "id": "speaking-shelf", + "metadata": {}, + "outputs": [], + "source": [ + "from typing import List\n", + "from typing import Optional\n", + "from typing import Tuple\n", + "\n", + "import paddle\n", + "from paddle import nn\n", + "from typeguard import check_argument_types\n", + "\n", + "from deepspeech.modules.activation import get_activation\n", + "from deepspeech.modules.attention import MultiHeadedAttention\n", + "from deepspeech.modules.attention import RelPositionMultiHeadedAttention\n", + "from deepspeech.modules.conformer_convolution import ConvolutionModule\n", + "from 
deepspeech.modules.embedding import PositionalEncoding\n", + "from deepspeech.modules.embedding import RelPositionalEncoding\n", + "from deepspeech.modules.encoder_layer import ConformerEncoderLayer\n", + "from deepspeech.modules.encoder_layer import TransformerEncoderLayer\n", + "from deepspeech.modules.mask import add_optional_chunk_mask\n", + "from deepspeech.modules.mask import make_non_pad_mask\n", + "from deepspeech.modules.positionwise_feed_forward import PositionwiseFeedForward\n", + "from deepspeech.modules.subsampling import Conv2dSubsampling4\n", + "from deepspeech.modules.subsampling import Conv2dSubsampling6\n", + "from deepspeech.modules.subsampling import Conv2dSubsampling8\n", + "from deepspeech.modules.subsampling import LinearNoSubsampling\n", + "\n", + "class BaseEncoder(nn.Layer):\n", + " def __init__(\n", + " self,\n", + " input_size: int,\n", + " output_size: int=256,\n", + " attention_heads: int=4,\n", + " linear_units: int=2048,\n", + " num_blocks: int=6,\n", + " dropout_rate: float=0.1,\n", + " positional_dropout_rate: float=0.1,\n", + " attention_dropout_rate: float=0.0,\n", + " input_layer: str=\"conv2d\",\n", + " pos_enc_layer_type: str=\"abs_pos\",\n", + " normalize_before: bool=True,\n", + " concat_after: bool=False,\n", + " static_chunk_size: int=0,\n", + " use_dynamic_chunk: bool=False,\n", + " global_cmvn: paddle.nn.Layer=None,\n", + " use_dynamic_left_chunk: bool=False, ):\n", + " \"\"\"\n", + " Args:\n", + " input_size (int): input dim, d_feature\n", + " output_size (int): dimension of attention, d_model\n", + " attention_heads (int): the number of heads of multi head attention\n", + " linear_units (int): the hidden units number of position-wise feed\n", + " forward\n", + " num_blocks (int): the number of encoder blocks\n", + " dropout_rate (float): dropout rate\n", + " attention_dropout_rate (float): dropout rate in attention\n", + " positional_dropout_rate (float): dropout rate after adding\n", + " positional encoding\n", + " 
input_layer (str): input layer type.\n", + " optional [linear, conv2d, conv2d6, conv2d8]\n", + " pos_enc_layer_type (str): Encoder positional encoding layer type.\n", + " opitonal [abs_pos, scaled_abs_pos, rel_pos]\n", + " normalize_before (bool):\n", + " True: use layer_norm before each sub-block of a layer.\n", + " False: use layer_norm after each sub-block of a layer.\n", + " concat_after (bool): whether to concat attention layer's input\n", + " and output.\n", + " True: x -> x + linear(concat(x, att(x)))\n", + " False: x -> x + att(x)\n", + " static_chunk_size (int): chunk size for static chunk training and\n", + " decoding\n", + " use_dynamic_chunk (bool): whether use dynamic chunk size for\n", + " training or not, You can only use fixed chunk(chunk_size > 0)\n", + " or dyanmic chunk size(use_dynamic_chunk = True)\n", + " global_cmvn (Optional[paddle.nn.Layer]): Optional GlobalCMVN layer\n", + " use_dynamic_left_chunk (bool): whether use dynamic left chunk in\n", + " dynamic chunk training\n", + " \"\"\"\n", + " assert check_argument_types()\n", + " super().__init__()\n", + " self._output_size = output_size\n", + "\n", + " if pos_enc_layer_type == \"abs_pos\":\n", + " pos_enc_class = PositionalEncoding\n", + " elif pos_enc_layer_type == \"rel_pos\":\n", + " pos_enc_class = RelPositionalEncoding\n", + " else:\n", + " raise ValueError(\"unknown pos_enc_layer: \" + pos_enc_layer_type)\n", + "\n", + " if input_layer == \"linear\":\n", + " subsampling_class = LinearNoSubsampling\n", + " elif input_layer == \"conv2d\":\n", + " subsampling_class = Conv2dSubsampling4\n", + " elif input_layer == \"conv2d6\":\n", + " subsampling_class = Conv2dSubsampling6\n", + " elif input_layer == \"conv2d8\":\n", + " subsampling_class = Conv2dSubsampling8\n", + " else:\n", + " raise ValueError(\"unknown input_layer: \" + input_layer)\n", + "\n", + " self.global_cmvn = global_cmvn\n", + " self.embed = subsampling_class(\n", + " idim=input_size,\n", + " odim=output_size,\n", + " 
dropout_rate=dropout_rate,\n", + " pos_enc_class=pos_enc_class(\n", + " d_model=output_size, dropout_rate=positional_dropout_rate), )\n", + "\n", + " self.normalize_before = normalize_before\n", + " self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)\n", + " self.static_chunk_size = static_chunk_size\n", + " self.use_dynamic_chunk = use_dynamic_chunk\n", + " self.use_dynamic_left_chunk = use_dynamic_left_chunk\n", + "\n", + " def output_size(self) -> int:\n", + " return self._output_size\n", + "\n", + " def forward(\n", + " self,\n", + " xs: paddle.Tensor,\n", + " xs_lens: paddle.Tensor,\n", + " decoding_chunk_size: int=0,\n", + " num_decoding_left_chunks: int=-1,\n", + " ) -> Tuple[paddle.Tensor, paddle.Tensor]:\n", + " \"\"\"Embed positions in tensor.\n", + " Args:\n", + " xs: padded input tensor (B, L, D)\n", + " xs_lens: input length (B)\n", + " decoding_chunk_size: decoding chunk size for dynamic chunk\n", + " 0: default for training, use random dynamic chunk.\n", + " <0: for decoding, use full chunk.\n", + " >0: for decoding, use fixed chunk size as set.\n", + " num_decoding_left_chunks: number of left chunks, this is for decoding,\n", + " the chunk size is decoding_chunk_size.\n", + " >=0: use num_decoding_left_chunks\n", + " <0: use all left chunks\n", + " Returns:\n", + " encoder output tensor, lens and mask\n", + " \"\"\"\n", + " masks = make_non_pad_mask(xs_lens).unsqueeze(1) # (B, 1, L)\n", + "\n", + " if self.global_cmvn is not None:\n", + " xs = self.global_cmvn(xs)\n", + " #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor\n", + " xs, pos_emb, masks = self.embed(xs, masks.type_as(xs), offset=0)\n", + " #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor\n", + " masks = masks.astype(paddle.bool)\n", + " #TODO(Hui Zhang): mask_pad = ~masks\n", + " mask_pad = masks.logical_not()\n", + " chunk_masks = add_optional_chunk_mask(\n", + " xs, masks, self.use_dynamic_chunk, 
self.use_dynamic_left_chunk,\n", + " decoding_chunk_size, self.static_chunk_size,\n", + " num_decoding_left_chunks)\n", + " for layer in self.encoders:\n", + " xs, chunk_masks, _ = layer(xs, chunk_masks, pos_emb, mask_pad)\n", + " if self.normalize_before:\n", + " xs = self.after_norm(xs)\n", + " # Here we assume the mask is not changed in encoder layers, so just\n", + " # return the masks before encoder layers, and the masks will be used\n", + " # for cross attention with decoder later\n", + " return xs, masks" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "sharp-municipality", + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "class ConformerEncoder(BaseEncoder):\n", + " \"\"\"Conformer encoder module.\"\"\"\n", + "\n", + " def __init__(\n", + " self,\n", + " input_size: int,\n", + " output_size: int=256,\n", + " attention_heads: int=4,\n", + " linear_units: int=2048,\n", + " num_blocks: int=6,\n", + " dropout_rate: float=0.1,\n", + " positional_dropout_rate: float=0.1,\n", + " attention_dropout_rate: float=0.0,\n", + " input_layer: str=\"conv2d\",\n", + " pos_enc_layer_type: str=\"rel_pos\",\n", + " normalize_before: bool=True,\n", + " concat_after: bool=False,\n", + " static_chunk_size: int=0,\n", + " use_dynamic_chunk: bool=False,\n", + " global_cmvn: nn.Layer=None,\n", + " use_dynamic_left_chunk: bool=False,\n", + " positionwise_conv_kernel_size: int=1,\n", + " macaron_style: bool=True,\n", + " selfattention_layer_type: str=\"rel_selfattn\",\n", + " activation_type: str=\"swish\",\n", + " use_cnn_module: bool=True,\n", + " cnn_module_kernel: int=15,\n", + " causal: bool=False,\n", + " cnn_module_norm: str=\"batch_norm\", ):\n", + " \"\"\"Construct ConformerEncoder\n", + " Args:\n", + " input_size to use_dynamic_chunk, see in BaseEncoder\n", + " positionwise_conv_kernel_size (int): Kernel size of positionwise\n", + " conv1d layer.\n", + " macaron_style (bool): Whether to use macaron style for\n", + " positionwise layer.\n", + " 
selfattention_layer_type (str): Encoder attention layer type,\n", + " the parameter has no effect now, it's just for configure\n", + " compatibility.\n", + " activation_type (str): Encoder activation function type.\n", + " use_cnn_module (bool): Whether to use convolution module.\n", + " cnn_module_kernel (int): Kernel size of convolution module.\n", + " causal (bool): whether to use causal convolution or not.\n", + " cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']\n", + " \"\"\"\n", + " assert check_argument_types()\n", + " super().__init__(input_size, output_size, attention_heads, linear_units,\n", + " num_blocks, dropout_rate, positional_dropout_rate,\n", + " attention_dropout_rate, input_layer,\n", + " pos_enc_layer_type, normalize_before, concat_after,\n", + " static_chunk_size, use_dynamic_chunk, global_cmvn,\n", + " use_dynamic_left_chunk)\n", + " activation = get_activation(activation_type)\n", + "\n", + " # self-attention module definition\n", + " encoder_selfattn_layer = RelPositionMultiHeadedAttention\n", + " encoder_selfattn_layer_args = (attention_heads, output_size,\n", + " attention_dropout_rate)\n", + " # feed-forward module definition\n", + " positionwise_layer = PositionwiseFeedForward\n", + " positionwise_layer_args = (output_size, linear_units, dropout_rate,\n", + " activation)\n", + " # convolution module definition\n", + " convolution_layer = ConvolutionModule\n", + " convolution_layer_args = (output_size, cnn_module_kernel, activation,\n", + " cnn_module_norm, causal)\n", + "\n", + " self.encoders = nn.ModuleList([\n", + " ConformerEncoderLayer(\n", + " size=output_size,\n", + " self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n", + " feed_forward=positionwise_layer(*positionwise_layer_args),\n", + " feed_forward_macaron=positionwise_layer(\n", + " *positionwise_layer_args) if macaron_style else None,\n", + " conv_module=convolution_layer(*convolution_layer_args)\n", + " if use_cnn_module else 
None,\n", + " dropout_rate=dropout_rate,\n", + " normalize_before=normalize_before,\n", + " concat_after=concat_after) for _ in range(num_blocks)\n", + " ])\n" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "tutorial-syndication", + "metadata": {}, + "outputs": [], + "source": [ + "from deepspeech.frontend.utility import load_cmvn\n", + "from deepspeech.modules.cmvn import GlobalCMVN\n", + "\n", + "configs=cfg.model\n", + "mean, istd = load_cmvn(configs['cmvn_file'],\n", + " configs['cmvn_file_type'])\n", + "global_cmvn = GlobalCMVN(\n", + " paddle.to_tensor(mean, dtype=paddle.float),\n", + " paddle.to_tensor(istd, dtype=paddle.float))\n", + "\n", + "\n", + "input_dim = configs['input_dim']\n", + "vocab_size = configs['output_dim']\n", + "encoder_type = configs.get('encoder', 'transformer')\n", + " \n", + "encoder = ConformerEncoder(\n", + " input_dim, global_cmvn=global_cmvn, **configs['encoder_conf'])" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "fuzzy-register", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "None\n" + "True\n" ] } ], "source": [ - "decoder_out, _ = model.decoder(encoder_out, encoder_mask, ys_in_pad,\n", - " ys_in_lens)\n", - "loss_att = model.criterion_att(paddle.to_tensor(torch_decoder_out), ys_out_pad)" + "o = global_cmvn(feat)\n", + "o2 = model.encoder.global_cmvn(feat)\n", + "print(np.allclose(o.numpy(), o2.numpy()))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "balanced-locator", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(shape=[16, 1, 207], dtype=bool, place=CUDAPlace(0), stop_gradient=True,\n", + " [[[True , True , True , ..., True , True , True ]],\n", + "\n", + " [[True , True , True , ..., True , True , True ]],\n", + "\n", + " [[True , True , True , ..., True , False, False]],\n", + "\n", + " ...,\n", + "\n", + " [[True , True , True , ..., False, False, 
False]],\n", + "\n", + " [[True , True , True , ..., False, False, False]],\n", + "\n", + " [[True , True , True , ..., False, False, False]]])\n" + ] + } + ], + "source": [ + "from deepspeech.modules.mask import make_non_pad_mask\n", + "from deepspeech.modules.mask import make_pad_mask\n", + "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n", + "print(masks)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "induced-proposition", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(shape=[16, 207, 80], dtype=float32, place=CUDAPlace(0), stop_gradient=True,\n", + " [[[-0.53697914, -0.19910523, -0.34997201, ..., -0.82427669, -1.02650309, -0.96300691],\n", + " [-0.04464225, 0.23176001, -0.32538742, ..., -0.90158713, -1.03248465, -0.75986791],\n", + " [ 0.50035292, 0.22691160, -0.73052198, ..., -1.00552964, -0.87123060, -1.03062117],\n", + " ...,\n", + " [-0.40023831, -0.14325078, -0.57947433, ..., -1.07178426, -1.28059900, -1.05180073],\n", + " [ 0.15755332, -0.00184949, -0.28702953, ..., -1.10898709, -0.94518697, -0.72506356],\n", + " [-0.47520429, -1.39415145, -0.25754252, ..., -1.13649082, -1.19430351, -1.22903371]],\n", + "\n", + " [[ 0.95454037, 0.36427975, -1.38908529, ..., -1.16366839, -1.28453600, -1.20151031],\n", + " [-0.08573537, -1.05785275, -0.89172721, ..., -0.96440506, -1.12547100, -1.25990939],\n", + " [ 0.47653601, 0.32886592, -0.59200549, ..., -1.19421589, -1.14302588, -1.02422845],\n", + " ...,\n", + " [-0.47431335, -0.33558893, -0.72325647, ..., -1.45058632, -1.39574063, -1.04641151],\n", + " [ 0.36112556, 0.10380996, -1.15994537, ..., -1.04394984, -1.02212358, -1.02083635],\n", + " [-1.27172923, -2.14601755, -0.75676596, ..., -0.97822225, -0.93785471, -1.03707945]],\n", + "\n", + " [[-1.54652190, -1.01517177, -0.88900733, ..., -0.48522446, -0.75163364, -0.67765164],\n", + " [-0.76100892, -0.73351598, -0.91587651, ..., -0.24835993, -0.58927339, -0.73722762],\n", + " 
[-0.02471367, 0.17015894, -0.42326337, ..., -0.33203802, -0.76695800, -0.71651691],\n", + " ...,\n", + " [-1.70319796, -1.25910866, -1.14492917, ..., -1.18101490, -1.11631835, -0.93108195],\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063],\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063]],\n", + "\n", + " ...,\n", + "\n", + " [[ 0.64982772, 0.26116797, -0.84196597, ..., -0.87213463, -1.10728693, -1.32531130],\n", + " [ 0.35391113, -0.01584581, -0.40424931, ..., -0.99173468, -1.07270539, -1.19239008],\n", + " [ 0.37704495, -0.06278508, -0.11467686, ..., -1.10212946, -1.09524000, -1.11815071],\n", + " ...,\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063],\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063],\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063]],\n", + "\n", + " [[ 0.04445776, -0.17546852, -0.67475224, ..., -0.49801198, -0.56782746, -0.77852231],\n", + " [-1.34279025, -0.80342549, -0.90457231, ..., -0.65901577, -0.72549772, -0.62796098],\n", + " [-0.76252806, -0.13071291, -0.13280024, ..., -0.56132573, -0.60587686, -0.72114766],\n", + " ...,\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063],\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063],\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063]],\n", + "\n", + " [[-1.07980299, -1.08341801, -1.17969072, ..., -0.17757270, -0.43746525, -0.04000654],\n", + " [ 0.92353648, 0.63770926, -0.52810186, ..., -0.12927933, -0.20342292, 0.16655664],\n", + " [ 0.49337494, -0.00911332, -0.73301607, ..., 0.10074048, -0.09811471, -0.00923573],\n", + " ...,\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063],\n", + " [-6.04343224, -4.93973970, -3.42354989, 
..., -3.99492049, -3.98687553, -3.67971063],\n", + " [-6.04343224, -4.93973970, -3.42354989, ..., -3.99492049, -3.98687553, -3.67971063]]])\n" + ] + } + ], + "source": [ + "xs = model.encoder.global_cmvn(feat)\n", + "print(xs)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "cutting-julian", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(shape=[16, 256, 51, 19], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [[[[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0.00209083],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0.01194306, 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0.04610471, 0. ],\n", + " [0. , 0. , 0. , ..., 0.00967231, 0.04613467, 0. ]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " ...,\n", + "\n", + " [[0.22816099, 0.24614786, 0.25304127, ..., 0.20401822, 0.23248228, 0.31190544],\n", + " [0.13587360, 0.28877240, 0.27991283, ..., 0.19210319, 0.20346391, 0.19934426],\n", + " [0.25739068, 0.39348233, 0.27877361, ..., 0.27482539, 0.19302306, 0.23810163],\n", + " ...,\n", + " [0.11939213, 0.28473237, 0.33082074, ..., 0.23838061, 0.22104350, 0.23905794],\n", + " [0.17387670, 0.20402060, 0.40263173, ..., 0.24782266, 0.26742202, 0.15426503],\n", + " [0. , 0.29080707, 0.27725950, ..., 0.17539823, 0.18478745, 0.22483408]],\n", + "\n", + " [[0. , 0. , 0. 
, ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.35446781, 0.38861471, 0.39724261, ..., 0.38680089, 0.33568040, 0.34552398],\n", + " [0.41739127, 0.51038563, 0.41729912, ..., 0.33992639, 0.37081629, 0.35109508],\n", + " [0.36116859, 0.40744874, 0.48490953, ..., 0.34848654, 0.32321057, 0.35188958],\n", + " ...,\n", + " [0.23143977, 0.38021481, 0.51526314, ..., 0.36499465, 0.37411752, 0.39986172],\n", + " [0.34678638, 0.40238205, 0.50076538, ..., 0.36184520, 0.31596646, 0.36334658],\n", + " [0.36498138, 0.37943166, 0.51718897, ..., 0.31798238, 0.33656698, 0.34130475]]],\n", + "\n", + "\n", + " [[[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.01456045, 0.09447514, 0. , ..., 0. , 0. , 0. ],\n", + " [0.01500242, 0.02963220, 0. , ..., 0. , 0. , 0. ],\n", + " [0.03295187, 0. , 0. , ..., 0.04584959, 0.02043908, 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0.04425837],\n", + " [0. , 0. , 0.02556529, ..., 0. , 0.00900441, 0.04908358]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.11141267, 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. 
]],\n", + "\n", + " ...,\n", + "\n", + " [[0.33696529, 0.38526866, 0.32900479, ..., 0.28703830, 0.23351061, 0.19004467],\n", + " [0.13575366, 0.35783342, 0.33573425, ..., 0.22081660, 0.15854910, 0.13587447],\n", + " [0.21928655, 0.28900093, 0.28255141, ..., 0.20602837, 0.23927397, 0.21909429],\n", + " ...,\n", + " [0.23291890, 0.39096734, 0.36399242, ..., 0.20598020, 0.25373828, 0.23137446],\n", + " [0.18739152, 0.30793777, 0.30296701, ..., 0.27250600, 0.25191751, 0.20836820],\n", + " [0.22454213, 0.41402060, 0.54082996, ..., 0.31874508, 0.25079906, 0.25938687]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.26456982, 0.49519050, 0.56702250, ..., 0.30954638, 0.35292268, 0.32668519],\n", + " [0.21576807, 0.51833367, 0.49183372, ..., 0.36043224, 0.38523889, 0.36154741],\n", + " [0.20067888, 0.42784205, 0.52817714, ..., 0.31871423, 0.32452232, 0.31036487],\n", + " ...,\n", + " [0.49855131, 0.51001430, 0.52278662, ..., 0.36450142, 0.34338164, 0.33602941],\n", + " [0.41233343, 0.55517823, 0.52827710, ..., 0.40675971, 0.33873138, 0.36724189],\n", + " [0.40820011, 0.46187383, 0.47338152, ..., 0.38690975, 0.36039269, 0.38022059]]],\n", + "\n", + "\n", + " [[[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0. , 0.00578516, 0. , ..., 0.00748384, 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0.03035110, 0. , 0.00026720],\n", + " [0.00094807, 0. , 0. , ..., 0.00795512, 0. , 0. ],\n", + " ...,\n", + " [0.02032628, 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. 
, 0.01080076, 0. ],\n", + " [0.18470290, 0. , 0. , ..., 0.05058352, 0.09475817, 0.05914564]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " ...,\n", + "\n", + " [[0.38708323, 0.28021947, 0.35892880, ..., 0.16595127, 0.16031364, 0.21136315],\n", + " [0.15595171, 0.30544323, 0.24666184, ..., 0.22675267, 0.25765014, 0.19682154],\n", + " [0.29517862, 0.41209796, 0.20063159, ..., 0.17595036, 0.22536841, 0.22214051],\n", + " ...,\n", + " [0.24744980, 0.26258564, 0.38654143, ..., 0.23620218, 0.23157144, 0.18514194],\n", + " [0.25714791, 0.29592845, 0.47744542, ..., 0.23545510, 0.25072727, 0.20976165],\n", + " [1.20154655, 0.84644288, 0.73385584, ..., 1.02517247, 0.95309550, 1.00134516]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.45013186, 0.47484034, 0.40540054, ..., 0.19346163, 0.17825794, 0.14776605],\n", + " [0.47545874, 0.48186573, 0.36760187, ..., 0.27809089, 0.32997063, 0.32337096],\n", + " [0.46160024, 0.40050328, 0.39060861, ..., 0.36612910, 0.35242686, 0.29738861],\n", + " ...,\n", + " [0.55148494, 0.51017821, 0.40132499, ..., 0.38948193, 0.35737294, 0.33088297],\n", + " [0.41972569, 0.45475486, 0.45320493, ..., 0.38343129, 0.40125814, 0.36180776],\n", + " [0.34279808, 0.31606171, 0.44701228, ..., 0.21665487, 0.23984617, 0.23903391]]],\n", + "\n", + "\n", + " ...,\n", + "\n", + "\n", + " [[[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. 
, 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.04178291, 0. , 0.01580476, ..., 0. , 0.02250817, 0. ],\n", + " [0.04323414, 0.07786420, 0. , ..., 0.01634724, 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.03209178, 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.13563479, 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " ...,\n", + "\n", + " [[0. , 0.25187218, 0.24979387, ..., 0.24774717, 0.22354351, 0.19149347],\n", + " [0.16540922, 0.19585510, 0.19812922, ..., 0.27344131, 0.20928150, 0.26150429],\n", + " [0.10494646, 0.06329897, 0.33843631, ..., 0.25138417, 0.12470355, 0.23926635],\n", + " ...,\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163],\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163],\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. 
]],\n", + "\n", + " [[0.11428106, 0.45667490, 0.46820879, ..., 0.32057840, 0.33578536, 0.39012644],\n", + " [0.10441341, 0.45739070, 0.46107352, ..., 0.38467997, 0.38291249, 0.36685589],\n", + " [0.19867736, 0.35519636, 0.44313061, ..., 0.40679252, 0.38067645, 0.30645671],\n", + " ...,\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700],\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700],\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700]]],\n", + "\n", + "\n", + " [[[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.02465414, 0. , 0. , ..., 0. , 0. , 0.03390232],\n", + " [0. , 0. , 0.01830704, ..., 0.05166877, 0.00948385, 0.07453502],\n", + " [0.09921519, 0. , 0.01587192, ..., 0.01620276, 0.05140074, 0.00192392],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " ...,\n", + "\n", + " [[0.40034360, 0.25306445, 0.20217699, ..., 0.09816189, 0.07064310, 0.04974059],\n", + " [0.12567598, 0.21030979, 0.11181555, ..., 0.04278110, 0.11968569, 0.12005232],\n", + " [0.28786880, 0.24030517, 0.22565845, ..., 0. 
, 0.06418110, 0.05872961],\n", + " ...,\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163],\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163],\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.38404641, 0.30990323, 0.37156230, ..., 0.18125033, 0.15050662, 0.19619957],\n", + " [0.47285745, 0.40528792, 0.39718056, ..., 0.24709940, 0.04565683, 0.11500744],\n", + " [0.32620737, 0.30072594, 0.30477354, ..., 0.23529193, 0.21356541, 0.16985542],\n", + " ...,\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700],\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700],\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700]]],\n", + "\n", + "\n", + " [[[0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.03343770, 0.00123780, 0.05297198, ..., 0.07271163, 0.08656286, 0.14493589],\n", + " [0.11043239, 0.06143146, 0.06362963, ..., 0.08127750, 0.06259022, 0.08315435],\n", + " [0.01767678, 0.00201111, 0.07875030, ..., 0.06963293, 0.08979890, 0.05326346],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " [[0.10033827, 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.15627117, 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0.05144687, 0. 
, 0. , ..., 0. , 0. , 0.00436414],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n", + "\n", + " ...,\n", + "\n", + " [[0.25142455, 0.45964020, 0.37346074, ..., 0.04763087, 0. , 0. ],\n", + " [0.19760093, 0.26626948, 0.11190540, ..., 0.03044968, 0. , 0. ],\n", + " [0.16340607, 0.32938001, 0.25689697, ..., 0.05569421, 0. , 0. ],\n", + " ...,\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163],\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163],\n", + " [1.12572610, 0.87340784, 0.78169060, ..., 1.04576325, 1.00935984, 1.02209163]],\n", + "\n", + " [[0. , 0. , 0. , ..., 0. , 0.02218930, 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0.02848953],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " ...,\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n", + " [0. , 0. , 0. , ..., 0. , 0. , 0. 
]],\n", + "\n", + " [[0.25810039, 0.63016868, 0.37037861, ..., 0.18704373, 0.08269356, 0.09912672],\n", + " [0.17292863, 0.50678611, 0.40738991, ..., 0.16006103, 0.11725381, 0.09940521],\n", + " [0.24175072, 0.41616210, 0.41256818, ..., 0.13519743, 0.07912572, 0.12846369],\n", + " ...,\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700],\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700],\n", + " [1.44883108, 1.02119160, 0.94472742, ..., 1.23630035, 1.21888959, 1.23804700]]]])\n" + ] + } + ], + "source": [ + "xs = model.encoder.global_cmvn(feat)\n", + "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n", + "\n", + "\n", + "#xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n", + "# print(xs)\n", + "\n", + "x = xs.unsqueeze(1)\n", + "x = model.encoder.embed.conv(x)\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "friendly-nightlife", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(shape=[16, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [[[-0.03426375, 0.14291267, -0.06718873, ..., 0.09064753, 0.01809387, -0.04340880],\n", + " [-0.05007839, 0.11054724, -0.10399298, ..., 0.11457238, 0.04244684, -0.01249714],\n", + " [-0.10695291, 0.16910909, -0.08352133, ..., 0.07710276, 0.01168563, -0.03584499],\n", + " ...,\n", + " [-0.06060536, 0.14455931, -0.05470302, ..., 0.05364908, 0.03033342, -0.02610814],\n", + " [-0.08505894, 0.13611752, -0.11132983, ..., 0.13079923, 0.01580139, -0.02281028],\n", + " [-0.10604677, 0.14714901, -0.10885533, ..., 0.08543444, 0.03719445, -0.04634233]],\n", + "\n", + " [[-0.12392755, 0.14486063, -0.05674079, ..., 0.02573164, 0.03128851, 0.00545091],\n", + " [-0.04775286, 0.08473608, -0.08507854, ..., 0.04573154, 0.04240163, 0.01053247],\n", + " [-0.05940291, 0.10023535, -0.08143730, ..., 0.03596500, 0.01673085, 0.02089563],\n", + " 
...,\n", + " [-0.09222981, 0.15823206, -0.07700447, ..., 0.08122957, 0.03136991, -0.00646474],\n", + " [-0.07331756, 0.14482647, -0.07838815, ..., 0.10869440, 0.01356864, -0.02777974],\n", + " [-0.07937264, 0.20143102, -0.05544947, ..., 0.10287814, 0.00608235, -0.04799180]],\n", + "\n", + " [[-0.03670349, 0.08931590, -0.08718812, ..., 0.01314050, 0.00642052, 0.00573716],\n", + " [ 0.01089254, 0.11146393, -0.10263617, ..., 0.05070438, 0.01960694, 0.03521532],\n", + " [-0.02182280, 0.11443964, -0.06678198, ..., 0.04327708, 0.00861394, 0.02871092],\n", + " ...,\n", + " [-0.06792898, 0.14376275, -0.07899005, ..., 0.11248926, 0.03208683, -0.03264240],\n", + " [-0.07884051, 0.17024788, -0.08583611, ..., 0.09028331, 0.03588808, -0.02075090],\n", + " [-0.13792302, 0.27163863, -0.23930418, ..., 0.13391261, 0.07521040, -0.08621951]],\n", + "\n", + " ...,\n", + "\n", + " [[-0.02446348, 0.11595841, -0.03591986, ..., 0.06288970, 0.02895011, -0.06532725],\n", + " [-0.05378424, 0.12607370, -0.09023033, ..., 0.09078894, 0.01035743, 0.03701983],\n", + " [-0.04566649, 0.14275314, -0.06686870, ..., 0.09890588, -0.00612222, 0.03439377],\n", + " ...,\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698],\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698],\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698]],\n", + "\n", + " [[-0.01012144, 0.03909408, -0.07077143, ..., 0.00452683, -0.01377654, 0.02897627],\n", + " [-0.00519154, 0.03594019, -0.06831125, ..., 0.05693541, -0.00406374, 0.04561640],\n", + " [-0.01762631, 0.00500899, -0.05886075, ..., 0.02112178, -0.00729015, 0.02782153],\n", + " ...,\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698],\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698],\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698]],\n", + "\n", + " 
[[-0.03411558, -0.04318277, -0.08497842, ..., -0.04886402, 0.04296734, 0.06151697],\n", + " [ 0.00263296, -0.06913657, -0.08993219, ..., -0.00149064, 0.05696633, 0.03304394],\n", + " [-0.01818341, -0.01178640, -0.09679577, ..., -0.00870231, 0.00362198, 0.01916483],\n", + " ...,\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698],\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698],\n", + " [-0.31763062, 0.53700209, -0.26335421, ..., 0.39182857, 0.00337184, -0.18293698]]])\n", + "Tensor(shape=[16, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [[[-0.54821998, 2.28660274, -1.07501972, ..., 1.45036042, 0.28950194, -0.69454080],\n", + " [-0.80125421, 1.76875579, -1.66388774, ..., 1.83315802, 0.67914939, -0.19995420],\n", + " [-1.71124649, 2.70574546, -1.33634126, ..., 1.23364413, 0.18697014, -0.57351983],\n", + " ...,\n", + " [-0.96968573, 2.31294894, -0.87524825, ..., 0.85838526, 0.48533469, -0.41773027],\n", + " [-1.36094308, 2.17788029, -1.78127730, ..., 2.09278774, 0.25282228, -0.36496443],\n", + " [-1.69674826, 2.35438418, -1.74168527, ..., 1.36695099, 0.59511113, -0.74147725]],\n", + "\n", + " [[-1.98284078, 2.31777000, -0.90785271, ..., 0.41170627, 0.50061619, 0.08721463],\n", + " [-0.76404583, 1.35577726, -1.36125672, ..., 0.73170459, 0.67842603, 0.16851945],\n", + " [-0.95044655, 1.60376561, -1.30299675, ..., 0.57544005, 0.26769355, 0.33433008],\n", + " ...,\n", + " [-1.47567701, 2.53171301, -1.23207152, ..., 1.29967308, 0.50191855, -0.10343577],\n", + " [-1.17308092, 2.31722355, -1.25421047, ..., 1.73911047, 0.21709818, -0.44447583],\n", + " [-1.26996231, 3.22289634, -0.88719147, ..., 1.64605021, 0.09731755, -0.76786882]],\n", + "\n", + " [[-0.58725590, 1.42905438, -1.39500988, ..., 0.21024795, 0.10272825, 0.09179455],\n", + " [ 0.17428070, 1.78342295, -1.64217877, ..., 0.81127012, 0.31371105, 0.56344515],\n", + " [-0.34916472, 1.83103430, -1.06851172, ..., 
0.69243336, 0.13782299, 0.45937473],\n", + " ...,\n", + " [-1.08686376, 2.30020404, -1.26384079, ..., 1.79982817, 0.51338923, -0.52227837],\n", + " [-1.26144814, 2.72396612, -1.37337780, ..., 1.44453299, 0.57420933, -0.33201432],\n", + " [-2.20676827, 4.34621811, -3.82886696, ..., 2.14260173, 1.20336640, -1.37951219]],\n", + "\n", + " ...,\n", + "\n", + " [[-0.39141566, 1.85533464, -0.57471782, ..., 1.00623512, 0.46320182, -1.04523599],\n", + " [-0.86054784, 2.01717925, -1.44368529, ..., 1.45262301, 0.16571884, 0.59231722],\n", + " [-0.73066384, 2.28405023, -1.06989920, ..., 1.58249414, -0.09795550, 0.55030036],\n", + " ...,\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]],\n", + "\n", + " [[-0.16194311, 0.62550521, -1.13234293, ..., 0.07242929, -0.22042468, 0.46362036],\n", + " [-0.08306468, 0.57504302, -1.09298003, ..., 0.91096652, -0.06501988, 0.72986233],\n", + " [-0.28202093, 0.08014385, -0.94177192, ..., 0.33794850, -0.11664233, 0.44514441],\n", + " ...,\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]],\n", + "\n", + " [[-0.54584920, -0.69092435, -1.35965478, ..., -0.78182435, 0.68747747, 0.98427159],\n", + " [ 0.04212743, -1.10618520, -1.43891501, ..., -0.02385022, 0.91146135, 0.52870303],\n", + " [-0.29093450, -0.18858244, -1.54873240, ..., -0.13923697, 0.05795169, 0.30663735],\n", + " ...,\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 
6.26925707, 0.05394945, -2.92699170]]])\n" + ] + } + ], + "source": [ + "b, c, t, f = paddle.shape(x)\n", + "x = model.encoder.embed.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f]))\n", + "print(x)\n", + "x, pos_emb = model.encoder.embed.pos_enc(x, 0)\n", + "print(x)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "exempt-cloud", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Tensor(shape=[16, 51, 256], dtype=float32, place=CUDAPlace(0), stop_gradient=False,\n", + " [[[-0.54821998, 2.28660274, -1.07501972, ..., 1.45036042, 0.28950194, -0.69454080],\n", + " [-0.80125421, 1.76875579, -1.66388774, ..., 1.83315802, 0.67914939, -0.19995420],\n", + " [-1.71124649, 2.70574546, -1.33634126, ..., 1.23364413, 0.18697014, -0.57351983],\n", + " ...,\n", + " [-0.96968573, 2.31294894, -0.87524825, ..., 0.85838526, 0.48533469, -0.41773027],\n", + " [-1.36094308, 2.17788029, -1.78127730, ..., 2.09278774, 0.25282228, -0.36496443],\n", + " [-1.69674826, 2.35438418, -1.74168527, ..., 1.36695099, 0.59511113, -0.74147725]],\n", + "\n", + " [[-1.98284078, 2.31777000, -0.90785271, ..., 0.41170627, 0.50061619, 0.08721463],\n", + " [-0.76404583, 1.35577726, -1.36125672, ..., 0.73170459, 0.67842603, 0.16851945],\n", + " [-0.95044655, 1.60376561, -1.30299675, ..., 0.57544005, 0.26769355, 0.33433008],\n", + " ...,\n", + " [-1.47567701, 2.53171301, -1.23207152, ..., 1.29967308, 0.50191855, -0.10343577],\n", + " [-1.17308092, 2.31722355, -1.25421047, ..., 1.73911047, 0.21709818, -0.44447583],\n", + " [-1.26996231, 3.22289634, -0.88719147, ..., 1.64605021, 0.09731755, -0.76786882]],\n", + "\n", + " [[-0.58725590, 1.42905438, -1.39500988, ..., 0.21024795, 0.10272825, 0.09179455],\n", + " [ 0.17428070, 1.78342295, -1.64217877, ..., 0.81127012, 0.31371105, 0.56344515],\n", + " [-0.34916472, 1.83103430, -1.06851172, ..., 0.69243336, 0.13782299, 0.45937473],\n", + " ...,\n", + " [-1.08686376, 2.30020404, -1.26384079, ..., 
1.79982817, 0.51338923, -0.52227837],\n", + " [-1.26144814, 2.72396612, -1.37337780, ..., 1.44453299, 0.57420933, -0.33201432],\n", + " [-2.20676827, 4.34621811, -3.82886696, ..., 2.14260173, 1.20336640, -1.37951219]],\n", + "\n", + " ...,\n", + "\n", + " [[-0.39141566, 1.85533464, -0.57471782, ..., 1.00623512, 0.46320182, -1.04523599],\n", + " [-0.86054784, 2.01717925, -1.44368529, ..., 1.45262301, 0.16571884, 0.59231722],\n", + " [-0.73066384, 2.28405023, -1.06989920, ..., 1.58249414, -0.09795550, 0.55030036],\n", + " ...,\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]],\n", + "\n", + " [[-0.16194311, 0.62550521, -1.13234293, ..., 0.07242929, -0.22042468, 0.46362036],\n", + " [-0.08306468, 0.57504302, -1.09298003, ..., 0.91096652, -0.06501988, 0.72986233],\n", + " [-0.28202093, 0.08014385, -0.94177192, ..., 0.33794850, -0.11664233, 0.44514441],\n", + " ...,\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]],\n", + "\n", + " [[-0.54584920, -0.69092435, -1.35965478, ..., -0.78182435, 0.68747747, 0.98427159],\n", + " [ 0.04212743, -1.10618520, -1.43891501, ..., -0.02385022, 0.91146135, 0.52870303],\n", + " [-0.29093450, -0.18858244, -1.54873240, ..., -0.13923697, 0.05795169, 0.30663735],\n", + " ...,\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170],\n", + " [-5.08208990, 8.59203339, -4.21366739, ..., 6.26925707, 0.05394945, -2.92699170]]])\n" + ] + } + ], + "source": [ + "xs = 
model.encoder.global_cmvn(feat)\n", + "masks = make_non_pad_mask(feat_len).unsqueeze(1)\n", + "\n", + "xs, pos_emb, masks = model.encoder.embed(xs, masks.type_as(xs), offset=0)\n", + "print(xs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "composite-involvement", + "metadata": {}, + "outputs": [], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "handed-harris", + "metadata": {}, + "outputs": [ + { + "ename": "SystemError", + "evalue": "(Fatal) Operator elementwise_sub raises an paddle::memory::allocation::BadAlloc exception.\nThe exception content is\n:ResourceExhaustedError: \n\nOut of memory error on GPU 0. Cannot allocate 1.010986MB memory on GPU 0, available memory is only 6.437500MB.\n\nPlease check whether there is any other process using GPU 0.\n1. If yes, please stop them, or start PaddlePaddle on another GPU.\n2. If no, please decrease the batch size of your model. \n\n (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:69)\n. 
(at /paddle/paddle/fluid/imperative/tracer.cc:172)\n", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mSystemError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mencoder_out\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mencoder_mask\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mencoder\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfeat\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mfeat_len\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_out\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_mask\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mshape\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mencoder_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtorch_encoder_out\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_built\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 902\u001b[0;31m 
\u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 904\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mforward_post_hook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_post_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/deepspeech/modules/encoder.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, xs, xs_lens, decoding_chunk_size, num_decoding_left_chunks)\u001b[0m\n\u001b[1;32m 158\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglobal_cmvn\u001b[0m \u001b[0;32mis\u001b[0m \u001b[0;32mnot\u001b[0m \u001b[0;32mNone\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 160\u001b[0;31m \u001b[0mxs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mglobal_cmvn\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 161\u001b[0m \u001b[0;31m#TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 162\u001b[0m \u001b[0mxs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mpos_emb\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mmasks\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0membed\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m,\u001b[0m 
\u001b[0mmasks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtype_as\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mxs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0moffset\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/layers.py\u001b[0m in \u001b[0;36m__call__\u001b[0;34m(self, *inputs, **kwargs)\u001b[0m\n\u001b[1;32m 900\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_built\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;32mTrue\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 901\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 902\u001b[0;31m \u001b[0moutputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mforward\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m*\u001b[0m\u001b[0minputs\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m**\u001b[0m\u001b[0mkwargs\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 903\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 904\u001b[0m \u001b[0;32mfor\u001b[0m \u001b[0mforward_post_hook\u001b[0m \u001b[0;32min\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0m_forward_post_hooks\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mvalues\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/deepspeech/modules/cmvn.py\u001b[0m in \u001b[0;36mforward\u001b[0;34m(self, x)\u001b[0m\n\u001b[1;32m 46\u001b[0m \u001b[0;34m(\u001b[0m\u001b[0mpaddle\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mTensor\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mnormalized\u001b[0m \u001b[0mfeature\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 47\u001b[0m \"\"\"\n\u001b[0;32m---> 48\u001b[0;31m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m-\u001b[0m 
\u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmean\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 49\u001b[0m \u001b[0;32mif\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mnorm_var\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 50\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mx\u001b[0m \u001b[0;34m*\u001b[0m \u001b[0mself\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mistd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;32m/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dygraph/math_op_patch.py\u001b[0m in \u001b[0;36m__impl__\u001b[0;34m(self, other_var)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m-\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 248\u001b[0m \u001b[0mmath_op\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mgetattr\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mcore\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mops\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mop_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m--> 249\u001b[0;31m \u001b[0;32mreturn\u001b[0m \u001b[0mmath_op\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mother_var\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'axis'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 250\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 251\u001b[0m \u001b[0mcomment\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mOpProtoHolder\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0minstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mget_op_proto\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mop_type\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mcomment\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mSystemError\u001b[0m: (Fatal) Operator elementwise_sub raises an 
paddle::memory::allocation::BadAlloc exception.\nThe exception content is\n:ResourceExhaustedError: \n\nOut of memory error on GPU 0. Cannot allocate 1.010986MB memory on GPU 0, available memory is only 6.437500MB.\n\nPlease check whether there is any other process using GPU 0.\n1. If yes, please stop them, or start PaddlePaddle on another GPU.\n2. If no, please decrease the batch size of your model. \n\n (at /paddle/paddle/fluid/memory/allocation/cuda_allocator.cc:69)\n. (at /paddle/paddle/fluid/imperative/tracer.cc:172)\n" + ] + } + ], + "source": [ + "encoder_out, encoder_mask = model.encoder(feat, feat_len)\n", + "print(encoder_out.shape)\n", + "print(encoder_mask.shape)\n", + "print(encoder_out[0])\n", + "print(torch_encoder_out[0])" ] }, { "cell_type": "code", "execution_count": null, - "id": "fewer-drill", + "id": "sonic-thumb", "metadata": {}, "outputs": [], "source": [] diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index 0b7dbd13e390728ab2a726bfd1d9c84d48adadce..0c206c4a38663c8d3045ca316918a94ead385d0a 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -72,8 +72,7 @@ class SpeechCollator(): padded_audios = pad_sequence( audios, padding_value=0.0).astype(np.float32) #[B, T, D] audio_lens = np.array(audio_lens).astype(np.int64) - # (TODO:Hui Zhang) ctc loss does not support int64 labels padded_texts = pad_sequence( - texts, padding_value=IGNORE_ID).astype(np.int32) + texts, padding_value=IGNORE_ID).astype(np.int64) text_lens = np.array(text_lens).astype(np.int64) return padded_audios, audio_lens, padded_texts, text_lens diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py index 870dd7cd9b6a92eb09213b5ee890ee339011b53e..3e441bbbce40f8d65bcd9156b427ccc13fd56db2 100644 --- a/deepspeech/modules/loss.py +++ b/deepspeech/modules/loss.py @@ -46,6 +46,8 @@ class CTCLoss(nn.Layer): # warp-ctc need activation with shape [T, B, V + 1] # logits: (B, L, D) -> (L, B, D) logits = logits.transpose([1, 0, 2]) + # 
(TODO:Hui Zhang) ctc loss does not support int64 labels + ys_pad = ys_pad.astype(paddle.int32) loss = self.loss(logits, ys_pad, hlens, ys_lens) if self.batch_average: # Batch-size average @@ -123,9 +125,12 @@ class LabelSmoothingLoss(nn.Layer): true_dist = paddle.full_like(x, self.smoothing / (self.size - 1)) ignore = target == self.padding_idx # (B,) - #target = target * (1 - ignore) # avoid -1 index + # target = target * (1 - ignore) # avoid -1 index target = target.masked_fill(ignore, 0) # avoid -1 index - true_dist += F.one_hot(target, self.size) * self.confidence + # true_dist.scatter_(1, target.unsqueeze(1), self.confidence) + target_mask = F.one_hot(target, self.size) + true_dist *= (1 - target_mask) + true_dist += target_mask * self.confidence kl = self.criterion(F.log_softmax(x, axis=1), true_dist) diff --git a/deepspeech/modules/subsampling.py b/deepspeech/modules/subsampling.py index 512d3c290bf14ff8c0cede2bbd481fa6d48aff52..5aa2fd8eacdafeff7829f19bc220749ed76b33c2 100644 --- a/deepspeech/modules/subsampling.py +++ b/deepspeech/modules/subsampling.py @@ -104,7 +104,8 @@ class Conv2dSubsampling4(BaseSubsampling): nn.ReLU(), nn.Conv2D(odim, odim, 3, 2), nn.ReLU(), ) - self.linear = nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim) + self.out = nn.Sequential( + nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim)) self.subsampling_rate = 4 # The right context for every conv layer is computed by: # (kernel_size - 1) / 2 * stride * frame_rate_of_this_layer @@ -128,7 +129,7 @@ class Conv2dSubsampling4(BaseSubsampling): x = x.unsqueeze(1) # (b, c=1, t, f) x = self.conv(x) b, c, t, f = paddle.shape(x) - x = self.linear(x.transpose([0, 1, 2, 3]).reshape([b, t, c * f])) + x = self.out(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2] @@ -181,7 +182,7 @@ class Conv2dSubsampling6(BaseSubsampling): x = x.unsqueeze(1) # (b, c, t, f) x = self.conv(x) b, c, t, f = paddle.shape(x) 
- x = self.linear(x.transpose([0, 1, 2, 3]).reshape([b, t, c * f])) + x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-4:3] @@ -233,6 +234,6 @@ class Conv2dSubsampling8(BaseSubsampling): """ x = x.unsqueeze(1) # (b, c, t, f) x = self.conv(x) - x = self.linear(x.transpose([0, 1, 2, 3]).reshape([b, t, c * f])) + x = self.linear(x.transpose([0, 2, 1, 3]).reshape([b, t, c * f])) x, pos_emb = self.pos_enc(x, offset) return x, pos_emb, x_mask[:, :, :-2:2][:, :, :-2:2][:, :, :-2:2] diff --git a/requirements.txt b/requirements.txt index 8e004cfae6d934cdae40e928036bf126221fc7ba..e3f7aeafd42037a136235e3af2403111e131815f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,5 @@ coverage pre-commit -python_speech_features resampy==0.2.2 scipy==1.2.1 sentencepiece diff --git a/third_party/python_kaldi_features/.gitignore b/third_party/python_kaldi_features/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..f2d5801f6d149f0fe654f9f1a9092e734626f569 --- /dev/null +++ b/third_party/python_kaldi_features/.gitignore @@ -0,0 +1,3 @@ +python_speech_features.egg-info/ +dist/ +build/ diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py b/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py deleted file mode 100644 index 9b5ed21c9e3f65438f966db7f1913bc34b7ffa97..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py +++ /dev/null @@ -1 +0,0 @@ -from .base import * diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/base.py b/third_party/python_kaldi_features/build/lib/python_speech_features/base.py deleted file mode 100644 index 592cb4f1e5916a8379feb511fdacea14b2fe8bc6..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/build/lib/python_speech_features/base.py 
+++ /dev/null @@ -1,166 +0,0 @@ -# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications -# Author: James Lyons 2012 -from __future__ import division -import numpy -from python_speech_features import sigproc -from scipy.fftpack import dct - -def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, - nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97, - ceplifter=22,useEnergy=True,wintype='povey'): - """Compute MFCC features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param numcep: the number of cepstrum to return, default 13 - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. - :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming - :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. 
- """ - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype) - feat = numpy.log(feat) - feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] - feat = lifter(feat,ceplifter) - if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy - return feat - -def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, - wintype='hamming'): - """Compute Mel-filterbank energy features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming - winfunc=lambda x:numpy.ones((x,)) - :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
The - second return value is the energy in each frame (total energy, unwindowed) - """ - highfreq= highfreq or samplerate/2 - frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype) - pspec = sigproc.powspec(frames,nfft) # nearly the same until this part - energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame - energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log - - fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) - feat = numpy.dot(pspec,fb.T) # compute the filterbank energies - feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log - - return feat,energy - -def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'): - """Compute log Mel-filterbank energy features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
- """ - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype) - return numpy.log(feat) - -def hz2mel(hz): - """Convert a value in Hertz to Mels - - :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. - :returns: a value in Mels. If an array was passed in, an identical sized array is returned. - """ - return 1127 * numpy.log(1+hz/700.0) - - -def mel2hz(mel): - """Convert a value in Mels to Hertz - - :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. - :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. - """ - return 700 * (numpy.exp(mel/1127.0)-1) - -def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): - """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond - to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) - - :param nfilt: the number of filters in the filterbank, default 20. - :param nfft: the FFT size. Default is 512. - :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. - :param lowfreq: lowest band edge of mel filters, default 0 Hz - :param highfreq: highest band edge of mel filters, default samplerate/2 - :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. 
- """ - highfreq= highfreq or samplerate/2 - assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" - - # compute points evenly spaced in mels - lowmel = hz2mel(lowfreq) - highmel = hz2mel(highfreq) - - # check kaldi/src/feat/Mel-computations.h - fbank = numpy.zeros([nfilt,nfft//2+1]) - mel_freq_delta = (highmel-lowmel)/(nfilt+1) - for j in range(0,nfilt): - leftmel = lowmel+j*mel_freq_delta - centermel = lowmel+(j+1)*mel_freq_delta - rightmel = lowmel+(j+2)*mel_freq_delta - for i in range(0,nfft//2): - mel=hz2mel(i*samplerate/nfft) - if mel>leftmel and mel 0: - nframes,ncoeff = numpy.shape(cepstra) - n = numpy.arange(ncoeff) - lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) - return lift*cepstra - else: - # values of L <= 0, do nothing - return cepstra - -def delta(feat, N): - """Compute delta features from a feature vector sequence. - - :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. - :param N: For each frame, calculate delta features based on preceding and following N frames - :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 
- """ - if N < 1: - raise ValueError('N must be an integer >= 1') - NUMFRAMES = len(feat) - denominator = 2 * sum([i**2 for i in range(1, N+1)]) - delta_feat = numpy.empty_like(feat) - padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat - for t in range(NUMFRAMES): - delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] - return delta_feat diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py b/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py deleted file mode 100644 index 3efaec190186d80a77ad2ec886eba3385bc5e3dd..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py +++ /dev/null @@ -1,190 +0,0 @@ -# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications -# Author: James Lyons 2012 -from __future__ import division -import numpy -from python_speech_features import sigproc -from scipy.fftpack import dct - -def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True, - winfunc=lambda x:numpy.ones((x,))): - """Compute MFCC features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param numcep: the number of cepstrum to return, default 13 - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. 
- :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22. - :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming - :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector. - """ - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc) - feat = numpy.log(feat) - feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep] - feat = lifter(feat,ceplifter) - if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy - return feat - -def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, - winfunc=lambda x:numpy.ones((x,))): - """Compute Mel-filterbank energy features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
- :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming - :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The - second return value is the energy in each frame (total energy, unwindowed) - """ - highfreq= highfreq or samplerate/2 - signal = sigproc.preemphasis(signal,preemph) - frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) - pspec = sigproc.powspec(frames,nfft) - energy = numpy.sum(pspec,1) # this stores the total energy in each frame - energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log - - fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) - feat = numpy.dot(pspec,fb.T) # compute the filterbank energies - feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log - - return feat,energy - -def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97): - """Compute log Mel-filterbank energy features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. 
- :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. - """ - feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph) - return numpy.log(feat) - -def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01, - nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97, - winfunc=lambda x:numpy.ones((x,))): - """Compute Spectral Subband Centroid features from an audio signal. - - :param signal: the audio signal from which to compute features. Should be an N*1 array - :param samplerate: the samplerate of the signal we are working with. - :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds) - :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds) - :param nfilt: the number of filters in the filterbank, default 26. - :param nfft: the FFT size. Default is 512. - :param lowfreq: lowest band edge of mel filters. In Hz, default is 0. - :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2 - :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming - :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. 
- """ - highfreq= highfreq or samplerate/2 - signal = sigproc.preemphasis(signal,preemph) - frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc) - pspec = sigproc.powspec(frames,nfft) - pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems - - fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq) - feat = numpy.dot(pspec,fb.T) # compute the filterbank energies - R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1)) - - return numpy.dot(pspec*R,fb.T) / feat - -def hz2mel(hz): - """Convert a value in Hertz to Mels - - :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise. - :returns: a value in Mels. If an array was passed in, an identical sized array is returned. - """ - return 2595 * numpy.log10(1+hz/700.) - -def mel2hz(mel): - """Convert a value in Mels to Hertz - - :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise. - :returns: a value in Hertz. If an array was passed in, an identical sized array is returned. - """ - return 700*(10**(mel/2595.0)-1) - -def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None): - """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond - to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1) - - :param nfilt: the number of filters in the filterbank, default 20. - :param nfft: the FFT size. Default is 512. - :param samplerate: the samplerate of the signal we are working with. Affects mel spacing. - :param lowfreq: lowest band edge of mel filters, default 0 Hz - :param highfreq: highest band edge of mel filters, default samplerate/2 - :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter. 
- """ - highfreq= highfreq or samplerate/2 - assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2" - - # compute points evenly spaced in mels - lowmel = hz2mel(lowfreq) - highmel = hz2mel(highfreq) - melpoints = numpy.linspace(lowmel,highmel,nfilt+2) - # our points are in Hz, but we use fft bins, so we have to convert - # from Hz to fft bin number - bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate) - - fbank = numpy.zeros([nfilt,nfft//2+1]) - for j in range(0,nfilt): - for i in range(int(bin[j]), int(bin[j+1])): - fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j]) - for i in range(int(bin[j+1]), int(bin[j+2])): - fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1]) - return fbank - -def lifter(cepstra, L=22): - """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the - magnitude of the high frequency DCT coeffs. - - :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size. - :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter. - """ - if L > 0: - nframes,ncoeff = numpy.shape(cepstra) - n = numpy.arange(ncoeff) - lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L) - return lift*cepstra - else: - # values of L <= 0, do nothing - return cepstra - -def delta(feat, N): - """Compute delta features from a feature vector sequence. - - :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector. - :param N: For each frame, calculate delta features based on preceding and following N frames - :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector. 
- """ - if N < 1: - raise ValueError('N must be an integer >= 1') - NUMFRAMES = len(feat) - denominator = 2 * sum([i**2 for i in range(1, N+1)]) - delta_feat = numpy.empty_like(feat) - padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat - for t in range(NUMFRAMES): - delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1] - return delta_feat diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py deleted file mode 100644 index b7c78a8032eaa00a0d09fb299bec225f5e793b56..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py +++ /dev/null @@ -1,158 +0,0 @@ -# This file includes routines for basic signal processing including framing and computing power spectra. -# Author: James Lyons 2012 -import decimal - -import numpy -import math -import logging - - -def round_half_up(number): - return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) - - -def rolling_window(a, window, step=1): - # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] - - -def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True): - """Frame a signal into overlapping frames. - - :param sig: the audio signal to frame. - :param frame_len: length of each frame measured in samples. - :param frame_step: number of samples after the start of the previous frame that the next frame should begin. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. 
- :param stride_trick: use stride trick to compute the rolling window and window multiplication faster - :returns: an array of frames. Size is NUMFRAMES by frame_len. - """ - slen = len(sig) - frame_len = int(round_half_up(frame_len)) - frame_step = int(round_half_up(frame_step)) - if slen <= frame_len: - numframes = 1 - else: - numframes = 1 + (( slen - frame_len) // frame_step) - - # check kaldi/src/feat/feature-window.h - padsignal = sig[:(numframes-1)*frame_step+frame_len] - if wintype is 'povey': - win = numpy.empty(frame_len) - for i in range(frame_len): - win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85 - else: # the hamming window - win = numpy.hamming(frame_len) - - if stride_trick: - frames = rolling_window(padsignal, window=frame_len, step=frame_step) - else: - indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( - numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T - indices = numpy.array(indices, dtype=numpy.int32) - frames = padsignal[indices] - win = numpy.tile(win, (numframes, 1)) - - frames = frames.astype(numpy.float32) - raw_frames = numpy.zeros(frames.shape) - for frm in range(frames.shape[0]): - frames[frm,:] = do_dither(frames[frm,:], dither) # dither - frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset - raw_frames[frm,:] = frames[frm,:] - frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize - - return frames * win, raw_frames - -def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): - """Does overlap-add procedure to undo the action of framesig. - - :param frames: the array of frames. - :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. - :param frame_len: length of each frame measured in samples. - :param frame_step: number of samples after the start of the previous frame that the next frame should begin. 
- :param winfunc: the analysis window to apply to each frame. By default no window is applied. - :returns: a 1-D signal. - """ - frame_len = round_half_up(frame_len) - frame_step = round_half_up(frame_step) - numframes = numpy.shape(frames)[0] - assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' - - indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( - numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T - indices = numpy.array(indices, dtype=numpy.int32) - padlen = (numframes - 1) * frame_step + frame_len - - if siglen <= 0: siglen = padlen - - rec_signal = numpy.zeros((padlen,)) - window_correction = numpy.zeros((padlen,)) - win = winfunc(frame_len) - - for i in range(0, numframes): - window_correction[indices[i, :]] = window_correction[ - indices[i, :]] + win + 1e-15 # add a little bit so it is never zero - rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] - - rec_signal = rec_signal / window_correction - return rec_signal[0:siglen] - - -def magspec(frames, NFFT): - """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. - """ - if numpy.shape(frames)[1] > NFFT: - logging.warn( - 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', - numpy.shape(frames)[1], NFFT) - complex_spec = numpy.fft.rfft(frames, NFFT) - return numpy.absolute(complex_spec) - - -def powspec(frames, NFFT): - """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). - - :param frames: the array of frames. Each row is a frame. 
- :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. - """ - return numpy.square(magspec(frames, NFFT)) - - -def logpowspec(frames, NFFT, norm=1): - """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. - :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. - """ - ps = powspec(frames, NFFT); - ps[ps <= 1e-30] = 1e-30 - lps = 10 * numpy.log10(ps) - if norm: - return lps - numpy.max(lps) - else: - return lps - -def do_dither(signal, dither_value=1.0): - signal += numpy.random.normal(size=signal.shape) * dither_value - return signal - -def do_remove_dc_offset(signal): - signal -= numpy.mean(signal) - return signal - -def do_preemphasis(signal, coeff=0.97): - """perform preemphasis on the input signal. - - :param signal: The signal to filter. - :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. - :returns: the filtered signal. 
- """ - return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1]) diff --git a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py deleted file mode 100644 index a786c4fb68809c3182cd471fb1d3fb21d27574df..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py +++ /dev/null @@ -1,140 +0,0 @@ -# This file includes routines for basic signal processing including framing and computing power spectra. -# Author: James Lyons 2012 -import decimal - -import numpy -import math -import logging - - -def round_half_up(number): - return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)) - - -def rolling_window(a, window, step=1): - # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick - shape = a.shape[:-1] + (a.shape[-1] - window + 1, window) - strides = a.strides + (a.strides[-1],) - return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step] - - -def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True): - """Frame a signal into overlapping frames. - - :param sig: the audio signal to frame. - :param frame_len: length of each frame measured in samples. - :param frame_step: number of samples after the start of the previous frame that the next frame should begin. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. - :param stride_trick: use stride trick to compute the rolling window and window multiplication faster - :returns: an array of frames. Size is NUMFRAMES by frame_len. 
- """ - slen = len(sig) - frame_len = int(round_half_up(frame_len)) - frame_step = int(round_half_up(frame_step)) - if slen <= frame_len: - numframes = 1 - else: - numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step)) - - padlen = int((numframes - 1) * frame_step + frame_len) - - zeros = numpy.zeros((padlen - slen,)) - padsignal = numpy.concatenate((sig, zeros)) - if stride_trick: - win = winfunc(frame_len) - frames = rolling_window(padsignal, window=frame_len, step=frame_step) - else: - indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( - numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T - indices = numpy.array(indices, dtype=numpy.int32) - frames = padsignal[indices] - win = numpy.tile(winfunc(frame_len), (numframes, 1)) - - return frames * win - - -def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))): - """Does overlap-add procedure to undo the action of framesig. - - :param frames: the array of frames. - :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples. - :param frame_len: length of each frame measured in samples. - :param frame_step: number of samples after the start of the previous frame that the next frame should begin. - :param winfunc: the analysis window to apply to each frame. By default no window is applied. - :returns: a 1-D signal. 
- """ - frame_len = round_half_up(frame_len) - frame_step = round_half_up(frame_step) - numframes = numpy.shape(frames)[0] - assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len' - - indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile( - numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T - indices = numpy.array(indices, dtype=numpy.int32) - padlen = (numframes - 1) * frame_step + frame_len - - if siglen <= 0: siglen = padlen - - rec_signal = numpy.zeros((padlen,)) - window_correction = numpy.zeros((padlen,)) - win = winfunc(frame_len) - - for i in range(0, numframes): - window_correction[indices[i, :]] = window_correction[ - indices[i, :]] + win + 1e-15 # add a little bit so it is never zero - rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :] - - rec_signal = rec_signal / window_correction - return rec_signal[0:siglen] - - -def magspec(frames, NFFT): - """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame. - """ - if numpy.shape(frames)[1] > NFFT: - logging.warn( - 'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.', - numpy.shape(frames)[1], NFFT) - complex_spec = numpy.fft.rfft(frames, NFFT) - return numpy.absolute(complex_spec) - - -def powspec(frames, NFFT): - """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. 
- :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame. - """ - return 1.0 / NFFT * numpy.square(magspec(frames, NFFT)) - - -def logpowspec(frames, NFFT, norm=1): - """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1). - - :param frames: the array of frames. Each row is a frame. - :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded. - :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0. - :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame. - """ - ps = powspec(frames, NFFT); - ps[ps <= 1e-30] = 1e-30 - lps = 10 * numpy.log10(ps) - if norm: - return lps - numpy.max(lps) - else: - return lps - - -def preemphasis(signal, coeff=0.95): - """perform preemphasis on the input signal. - - :param signal: The signal to filter. - :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95. - :returns: the filtered signal. 
- """ - return numpy.append(signal[0], signal[1:] - coeff * signal[:-1]) diff --git a/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg b/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg deleted file mode 100644 index 0936a26299677b69a52e7ebdbf354713ae03fb5f..0000000000000000000000000000000000000000 Binary files a/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg and /dev/null differ diff --git a/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO b/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO deleted file mode 100644 index c08c0032cff4af37a48c3a949b8b2a5ad39ebb5a..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO +++ /dev/null @@ -1,10 +0,0 @@ -Metadata-Version: 1.0 -Name: python-speech-features -Version: 0.6 -Summary: Python Speech Feature extraction -Home-page: https://github.com/jameslyons/python_speech_features -Author: James Lyons -Author-email: james.lyons0@gmail.com -License: MIT -Description: UNKNOWN -Platform: UNKNOWN diff --git a/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt b/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt deleted file mode 100644 index 492aefcaec6cde98922f0668446275def86110f2..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt +++ /dev/null @@ -1,12 +0,0 @@ -README.rst -setup.py -python_speech_features/__init__.py -python_speech_features/base.py -python_speech_features/base_orig.py -python_speech_features/sigproc.py -python_speech_features/sigproc_orig.py -python_speech_features.egg-info/PKG-INFO -python_speech_features.egg-info/SOURCES.txt -python_speech_features.egg-info/dependency_links.txt -python_speech_features.egg-info/top_level.txt -test/test_sigproc.py \ No newline at end of file diff --git 
a/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt b/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt deleted file mode 100644 index 8b137891791fe96927ad78e64b0aad7bded08bdc..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt +++ /dev/null @@ -1 +0,0 @@ - diff --git a/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt b/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt deleted file mode 100644 index 42c4020dd035a5d83fd50fd8a11ecc23b9bd22e3..0000000000000000000000000000000000000000 --- a/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt +++ /dev/null @@ -1 +0,0 @@ -python_speech_features