提交 48f4bda3 编写于 作者: H Hui Zhang

kaldi fbank and mfcc

上级 281d46da
......@@ -12,6 +12,7 @@ exclude =
.git,
# python cache
__pycache__,
third_party/,
# Provide a comma-separate list of glob patterns to include for checks.
filename =
*.py
......
......@@ -83,37 +83,39 @@
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n",
"WARNING:root:register user softmax to paddle, remove this when fixed!\n",
"WARNING:root:register user log_softmax to paddle, remove this when fixed!\n",
"WARNING:root:register user sigmoid to paddle, remove this when fixed!\n",
"WARNING:root:register user log_sigmoid to paddle, remove this when fixed!\n",
"WARNING:root:register user relu to paddle, remove this when fixed!\n",
"WARNING:root:override cat of paddle if exists or register, remove this when fixed!\n",
"WARNING:root:override item of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override long of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"WARNING:root:override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"WARNING:root:register user view to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user view_as to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user fill_ to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user softmax to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user relu to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user type_as to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user to to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user float to paddle.Tensor, remove this when fixed!\n",
"WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
"WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"WARNING:root:register user Module to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ModuleList to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"WARNING:root:register user export to paddle.jit, remove this when fixed!\n"
"[WARNING 2021/04/16 06:32:09 __init__.py:93] register user softmax to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:109] register user relu to paddle, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
"[WARNING 2021/04/16 06:32:09 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n"
]
},
{
......@@ -191,6 +193,84 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "wired-principal",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'unit_type': 'char', 'spm_model_prefix': 'examples/aishell/s1/data/spm_bpe', 'infer_manifest': 'examples/aishell/s1/data/manifest.test', 'mean_std_path': '', 'vocab_path': 'examples/aishell/s1/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/s1/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"# yapf: disable\n",
"add_arg('num_samples', int, 5, \"# of samples to infer.\")\n",
"add_arg('beam_size', int, 500, \"Beam search width.\")\n",
"add_arg('num_proc_bsearch', int, 8, \"# of CPUs for beam search.\")\n",
"add_arg('num_conv_layers', int, 2, \"# of convolution layers.\")\n",
"add_arg('num_rnn_layers', int, 3, \"# of recurrent layers.\")\n",
"add_arg('rnn_layer_size', int, 2048, \"# of recurrent cells per layer.\")\n",
"add_arg('alpha', float, 2.5, \"Coef of LM for beam search.\")\n",
"add_arg('beta', float, 0.3, \"Coef of WC for beam search.\")\n",
"add_arg('cutoff_prob', float, 1.0, \"Cutoff probability for pruning.\")\n",
"add_arg('cutoff_top_n', int, 40, \"Cutoff number for pruning.\")\n",
"add_arg('use_gru', bool, False, \"Use GRUs instead of simple RNNs.\")\n",
"add_arg('use_gpu', bool, True, \"Use GPU or not.\")\n",
"add_arg('share_rnn_weights',bool, True, \"Share input-hidden weights across \"\n",
" \"bi-directional RNNs. Not for GRU.\")\n",
"add_arg('unit_type', str,\n",
" 'char',\n",
" \"Options: char, word, spm.\",\n",
" choices=['char', 'word', 'spm'])\n",
"add_arg('spm_model_prefix', str,\n",
" 'examples/aishell/s1/data/spm_bpe',\n",
" \"spm model prefix.\",)\n",
"add_arg('infer_manifest', str,\n",
" 'examples/aishell/s1/data/manifest.test',\n",
" \"Filepath of manifest to infer.\")\n",
"add_arg('mean_std_path', str,\n",
" '',\n",
" \"examples/aishell/s1/data/mean_std.npz, Filepath of normalizer's mean & std.\")\n",
"add_arg('vocab_path', str,\n",
" 'examples/aishell/s1/data/vocab.txt',\n",
" \"Filepath of vocabulary.\")\n",
"add_arg('lang_model_path', str,\n",
" 'models/lm/common_crawl_00.prune01111.trie.klm',\n",
" \"Filepath for language model.\")\n",
"add_arg('model_path', str,\n",
" 'examples/aishell/s1/checkpoints/step_final',\n",
" \"If None, the training starts from scratch, \"\n",
" \"otherwise, it resumes from the pre-trained model.\")\n",
"add_arg('decoding_method', str,\n",
" 'ctc_beam_search',\n",
" \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
" choices = ['ctc_beam_search', 'ctc_greedy'])\n",
"add_arg('error_rate_type', str,\n",
" 'wer',\n",
" \"Error rate type for evaluation.\",\n",
" choices=['wer', 'cer'])\n",
"add_arg('specgram_type', str,\n",
" 'fbank',\n",
" \"Audio feature type. Options: linear, mfcc.\",\n",
" choices=['linear', 'mfcc', 'fbank'])\n",
"add_arg('feat_dim', int, 80, \"mfcc or fbank feat dim.\")\n",
"add_arg('delta_delta', bool, False, \"delta delta\")\n",
"# yapf: disable\n",
"args = parser.parse_args([])\n",
"print(vars(args))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "bearing-physics",
"metadata": {},
"outputs": [
......@@ -259,7 +339,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 6,
"id": "classified-melissa",
"metadata": {},
"outputs": [
......@@ -268,7 +348,31 @@
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
" and should_run_async(code)\n",
" and should_run_async(code)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"fbank\n",
"[232 387 331 ... 249 249 262] int16\n",
"fbank\n",
"[-138 -219 -192 ... 338 324 351] int16\n",
"fbank\n",
"[ 694 1175 1022 ... 553 514 627] int16\n",
"fbank\n",
"[-39 -79 -53 ... 139 172 99] int16\n",
"fbank\n",
"[-277 -480 -425 ... 758 767 739] int16\n",
"fbank\n",
"[ 399 693 609 ... 1291 1270 1291] int16\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py:354: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n",
"Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
" if arr.dtype == np.object:\n"
......@@ -278,58 +382,106 @@
"name": "stdout",
"output_type": "stream",
"text": [
"test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[116, 104, 101, 32, 116, 119, 101, 110, 116, 105, 101, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
" [119, 104, 101, 114, 101, 32, 105, 115, 32, 116, 104, 97, 116, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
" [116, 101, 110, 32, 115, 101, 99, 111, 110, 100, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
" [104, 101, 32, 100, 111, 101, 115, 110, 39, 116, 32, 119, 111, 114, 107, 32, 97, 116, 32, 97, 108, 108, -1 ],\n",
" [119, 104, 101, 114, 101, 32, 105, 115, 32, 109, 121, 32, 98, 114, 111, 116, 104, 101, 114, 32, 110, 111, 119]])\n",
"test raw: the twenties\n",
"test raw: where is my brother now\n",
"fbank\n",
"[ -750 -1254 -1107 ... 2276 1889 2067] int16\n",
"fbank\n",
"[ -127 -199 -149 ... -5243 -5065 -5398] int16\n",
"fbank\n",
"[ 465 783 677 ... 980 903 1008] int16\n",
"fbank\n",
"[ 90 160 157 ... -2 -16 -21] int16\n",
"fbank\n",
"[ 213 345 295 ... 2483 2246 2501] int16\n",
"fbank\n",
"[ -86 -159 -131 ... 270 258 290] int16\n",
"fbank\n",
"[-1023 -1714 -1505 ... 1532 1596 1575] int16\n",
"fbank\n",
"[-366 -602 -527 ... 374 370 379] int16\n",
"fbank\n",
"[ 761 1275 1127 ... 369 413 295] int16\n",
"fbank\n",
"[382 621 550 ... 161 161 174] int16\n",
"fbank\n",
"[ -28 -91 -120 ... 28 34 11] int16\n",
"fbank\n",
"[ -5 -5 -5 ... 268 294 341] int16\n",
"fbank\n",
"[240 417 684 ... 267 262 219] int16\n",
"fbank\n",
"[131 206 194 ... 383 320 343] int16\n",
"test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[31069, 21487, 29233, 30340, 20320, -1 , -1 ],\n",
" [20540, 24471, 19968, 25552, 30340, 26159, -1 ],\n",
" [36825, 20010, 31243, 24230, 26159, 32654, 30340],\n",
" [20108, 21040, 20108, -1 , -1 , -1 , -1 ],\n",
" [21435, 34892, 25919, 21270, -1 , -1 , -1 ]])\n",
"fbank\n",
"[1155 1890 1577 ... 1092 989 1130] int16\n",
"fbank\n",
"[296 358 296 ... 140 140 168] int16\n",
"fbank\n",
"[-50 -91 -63 ... 104 104 86] int16\n",
"fbank\n",
"[-37 -66 -50 ... -31 -45 -52] int16\n",
"fbank\n",
"[-401 -652 -547 ... -339 -307 -344] int16\n",
"fbank\n",
"[-21 -47 -51 ... 94 81 107] int16\n",
"fbank\n",
"[ 533 887 755 ... 3074 2853 3254] int16\n",
"fbank\n",
"[ 44 71 66 ... -628 -733 -601] int16\n",
"fbank\n",
"[ 50 86 79 ... 129 116 138] int16\n",
"fbank\n",
"[ 92 146 126 ... -208 -193 -179] int16\n",
"test raw: 祝可爱的你\n",
"test raw: 去行政化\n",
"audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 173, 184, 190, 203])\n",
" [184, 194, 196, 204, 207])\n",
"test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [12, 13, 11, 22, 23])\n",
"audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n",
" [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n",
" [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n",
" [5, 6, 7, 3, 4])\n",
"audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[12.25633812, 12.61639309, 10.36936474, ..., 13.02949619, 11.51365757, 10.59789085],\n",
" [13.32148266, 13.41071606, 11.43800735, ..., 13.69783783, 12.83939362, 11.51259613],\n",
" [12.62640572, 12.53621101, 10.97212505, ..., 13.33757591, 12.32293034, 10.75493717],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n",
" [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n",
" [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n",
" [[10.99619484, 11.35202599, 9.56922054 , ..., 9.94971657 , 9.88354111 , 9.55315971 ],\n",
" [10.44461155, 9.81688595 , 5.62538481 , ..., 10.60468388, 10.94417381, 9.42646980 ],\n",
" [10.23835754, 10.23407459, 7.99464273 , ..., 10.68097591, 9.91640091 , 10.04131031],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n",
" [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n",
" [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n",
" [[14.10299397, 14.50298119, 12.87738323, ..., 12.62796497, 12.69949627, 11.43171215],\n",
" [13.85035992, 13.15289116, 10.66541386, ..., 13.34364223, 13.46972179, 11.02160740],\n",
" [13.19866467, 13.23537827, 11.65760899, ..., 12.72559357, 12.42716217, 11.74562359],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n",
" [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n",
" [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n",
" [[12.85668373, 12.82431412, 11.68144703, ..., 14.10119247, 15.12791920, 13.68221378],\n",
" [13.19507027, 13.40244961, 11.43618393, ..., 13.32919979, 13.68267441, 12.73429012],\n",
" [13.02173328, 12.92082500, 11.44303989, ..., 12.77793121, 13.10915661, 11.77327728],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.55369759, -16.95514297, 0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n",
" [-19.51947975, -14.86818218, 0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n",
" [-15.26447582, -22.26662445, 0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n",
" [[12.90771198, 13.40234852, 13.01435471, ..., 13.80359459, 14.08088684, 13.17883396],\n",
" [14.06678009, 14.06943512, 12.52837276, ..., 13.66423225, 13.66300583, 13.60142994],\n",
" [12.58743191, 12.94520760, 11.75190544, ..., 14.28828907, 14.08229160, 13.02433395],\n",
" ...,\n",
" [-4.81728077 , -10.65084648, 0.00000000 , ..., 3.19982862 , 8.42359638 , 7.95100546 ],\n",
" [-7.54755068 , -12.56441689, 0.00000000 , ..., 4.12789631 , 6.98472023 , 7.79936218 ],\n",
" [-8.79256725 , -11.23776722, 0.00000000 , ..., 1.31829071 , 1.30352044 , 6.80789280 ]]])\n"
" [16.20896912, 16.42283821, 14.94358730, ..., 12.91146755, 12.66766262, 11.76361752],\n",
" [13.49324894, 14.14653301, 13.16490936, ..., 13.23435783, 13.45378494, 12.60386276],\n",
" [15.56288910, 15.92445087, 14.90794277, ..., 13.43840790, 13.41075516, 12.55605984]]])\n"
]
}
],
......@@ -354,7 +506,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 7,
"id": "minus-modern",
"metadata": {},
"outputs": [
......@@ -362,58 +514,70 @@
"name": "stdout",
"output_type": "stream",
"text": [
"test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[87, 37, 26, 1, 87, 97, 26, 61, 87, 38, 26, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" [97, 37, 26, 79, 26, 1, 38, 82, 1, 87, 37, 3, 87, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" [87, 26, 61, 1, 82, 26, 18, 64, 61, 25, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
" [37, 26, 1, 25, 64, 26, 82, 61, 2, 87, 1, 97, 64, 79, 52, 1, 3, 87, 1, 3, 53, 53, -1],\n",
" [97, 37, 26, 79, 26, 1, 38, 82, 1, 58, 102, 1, 17, 79, 64, 87, 37, 26, 79, 1, 61, 64, 97]])\n",
"test raw: W%\u001a\u0001Wa\u001a=W&\u001aR\n",
"test raw: a%\u001aO\u001a\u0001&R\u0001:f\u0001\u0011O@W%\u001aO\u0001=@a\n",
"fbank\n",
"[232 387 331 ... 249 249 262] int16\n",
"fbank\n",
"[-138 -219 -192 ... 338 324 351] int16\n",
"fbank\n",
"[ 694 1175 1022 ... 553 514 627] int16\n",
"fbank\n",
"[-39 -79 -53 ... 139 172 99] int16\n",
"fbank\n",
"[-277 -480 -425 ... 758 767 739] int16\n",
"fbank\n",
"test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[2695, 505, 2332, 2553, 169, -1 , -1 ],\n",
" [ 230, 1237, 2 , 1556, 2553, 1694, -1 ],\n",
" [3703, 28 , 2739, 1172, 1694, 2966, 2553],\n",
" [ 70 , 355, 70 , -1 , -1 , -1 , -1 ],\n",
" [ 477, 3363, 1621, 412, -1 , -1 , -1 ]])\n",
"[ 399 693 609 ... 1291 1270 1291] int16\n",
"test raw: ઇǹज৹©\n",
"test raw: ǝണٕƜ\n",
"test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
" [12, 13, 11, 22, 23])\n",
"audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[-51.32406616, -17.91388321, 0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n",
" [-15.19027233, -20.52460480, 0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n",
" [-22.80181694, -19.48889351, 0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n",
" [5, 6, 7, 3, 4])\n",
"audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [[[12.25794601, 12.61855793, 10.37306023, ..., 13.12571049, 11.53678799, 10.32210350],\n",
" [13.32333183, 13.41336918, 11.44248962, ..., 13.65861225, 12.79308128, 11.31168747],\n",
" [12.62584686, 12.53506088, 10.96861362, ..., 13.32526493, 12.41560936, 10.71458912],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-15.38297653, -18.95307732, 0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n",
" [-14.06289291, -12.69954872, 0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n",
" [-19.78544235, -11.63046265, 0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n",
" [[11.00003052, 11.35529137, 9.56384087 , ..., 10.06063652, 10.16322994, 9.43149185 ],\n",
" [10.44556236, 9.81155300 , 5.49400425 , ..., 10.84116268, 11.02734756, 9.42253590 ],\n",
" [10.23620510, 10.23321152, 7.99466419 , ..., 10.93381882, 10.28395081, 10.00841141],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-22.65289879, -21.11938667, 0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n",
" [-31.04699135, -21.68680763, 0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n",
" [-18.16406441, -17.50658417, 0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n",
" [[14.10379314, 14.50375748, 12.87825108, ..., 12.68065739, 12.62359715, 11.53773308],\n",
" [13.84964657, 13.15079498, 10.67198086, ..., 13.24875164, 13.45796680, 10.97363472],\n",
" [13.19808197, 13.23482990, 11.65900230, ..., 12.70375061, 12.41395664, 11.88668156],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.17608452, -15.22302818, 0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n",
" [-19.40717316, -12.32932186, 0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n",
" [-11.01850796, -13.20147514, 0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n",
" [[12.85676289, 12.82410812, 11.67961884, ..., 14.12018299, 15.14850044, 13.80065727],\n",
" [13.19532776, 13.40243340, 11.43492508, ..., 13.29144669, 13.70278549, 12.67841339],\n",
" [13.02196407, 12.92111111, 11.43998623, ..., 12.71165752, 13.16518497, 11.92028046],\n",
" ...,\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [ 0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ],\n",
" [0. , 0. , 0. , ..., 0. , 0. , 0. ]],\n",
"\n",
" [[-16.55369759, -16.95514297, 0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n",
" [-19.51947975, -14.86818218, 0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n",
" [-15.26447582, -22.26662445, 0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n",
" [[12.90661621, 13.40162563, 13.01394463, ..., 13.84056377, 14.11240959, 13.21227264],\n",
" [14.06642914, 14.06922340, 12.52955723, ..., 13.55829811, 13.60157204, 13.50268650],\n",
" [12.58881378, 12.94780254, 11.75758171, ..., 14.29055786, 14.12165928, 13.02695847],\n",
" ...,\n",
" [-4.81728077 , -10.65084648, 0.00000000 , ..., 3.19982862 , 8.42359638 , 7.95100546 ],\n",
" [-7.54755068 , -12.56441689, 0.00000000 , ..., 4.12789631 , 6.98472023 , 7.79936218 ],\n",
" [-8.79256725 , -11.23776722, 0.00000000 , ..., 1.31829071 , 1.30352044 , 6.80789280 ]]])\n",
" [16.20891571, 16.42290306, 14.94398117, ..., 12.86083794, 12.63515949, 11.67581463],\n",
" [13.49345875, 14.14656067, 13.16498375, ..., 13.28024578, 13.40956783, 12.70357513],\n",
" [15.56265163, 15.92387581, 14.90643024, ..., 13.45694065, 13.44703197, 12.81099033]]])\n",
"audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
" [163, 173, 184, 190, 203])\n"
" [184, 194, 196, 204, 207])\n"
]
}
],
......@@ -464,6 +628,556 @@
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 8,
"id": "knowing-military",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'num_samples': 1, 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False, 'stride_ms': 10.0, 'window_ms': 25.0, 'sample_rate': 16000, 'manifest_path': 'examples/aishell/s1/data/manifest.train', 'output_path': 'examples/aishell/s1/data/mean_std.npz'}\n"
]
}
],
"source": [
"import sys\n",
"import argparse\n",
"import functools\n",
"from deepspeech.utils.utility import add_arguments, print_arguments\n",
"parser = argparse.ArgumentParser(description=__doc__)\n",
"add_arg = functools.partial(add_arguments, argparser=parser)\n",
"\n",
"add_arg('num_samples', int, 1, \"# of samples to for statistics.\")\n",
"add_arg('specgram_type', str, 'fbank',\n",
" \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
" choices=['linear', 'mfcc', 'fbank'])\n",
"add_arg('feat_dim', int, 80, \"Audio feature dim.\")\n",
"add_arg('delta_delta', bool, False,\"Audio feature with delta delta.\")\n",
"add_arg('stride_ms', float, 10.0, \"stride length in ms.\")\n",
"add_arg('window_ms', float, 25.0, \"stride length in ms.\")\n",
"add_arg('sample_rate', int, 16000, \"target sample rate.\")\n",
"add_arg('manifest_path', str,\n",
" 'examples/aishell/s1/data/manifest.train',\n",
" \"Filepath of manifest to compute normalizer's mean and stddev.\")\n",
"add_arg('output_path', str,\n",
" 'examples/aishell/s1/data/mean_std.npz',\n",
" \"Filepath of write mean and stddev to (.npz).\")\n",
"args = parser.parse_args([])\n",
"print(vars(args))\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "unnecessary-province",
"metadata": {},
"outputs": [],
"source": [
"\n",
"from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
"from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n",
"from deepspeech.frontend.normalizer import FeatureNormalizer\n",
"from deepspeech.frontend.audio import AudioSegment\n",
"from deepspeech.frontend.utility import load_cmvn\n",
"from deepspeech.frontend.utility import read_manifest\n",
"\n",
"\n",
"\n",
"def mean(args):\n",
" augmentation_pipeline = AugmentationPipeline('{}')\n",
" audio_featurizer = AudioFeaturizer(\n",
" specgram_type=args.specgram_type,\n",
" feat_dim=args.feat_dim,\n",
" delta_delta=args.delta_delta,\n",
" stride_ms=args.stride_ms,\n",
" window_ms=args.window_ms,\n",
" n_fft=None,\n",
" max_freq=None,\n",
" target_sample_rate=args.sample_rate,\n",
" use_dB_normalization=True,\n",
" target_dB=-20,\n",
" dither=0.0)\n",
"\n",
" def augment_and_featurize(audio_segment):\n",
" augmentation_pipeline.transform_audio(audio_segment)\n",
" return audio_featurizer.featurize(audio_segment)\n",
"\n",
" normalizer = FeatureNormalizer(\n",
" mean_std_filepath=None,\n",
" manifest_path=args.manifest_path,\n",
" featurize_func=augment_and_featurize,\n",
" num_samples=args.num_samples)\n",
" normalizer.write_to_file(args.output_path)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "interested-camping",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
"[54. 90. 77. ... 58. 58. 61.]\n",
"29746\n",
"fbank\n",
"[54 90 77 ... 58 58 61] int16\n",
"(184, 80) float64\n",
"[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n",
" 7.80671114]\n",
" [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n",
" 8.83860079]\n",
" [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n",
" 7.96168491]\n",
" ...\n",
" [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n",
" 8.75616521]\n",
" [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n",
" 7.69287184]\n",
" [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n",
" 8.16007108]]\n",
"(184, 80) float64\n",
"[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n",
" 7.80671114]\n",
" [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n",
" 8.83860079]\n",
" [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n",
" 7.96168491]\n",
" ...\n",
" [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n",
" 8.75616521]\n",
" [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n",
" 7.69287184]\n",
" [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n",
" 8.16007108]]\n"
]
}
],
"source": [
"wav='/workspace/DeepSpeech-2.x/examples/aishell/s1/../../..//examples/dataset/aishell/data_aishell/wav/test/S0916/BAC009S0916W0426.wav'\n",
"test='祝可爱的你'\n",
"audio_featurizer = AudioFeaturizer(\n",
" specgram_type=args.specgram_type,\n",
" feat_dim=args.feat_dim,\n",
" delta_delta=args.delta_delta,\n",
" stride_ms=args.stride_ms,\n",
" window_ms=args.window_ms,\n",
" n_fft=None,\n",
" max_freq=None,\n",
" target_sample_rate=args.sample_rate,\n",
" use_dB_normalization=False,\n",
" target_dB=-20,\n",
" dither=0.0)\n",
"samples = AudioSegment.from_file(wav)\n",
"print(samples._samples)\n",
"print(samples._samples * 2**15)\n",
"print(len(samples._samples))\n",
"feat = audio_featurizer.featurize(samples, False, False)\n",
"feat = feat.T\n",
"print(feat.shape, feat.dtype)\n",
"print(feat)\n",
"\n",
"from python_speech_features import logfbank\n",
"max_freq = args.sample_rate / 2\n",
"fbank_feat = logfbank(\n",
" signal=samples.to('int16'),\n",
" samplerate=args.sample_rate,\n",
" winlen=0.001 * args.window_ms,\n",
" winstep=0.001 * args.stride_ms,\n",
" nfilt=args.feat_dim,\n",
" nfft=512,\n",
" lowfreq=20,\n",
" highfreq=max_freq,\n",
" preemph=0.97,\n",
" dither=0.0,\n",
" wintype='povey')\n",
"print(fbank_feat.shape, fbank_feat.dtype)\n",
"print(fbank_feat)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "numeric-analyst",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(184, 160)\n",
"[ 8.59522397 8.43148278 8.36414052 8.45487173 8.31761643 8.04843683\n",
" 8.01683696 7.6574614 7.95521932 8.22945157 10.20138275 9.0447775\n",
" 9.14763398 9.18184349 9.03801065 9.04852307 8.67706728 8.71894271\n",
" 9.54553655 9.19535135 8.76413076 8.47828946 8.52586143 8.49469288\n",
" 8.72461247 8.28562879 8.11581393 7.99922156 7.91023364 8.04142296\n",
" 7.89762773 7.76257636 8.32043745 8.01592886 8.34109665 8.90115454\n",
" 8.48246945 7.98658664 8.05745122 8.11384088 8.18864479 8.8091827\n",
" 11.8067711 13.25258218 14.44311795 13.90515283 14.00120623 13.99801252\n",
" 13.81595394 13.6379904 13.3574897 13.14933334 12.96518543 13.02601156\n",
" 12.70246737 12.54410834 12.15615068 11.86574681 11.67497882 10.79645481\n",
" 10.48150035 10.03758575 10.05637027 9.92891308 10.06923218 12.43382431\n",
" 12.71428321 14.33135052 13.94470959 14.29188291 14.11483993 14.03496606\n",
" 13.78167331 13.66701466 14.40308625 14.73934137 15.09569382 14.89565815\n",
" 15.10519995 14.94383582 15.03275563 15.42194679 15.29219967 15.41602274\n",
" 15.39242545 15.76836177 16.259222 16.47777231 17.03366795 17.46165793\n",
" 17.52596217 17.78844031 17.99878075 18.11446843 17.95761578 17.99900337\n",
" 17.86282737 17.7290163 17.47686504 17.43425516 17.07750485 16.64395242\n",
" 15.68217043 14.90058399 14.45645737 14.0405463 14.89549542 16.00405781\n",
" 16.27301689 16.37572895 16.31219037 16.31765447 16.44819716 16.36281089\n",
" 16.24932823 15.79302555 14.76361963 13.95761882 13.48917053 13.45543501\n",
" 13.00091327 13.13854248 13.74596395 13.86340629 14.00656109 13.77432101\n",
" 13.64267001 13.35742634 13.23042234 12.97916104 12.80694468 12.70005006\n",
" 13.2802483 13.22644525 13.14579624 13.02536594 13.36511022 11.37167205\n",
" 12.11598045 12.47619798 12.83885973 11.63880287 11.42083924 11.08747705\n",
" 11.04093403 11.11263149 10.74353319 10.58734669 10.46180738 10.34157335\n",
" 9.63131146 9.70582692 9.29059204 8.94583657 8.66065094 8.46799095\n",
" 8.25064103 8.30239167 8.19463371 8.12104567 8.02731234 8.06412715\n",
" 7.84889951 7.73090283 7.74119562 7.85444657 7.80717312 7.7129933\n",
" 7.84087442 7.77907788 7.60660865 7.55051479 7.458385 7.496416\n",
" 7.69519793 7.49086759 7.32199493 8.01617458 7.58525375 7.06661122\n",
" 6.94653756 7.19874283 7.28515661 7.17574078]\n",
"(184,)\n",
"(184,)\n",
"[1.48370471 1.52174523 1.46984238 1.67010478 1.88757689 1.68825992\n",
" 1.74270259 1.55497318 1.29200818 1.68446481 1.88133219 1.97138928\n",
" 2.15910096 2.3149476 1.9820247 2.07694378 1.93498835 2.01493974\n",
" 2.39156824 2.02396518 1.69586449 1.63808752 1.64020228 1.43573473\n",
" 1.93092656 1.37466294 1.34704929 1.59600739 1.03960441 1.45276496\n",
" 1.59360131 1.57466343 1.89491479 1.79333746 1.32701974 1.49441767\n",
" 1.51466756 1.63497989 1.42858074 1.51135396 1.61077201 1.81066387\n",
" 1.83367783 2.3507094 2.87885378 3.26231227 2.1313117 1.98557548\n",
" 1.99105426 2.26150533 2.34298751 2.44621608 2.39201042 2.41226503\n",
" 2.5142992 3.03777565 2.81592295 2.75117863 2.78324175 2.68819666\n",
" 2.8945782 2.84464168 2.680973 2.78397395 2.47996808 1.71829563\n",
" 1.60636949 1.65992483 1.38122631 1.74831825 2.16006884 1.68076185\n",
" 1.69329487 1.44929837 1.63763312 1.80101076 2.01166253 2.03254244\n",
" 1.9583913 2.04542255 2.00859694 2.16600883 2.16095629 1.97541122\n",
" 2.13807632 2.06386436 2.2154187 2.84205688 2.54862449 2.64321545\n",
" 2.6805773 2.52300146 2.53209001 2.54682059 2.4521937 2.43155532\n",
" 2.42571275 2.23421289 2.23164529 2.23597192 2.14215121 2.10406703\n",
" 2.07962874 1.88506161 1.80092372 1.61156092 1.77426835 1.98765563\n",
" 2.0356793 1.87964187 1.779513 1.87187681 1.76463632 1.70978684\n",
" 1.76471778 1.75604749 1.62792552 1.73929352 1.6887024 1.8677704\n",
" 2.17342368 2.08166072 2.14567453 2.15936953 2.18351006 2.41010388\n",
" 2.26101752 2.25468001 2.23739715 2.15395133 2.04547813 1.92038843\n",
" 1.85491264 1.91905927 2.16709365 1.99924152 2.1850471 2.55461622\n",
" 2.72476673 1.69682926 1.73249614 2.06992695 2.1210591 1.66854454\n",
" 1.63907505 1.32203822 1.38992558 1.2436937 1.17932877 1.02963653\n",
" 1.26085036 1.16997132 1.09339504 1.14188689 1.18675772 1.31859788\n",
" 1.21746591 1.3872131 1.26095274 1.34885761 1.46633543 1.64506975\n",
" 1.36013821 1.45574721 1.43766588 1.65119054 1.57163772 1.55082968\n",
" 1.29413316 1.38351736 1.64234673 1.57186432 1.45381083 1.71204761\n",
" 1.51828607 1.30639985 1.32928395 1.49004237 1.6057589 1.81815735\n",
" 1.67784678 1.72180861 1.60703743 1.64850255]\n"
]
}
],
"source": [
"a = np.hstack([feat, feat])\n",
"print(a.shape)\n",
"m = np.mean(a, axis=1)\n",
"print(m)\n",
"print(m.shape)\n",
"std = np.std(a, axis=1)\n",
"print(std.shape)\n",
"print(std)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "nonprofit-potato",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 18,
"id": "hispanic-ethics",
"metadata": {},
"outputs": [],
"source": [
"import torch\n",
"import torchaudio\n",
"import torchaudio.compliance.kaldi as kaldi\n",
"import torchaudio.sox_effects as sox_effects\n",
"from torch.nn.utils.rnn import pad_sequence\n",
"torchaudio.set_audio_backend(\"sox\")"
]
},
{
"cell_type": "code",
"execution_count": 19,
"id": "changing-calvin",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"torch.Size([1, 29746])\n",
"tensor([[54., 90., 77., ..., 58., 58., 61.]])\n",
"(184, 80)\n",
"[[10.617376 10.077089 5.3248763 ... 10.248186 8.896992 7.8067265]\n",
" [11.044004 10.318072 6.3086634 ... 11.237308 10.358393 8.838616 ]\n",
" [10.269302 9.9963665 7.3296647 ... 10.451319 9.692951 7.9617033]\n",
" ...\n",
" [10.14497 9.886743 6.738012 ... 10.215809 9.0034275 8.756177 ]\n",
" [ 9.977456 9.679498 7.9066052 ... 10.224365 9.594568 7.6928873]\n",
" [ 6.4735703 7.7633557 7.7576594 ... 9.965221 9.622637 8.160085 ]]\n",
"-----------\n",
"[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
"(184, 80)\n",
"[[-10.177039 -10.717326 -15.46954 ... -10.546229 -11.897424 -12.987689]\n",
" [ -9.750411 -10.476343 -14.485752 ... -9.557108 -10.436023 -11.955799]\n",
" [-10.525113 -10.798049 -13.46475 ... -10.343097 -11.101464 -12.832712]\n",
" ...\n",
" [-10.649446 -10.907673 -14.056403 ... -10.578607 -11.790988 -12.038239]\n",
" [-10.816959 -11.114918 -12.88781 ... -10.570049 -11.199847 -13.101528]\n",
" [-14.320845 -13.03106 -13.036756 ... -10.829194 -11.171779 -12.634331]]\n",
"**************\n",
"[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
"[54. 90. 77. ... 58. 58. 61.] float32\n",
"(184, 80)\n",
"[[10.617376 10.077089 5.3248763 ... 10.248186 8.896992 7.8067265]\n",
" [11.044004 10.318072 6.3086634 ... 11.237308 10.358393 8.838616 ]\n",
" [10.269302 9.9963665 7.3296647 ... 10.451319 9.692951 7.9617033]\n",
" ...\n",
" [10.14497 9.886743 6.738012 ... 10.215809 9.0034275 8.756177 ]\n",
" [ 9.977456 9.679498 7.9066052 ... 10.224365 9.594568 7.6928873]\n",
" [ 6.4735703 7.7633557 7.7576594 ... 9.965221 9.622637 8.160085 ]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: torchaudio.backend.sox_backend.load_wav has been deprecated and will be removed from 0.9.0 release. Please use \"torchaudio.load\".\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
}
],
"source": [
"waveform, sample_rate = torchaudio.load_wav(wav)\n",
"print(waveform.shape)\n",
"print(waveform)\n",
"mat = kaldi.fbank(\n",
" waveform,\n",
" num_mel_bins=80,\n",
" frame_length=25,\n",
" frame_shift=10,\n",
" dither=0,\n",
" energy_floor=0.0,\n",
" sample_frequency=sample_rate\n",
" )\n",
"mat = mat.detach().numpy()\n",
"print(mat.shape)\n",
"print(mat)\n",
"\n",
"print('-----------')\n",
"print(samples._samples)\n",
"aud = torch.tensor(samples._samples).view(1, -1)\n",
"mat = kaldi.fbank(\n",
" aud,\n",
" num_mel_bins=80,\n",
" frame_length=25,\n",
" frame_shift=10,\n",
" dither=0,\n",
" energy_floor=0.0,\n",
" sample_frequency=sample_rate\n",
" )\n",
"mat = mat.detach().numpy()\n",
"print(mat.shape)\n",
"print(mat)\n",
"\n",
"print('**************')\n",
"print(samples._samples)\n",
"tmp = samples.to('int16').astype('float32')\n",
"print(tmp, tmp.dtype)\n",
"aud = torch.tensor(tmp).view(1, -1)\n",
"mat = kaldi.fbank(\n",
" aud,\n",
" num_mel_bins=80,\n",
" frame_length=25,\n",
" frame_shift=10,\n",
" dither=0,\n",
" energy_floor=0.0,\n",
" sample_frequency=sample_rate\n",
" )\n",
"mat = mat.detach().numpy()\n",
"print(mat.shape)\n",
"print(mat)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "buried-dependence",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "silver-printing",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 20,
"id": "outer-space",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(29746,)\n",
"[54 90 77 ... 58 58 61]\n",
"(184, 80)\n",
"[[10.61737914 10.07708936 5.32487528 ... 10.2481839 8.89699394\n",
" 7.80671114]\n",
" [11.0440077 10.3180721 6.30866128 ... 11.23730926 10.35838868\n",
" 8.83860079]\n",
" [10.26930555 9.99636567 7.3296638 ... 10.45131595 9.69295303\n",
" 7.96168491]\n",
" ...\n",
" [10.14497345 9.88674207 6.73801138 ... 10.21580627 9.00343472\n",
" 8.75616521]\n",
" [ 9.97745961 9.67949736 7.90660425 ... 10.22436653 9.59456493\n",
" 7.69287184]\n",
" [ 6.47357374 7.76335491 7.75765843 ... 9.96522077 9.6226365\n",
" 8.16007108]]\n",
"(184, 13)\n",
"[[ 14.73775998 -13.30393391 5.85974818 ... -3.42359739 2.82785335\n",
" 8.86862748]\n",
" [ 15.31274834 -13.33671651 4.06537223 ... 8.15970347 2.15934846\n",
" 6.78353115]\n",
" [ 13.82218765 -13.39296404 6.8304843 ... 2.55332563 8.86724453\n",
" -0.05919222]\n",
" ...\n",
" [ 13.5837844 -13.42104892 11.21222354 ... 4.81477718 1.66627505\n",
" 5.59045842]\n",
" [ 13.75757034 -13.92626662 13.06074011 ... -0.46694046 5.56214833\n",
" 12.0785146 ]\n",
" [ 11.92813809 -15.9169855 8.78372271 ... -1.42014277 -3.25768086\n",
" 0.88337965]]\n"
]
}
],
"source": [
"from python_speech_features import mfcc\n",
"from python_speech_features import delta\n",
"from python_speech_features import logfbank\n",
"import scipy.io.wavfile as iowav\n",
"\n",
"(rate,sig) = iowav.read(wav)\n",
"print(sig.shape)\n",
"print(sig)\n",
"\n",
"# note that generally nfilt=40 is used for speech recognition\n",
"fbank_feat = logfbank(sig,nfilt=80,lowfreq=20,dither=0,wintype='povey')\n",
"print(fbank_feat.shape)\n",
"print(fbank_feat)\n",
"\n",
"# the computed fbank coefficents of english.wav with dimension [110,23]\n",
"# [ 12.2865\t12.6906\t13.1765\t15.714\t16.064\t15.7553\t16.5746\t16.9205\t16.6472\t16.1302\t16.4576\t16.7326\t16.8864\t17.7215\t18.88\t19.1377\t19.1495\t18.6683\t18.3886\t20.3506\t20.2772\t18.8248\t18.1899\n",
"# 11.9198\t13.146\t14.7215\t15.8642\t17.4288\t16.394\t16.8238\t16.1095\t16.4297\t16.6331\t16.3163\t16.5093\t17.4981\t18.3429\t19.6555\t19.6263\t19.8435\t19.0534\t19.001\t20.0287\t19.7707\t19.5852\t19.1112\n",
"# ...\n",
"# ...\n",
"# the same with that using kaldi commands: compute-fbank-feats --dither=0.0\n",
"\n",
"mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')\n",
"print(mfcc_feat.shape)\n",
"print(mfcc_feat)\n",
"\n",
"# the computed mfcc coefficents of english.wav with dimension [110,13]\n",
"# [ 17.1337\t-23.3651\t-7.41751\t-7.73686\t-21.3682\t-8.93884\t-3.70843\t4.68346\t-16.0676\t12.782\t-7.24054\t8.25089\t10.7292\n",
"# 17.1692\t-23.3028\t-5.61872\t-4.0075\t-23.287\t-20.6101\t-5.51584\t-6.15273\t-14.4333\t8.13052\t-0.0345329\t2.06274\t-0.564298\n",
"# ...\n",
"# ...\n",
"# the same with that using kaldi commands: compute-mfcc-feats --dither=0.0"
]
},
{
"cell_type": "code",
"execution_count": 21,
"id": "sporting-school",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(184, 80)\n",
"[[-10.17703627 -10.71732606 -15.46954014 ... -10.54623152 -11.89742148\n",
" -12.98770428]\n",
" [ -9.75040771 -10.47634331 -14.48575413 ... -9.55710616 -10.43602673\n",
" -11.95581463]\n",
" [-10.52510987 -10.79804975 -13.46475161 ... -10.34309947 -11.10146239\n",
" -12.83273051]\n",
" ...\n",
" [-10.64944197 -10.90767335 -14.05640404 ... -10.57860915 -11.7909807\n",
" -12.03825021]\n",
" [-10.8169558 -11.11491806 -12.88781116 ... -10.57004889 -11.19985048\n",
" -13.10154358]\n",
" [-14.32084168 -13.03106051 -13.03675699 ... -10.82919465 -11.17177892\n",
" -12.63434434]]\n",
"(184, 13)\n",
"[[ -6.05665544 -13.30393391 5.85974818 ... -3.42359739 2.82785335\n",
" 8.86862748]\n",
" [ -5.48166707 -13.33671651 4.06537223 ... 8.15970347 2.15934846\n",
" 6.78353115]\n",
" [ -6.97222776 -13.39296404 6.8304843 ... 2.55332563 8.86724453\n",
" -0.05919222]\n",
" ...\n",
" [ -7.21063102 -13.42104892 11.21222354 ... 4.81477718 1.66627505\n",
" 5.59045842]\n",
" [ -7.03684508 -13.92626662 13.06074011 ... -0.46694046 5.56214833\n",
" 12.0785146 ]\n",
" [ -8.86627732 -15.9169855 8.78372271 ... -1.42014277 -3.25768086\n",
" 0.88337965]]\n"
]
}
],
"source": [
"fbank_feat = logfbank(samples._samples,nfilt=80,lowfreq=20,dither=0,wintype='povey')\n",
"print(fbank_feat.shape)\n",
"print(fbank_feat)\n",
"\n",
"mfcc_feat = mfcc(samples._samples,dither=0,useEnergy=True,wintype='povey')\n",
"print(mfcc_feat.shape)\n",
"print(mfcc_feat)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "restricted-license",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "specialized-threat",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
......
......@@ -637,7 +637,7 @@
{
"cell_type": "code",
"execution_count": 59,
"id": "engaged-offense",
"id": "first-release",
"metadata": {},
"outputs": [
{
......@@ -660,7 +660,7 @@
{
"cell_type": "code",
"execution_count": 35,
"id": "level-fairy",
"id": "convertible-roulette",
"metadata": {},
"outputs": [
{
......@@ -705,7 +705,7 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "beautiful-geometry",
"id": "cutting-fleece",
"metadata": {},
"outputs": [
{
......@@ -728,7 +728,7 @@
{
"cell_type": "code",
"execution_count": 4,
"id": "african-trustee",
"id": "historical-diving",
"metadata": {},
"outputs": [
{
......@@ -748,7 +748,7 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "ready-wages",
"id": "similar-spice",
"metadata": {},
"outputs": [],
"source": [
......@@ -758,7 +758,7 @@
{
"cell_type": "code",
"execution_count": 6,
"id": "distinguished-printer",
"id": "grand-influence",
"metadata": {},
"outputs": [
{
......@@ -776,7 +776,7 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "precious-limit",
"id": "wireless-hypothetical",
"metadata": {},
"outputs": [
{
......@@ -809,7 +809,7 @@
{
"cell_type": "code",
"execution_count": 17,
"id": "chemical-convenience",
"id": "designed-fluid",
"metadata": {},
"outputs": [
{
......@@ -839,7 +839,7 @@
{
"cell_type": "code",
"execution_count": 18,
"id": "round-remark",
"id": "cultural-friendship",
"metadata": {},
"outputs": [
{
......@@ -871,7 +871,7 @@
{
"cell_type": "code",
"execution_count": 19,
"id": "smaller-shower",
"id": "fossil-lotus",
"metadata": {},
"outputs": [
{
......@@ -903,7 +903,7 @@
{
"cell_type": "code",
"execution_count": 31,
"id": "integrated-block",
"id": "constitutional-poker",
"metadata": {},
"outputs": [
{
......@@ -935,7 +935,7 @@
{
"cell_type": "code",
"execution_count": 32,
"id": "favorite-failure",
"id": "threaded-strap",
"metadata": {},
"outputs": [
{
......@@ -966,7 +966,7 @@
{
"cell_type": "code",
"execution_count": 20,
"id": "boolean-saint",
"id": "infectious-welcome",
"metadata": {},
"outputs": [],
"source": [
......@@ -977,7 +977,7 @@
{
"cell_type": "code",
"execution_count": 46,
"id": "senior-hospital",
"id": "musical-anatomy",
"metadata": {},
"outputs": [
{
......@@ -997,7 +997,7 @@
{
"cell_type": "code",
"execution_count": 30,
"id": "consolidated-incident",
"id": "lucky-paraguay",
"metadata": {},
"outputs": [],
"source": [
......@@ -1007,7 +1007,7 @@
{
"cell_type": "code",
"execution_count": 31,
"id": "pursuant-paragraph",
"id": "annual-christmas",
"metadata": {},
"outputs": [],
"source": [
......@@ -1017,7 +1017,7 @@
{
"cell_type": "code",
"execution_count": 47,
"id": "mexican-apollo",
"id": "infectious-seeker",
"metadata": {},
"outputs": [
{
......@@ -1038,7 +1038,7 @@
{
"cell_type": "code",
"execution_count": 1,
"id": "encouraging-integration",
"id": "pregnant-conditioning",
"metadata": {},
"outputs": [],
"source": [
......@@ -1049,7 +1049,7 @@
{
"cell_type": "code",
"execution_count": 56,
"id": "trying-auckland",
"id": "logical-happiness",
"metadata": {},
"outputs": [],
"source": [
......@@ -1059,7 +1059,7 @@
{
"cell_type": "code",
"execution_count": 58,
"id": "national-edward",
"id": "rocky-plastic",
"metadata": {},
"outputs": [],
"source": [
......@@ -1069,7 +1069,7 @@
{
"cell_type": "code",
"execution_count": 60,
"id": "aerial-campaign",
"id": "focused-compensation",
"metadata": {},
"outputs": [],
"source": [
......@@ -1079,7 +1079,7 @@
{
"cell_type": "code",
"execution_count": 66,
"id": "instant-violence",
"id": "centered-repository",
"metadata": {},
"outputs": [],
"source": [
......@@ -1089,7 +1089,7 @@
{
"cell_type": "code",
"execution_count": 95,
"id": "medical-globe",
"id": "inner-invite",
"metadata": {},
"outputs": [
{
......@@ -1110,7 +1110,7 @@
{
"cell_type": "code",
"execution_count": 81,
"id": "three-contrast",
"id": "russian-chosen",
"metadata": {},
"outputs": [
{
......@@ -1131,7 +1131,7 @@
{
"cell_type": "code",
"execution_count": 11,
"id": "cross-atlas",
"id": "equal-particle",
"metadata": {},
"outputs": [],
"source": [
......@@ -1161,7 +1161,7 @@
{
"cell_type": "code",
"execution_count": 12,
"id": "empirical-defense",
"id": "tracked-purse",
"metadata": {},
"outputs": [],
"source": [
......@@ -1172,7 +1172,7 @@
{
"cell_type": "code",
"execution_count": 14,
"id": "rocky-listening",
"id": "steady-mileage",
"metadata": {},
"outputs": [
{
......@@ -1201,7 +1201,7 @@
{
"cell_type": "code",
"execution_count": 13,
"id": "surrounded-absolute",
"id": "regulated-google",
"metadata": {},
"outputs": [
{
......@@ -1230,7 +1230,7 @@
{
"cell_type": "code",
"execution_count": 15,
"id": "differential-surgery",
"id": "homeless-forge",
"metadata": {},
"outputs": [
{
......@@ -1260,7 +1260,7 @@
{
"cell_type": "code",
"execution_count": 29,
"id": "durable-powell",
"id": "exciting-blocking",
"metadata": {},
"outputs": [
{
......@@ -1290,7 +1290,7 @@
{
"cell_type": "code",
"execution_count": 30,
"id": "young-continuity",
"id": "through-botswana",
"metadata": {},
"outputs": [
{
......@@ -1308,7 +1308,7 @@
{
"cell_type": "code",
"execution_count": 22,
"id": "geological-sarah",
"id": "cellular-violence",
"metadata": {},
"outputs": [
{
......@@ -1343,7 +1343,7 @@
{
"cell_type": "code",
"execution_count": 23,
"id": "possible-angle",
"id": "undefined-parade",
"metadata": {},
"outputs": [
{
......@@ -1376,7 +1376,7 @@
{
"cell_type": "code",
"execution_count": 33,
"id": "novel-sucking",
"id": "special-delicious",
"metadata": {},
"outputs": [],
"source": [
......@@ -1386,7 +1386,7 @@
{
"cell_type": "code",
"execution_count": 34,
"id": "fixed-wallet",
"id": "seasonal-consensus",
"metadata": {},
"outputs": [
{
......@@ -1428,7 +1428,7 @@
{
"cell_type": "code",
"execution_count": 35,
"id": "north-seattle",
"id": "dress-distinction",
"metadata": {},
"outputs": [],
"source": [
......@@ -1438,7 +1438,7 @@
{
"cell_type": "code",
"execution_count": 38,
"id": "above-western",
"id": "rental-anthony",
"metadata": {},
"outputs": [
{
......@@ -1471,7 +1471,7 @@
{
"cell_type": "code",
"execution_count": 41,
"id": "choice-diabetes",
"id": "separated-restriction",
"metadata": {},
"outputs": [],
"source": [
......@@ -1481,7 +1481,7 @@
{
"cell_type": "code",
"execution_count": 3,
"id": "white-vessel",
"id": "painted-variable",
"metadata": {},
"outputs": [
{
......@@ -1504,7 +1504,7 @@
{
"cell_type": "code",
"execution_count": 5,
"id": "treated-freedom",
"id": "satellite-insider",
"metadata": {},
"outputs": [
{
......@@ -1523,7 +1523,7 @@
{
"cell_type": "code",
"execution_count": 7,
"id": "convinced-safety",
"id": "developed-thirty",
"metadata": {},
"outputs": [
{
......@@ -1543,7 +1543,7 @@
{
"cell_type": "code",
"execution_count": 8,
"id": "blond-bunny",
"id": "official-bench",
"metadata": {},
"outputs": [
{
......@@ -1560,10 +1560,97 @@
"print(sorted_val_scores)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "ranking-camera",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"b'\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x14\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x02\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x1e\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
"[ 1 20 2 30]\n",
"[[ 1 20]\n",
" [ 2 30]]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: tostring() is deprecated. Use tobytes() instead.\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:3: DeprecationWarning: The binary mode of fromstring is deprecated, as it behaves surprisingly on unicode inputs. Use frombuffer instead\n",
" This is separate from the ipykernel package so we can avoid doing imports until\n"
]
}
],
"source": [
"a = scores.tostring()\n",
"print(a)\n",
"b = np.fromstring(a, scores.dtype)\n",
"print(b)\n",
"print(scores)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "breeding-proxy",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"numpy.int16"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.int16"
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "coordinate-hungary",
"metadata": {},
"outputs": [],
"source": [
"dtype = np.dtype('int16')"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "specified-jackson",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"int16\n",
"16\n"
]
}
],
"source": [
"print(dtype)\n",
"dtype is np.int16\n",
"print(np.iinfo(dtype).bits)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "utility-monroe",
"id": "activated-insight",
"metadata": {},
"outputs": [],
"source": []
......@@ -3,6 +3,7 @@
hooks:
- id: yapf
files: \.py$
exclude: (?=third_party).*(\.py)$
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
......@@ -15,6 +16,7 @@
- id: trailing-whitespace
files: \.md$
- id: requirements-txt-fixer
exclude: (?=third_party).*$
- id: check-yaml
- id: check-json
- id: pretty-format-json
......@@ -27,6 +29,7 @@
- --ignore=E501,E228,E226,E261,E266,E128,E402,W503
- --builtins=G,request
- --jobs=1
exclude: (?=third_party).*(\.py)$
- repo : https://github.com/Lucas-C/pre-commit-hooks
sha: v1.0.1
hooks:
......@@ -51,8 +54,9 @@
entry: python .pre-commit-hooks/copyright-check.hook
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
#exclude: (?=decoders/swig).*(\.cpp|\.h)$
exclude: (?=third_party).*(\.cpp|\.h|\.py)$
- repo: https://github.com/asottile/reorder_python_imports
rev: v2.4.0
hooks:
- id: reorder-python-imports
exclude: (?=third_party).*(\.py)$
......@@ -298,6 +298,18 @@ class AudioSegment(object):
samples = self._convert_samples_from_float32(self._samples, dtype)
return samples.tostring()
def to(self, dtype='int16'):
"""Create a `dtype` audio content.
:param dtype: Data type for export samples. Options: 'int16', 'int32',
'float32', 'float64'. Default is 'float32'.
:type dtype: str
:return: np.ndarray containing `dtype` audio content.
:rtype: str
"""
samples = self._convert_samples_from_float32(self._samples, dtype)
return samples
def gain_db(self, gain):
"""Apply gain in decibels to samples.
......
......@@ -64,6 +64,7 @@ class SpecAugmentor(AugmentorBase):
self.n_freq_masks = n_freq_masks
self.n_time_masks = n_time_masks
self.p = p
#logger.info(f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}")
# adaptive SpecAugment
self.adaptive_number_ratio = adaptive_number_ratio
......
......@@ -56,7 +56,8 @@ class AudioFeaturizer(object):
max_freq=None,
target_sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
target_dB=-20,
dither=1.0):
self._specgram_type = specgram_type
# mfcc and fbank using `feat_dim`
self._feat_dim = feat_dim
......@@ -69,6 +70,7 @@ class AudioFeaturizer(object):
self._use_dB_normalization = use_dB_normalization
self._target_dB = target_dB
self._fft_point = n_fft
self._dither = dither
def featurize(self,
audio_segment,
......@@ -101,8 +103,7 @@ class AudioFeaturizer(object):
if self._use_dB_normalization:
audio_segment.normalize(target_db=self._target_dB)
# extract spectrogram
return self._compute_specgram(audio_segment.samples,
audio_segment.sample_rate)
return self._compute_specgram(audio_segment)
@property
def feature_size(self):
......@@ -125,9 +126,11 @@ class AudioFeaturizer(object):
"Supported values: linear." % self._specgram_type)
return feat_dim
def _compute_specgram(self, samples, sample_rate):
def _compute_specgram(self, audio_segment):
"""Extract various audio features."""
sample_rate = audio_segment.sample_rate
if self._specgram_type == 'linear':
samples = audio_segment.samples
return self._compute_linear_specgram(
samples,
sample_rate,
......@@ -135,6 +138,7 @@ class AudioFeaturizer(object):
window_ms=self._window_ms,
max_freq=self._max_freq)
elif self._specgram_type == 'mfcc':
samples = audio_segment.to('int16')
return self._compute_mfcc(
samples,
sample_rate,
......@@ -142,8 +146,10 @@ class AudioFeaturizer(object):
stride_ms=self._stride_ms,
window_ms=self._window_ms,
max_freq=self._max_freq,
dither=self._dither,
delta_delta=self._delta_delta)
elif self._specgram_type == 'fbank':
samples = audio_segment.to('int16')
return self._compute_fbank(
samples,
sample_rate,
......@@ -151,6 +157,7 @@ class AudioFeaturizer(object):
stride_ms=self._stride_ms,
window_ms=self._window_ms,
max_freq=self._max_freq,
dither=self._dither,
delta_delta=self._delta_delta)
else:
raise ValueError("Unknown specgram_type %s. "
......@@ -233,17 +240,18 @@ class AudioFeaturizer(object):
sample_rate,
feat_dim=13,
stride_ms=10.0,
window_ms=20.0,
window_ms=25.0,
max_freq=None,
dither=1.0,
delta_delta=True):
"""Compute mfcc from samples.
Args:
samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
samples (np.ndarray, np.int16): the audio signal from which to compute features.
sample_rate (float): the sample rate of the signal we are working with, in Hz.
feat_dim (int): the number of cepstrum to return, default 13.
stride_ms (float, optional): stride length in ms. Defaults to 10.0.
window_ms (float, optional): window length in ms. Defaults to 20.0.
window_ms (float, optional): window length in ms. Defaults to 25.0.
max_freq ([type], optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
delta_delta (bool, optional): Whether with delta delta. Defaults to False.
......@@ -270,14 +278,16 @@ class AudioFeaturizer(object):
winlen=0.001 * window_ms,
winstep=0.001 * stride_ms,
numcep=feat_dim,
nfilt=2 * feat_dim,
nfft=None,
lowfreq=0,
nfilt=23,
nfft=512,
lowfreq=20,
highfreq=max_freq,
dither=dither,
remove_dc_offset=True,
preemph=0.97,
ceplifter=22,
appendEnergy=True,
winfunc=lambda x: np.ones((x, )))
useEnergy=True,
winfunc='povey')
mfcc_feat = np.transpose(mfcc_feat)
if delta_delta:
mfcc_feat = self._concat_delta_delta(mfcc_feat)
......@@ -286,15 +296,16 @@ class AudioFeaturizer(object):
def _compute_fbank(self,
samples,
sample_rate,
feat_dim=26,
feat_dim=40,
stride_ms=10.0,
window_ms=20.0,
window_ms=25.0,
max_freq=None,
dither=1.0,
delta_delta=False):
"""Compute logfbank from samples.
Args:
samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array
sample_rate (float): the sample rate of the signal we are working with, in Hz.
feat_dim (int): the number of cepstrum to return, default 13.
stride_ms (float, optional): stride length in ms. Defaults to 10.0.
......@@ -325,9 +336,13 @@ class AudioFeaturizer(object):
winstep=0.001 * stride_ms,
nfilt=feat_dim,
nfft=512,
lowfreq=0,
lowfreq=20,
highfreq=max_freq,
preemph=0.97, )
dither=dither,
remove_dc_offset=True,
preemph=0.97,
wintype='povey')
fbank_feat = np.transpose(fbank_feat)
if delta_delta:
fbank_feat = self._concat_delta_delta(fbank_feat)
......
......@@ -82,12 +82,15 @@ class FeatureNormalizer(object):
def _read_mean_std_from_file(self, filepath, eps=1e-20):
"""Load mean and std from file."""
mean, std = load_cmvn(filepath, filetype='npz')
self._mean = mean
self._istd = 1.0 / std
self._mean = mean.T
self._istd = 1.0 / std.T
def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
"""Compute mean and std from randomly sampled instances."""
manifest = read_manifest(manifest_path)
if num_samples == -1:
sampled_manifest = manifest
else:
sampled_manifest = self._rng.sample(manifest, num_samples)
features = []
for instance in sampled_manifest:
......
......@@ -36,10 +36,12 @@ fi
# compute mean and stddev for normalizer
python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
--manifest_path="data/manifest.train.raw" \
--num_samples=2000 \
--specgram_type="fbank" \
--feat_dim=80 \
--delta_delta=false \
--stride_ms=10.0 \
--window_ms=25.0 \
--sample_rate=16000 \
--output_path="data/mean_std.npz"
if [ $? -ne 0 ]; then
......
[
{
"type": "speed",
"params": {
"min_speed_rate": 0.9,
"max_speed_rate": 1.1,
"num_rates": 3
},
"prob": 0.0
},
{
"type": "shift",
"params": {
......@@ -6,5 +15,20 @@
"max_shift_ms": 5
},
"prob": 1.0
},
{
"type": "specaug",
"params": {
"F": 10,
"T": 50,
"n_freq_masks": 2,
"n_time_masks": 2,
"p": 1.0,
"W": 80,
"adaptive_number_ratio": 0,
"adaptive_size_ratio": 0,
"max_n_time_masks": 20
},
"prob": 1.0
}
]
......@@ -54,4 +54,14 @@ if [ $? != 0 ]; then
exit -1
fi
# install kaldi-comptiable feature
pushd third_party/python_kaldi_features/
python setup.py install
if [ $? != 0 ]; then
error_msg "Please check why kaldi feature install error!"
exit -1
fi
popd
info_msg "Install all dependencies successfully."
* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
ref: https://zhuanlan.zhihu.com/p/55371926
The MIT License (MIT)
Copyright (c) 2013 James Lyons
Permission is hereby granted, free of charge, to any person obtaining a copy of
this software and associated documentation files (the "Software"), to deal in
the Software without restriction, including without limitation the rights to
use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
the Software, and to permit persons to whom the Software is furnished to do so,
subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
# file GENERATED by distutils, do NOT edit
setup.py
python_speech_features\__init__.py
python_speech_features\base.py
python_speech_features\sigproc.py
forked from `<https://github.com/jameslyons/python_speech_features>`_
check the readme therein for the usages
It has been modified to produce the same results as with the compute-mfcc-feats and compute-fbank-feats (check their default parameters first) commands in Kaldi.
-------------------------------
The compute-mfcc-feats pipeline:
src/featbin/Compute-mfcc-feats.cc
Mfcc mfcc(mfcc_opts) --> src/feat/Feature-mfcc.h
struct MfccOptions
typedef OfflineFeatureTpl<MfccComputer> Mfcc --> src/feat/Feature-common.h
MfccComputer() --> src/feat/Feature-mfcc.cc
ComputeDctMatrix() --> src/matrix/Matrix-functions.cc
ComputeLifterCoeffs() --> src/feat/Mel-computations.cc
for each utterance:
mfcc.ComputeFeatures()
src/feat/Feature-common-inl.h
   OfflineFeatureTpl<F>::ComputeFeatures()
Compute()
ExtractWindow() --> src/feat/Feature-window.cc
ProcessWindow()
Dither, remove_dc_offset, log_energy_pre_window, Preemphasize, window
           computer_.Compute() --> src/feat/Feature-mfcc.cc
MfccComputer::Compute()
                                        const MelBanks &mel_banks --> Mel-computations.cc
                                         srfft_
                                       
                                        ComputerPowerSpectrum()
mel_banks.Compute()
mel_energies_.ApplyLog()
dct, cepstral_lifter
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
# Author: James Lyons 2012
from __future__ import division
import numpy
from python_speech_features import sigproc
from scipy.fftpack import dct
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
ceplifter=22,useEnergy=True,wintype='povey'):
"""Compute MFCC features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param numcep: the number of cepstrum to return, default 13
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
wintype='hamming'):
"""Compute Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
winfunc=lambda x:numpy.ones((x,))
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
return feat,energy
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
"""Compute log Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
return numpy.log(feat)
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 1127 * numpy.log(1+hz/700.0)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700 * (numpy.exp(mel/1127.0)-1)
def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
:param nfilt: the number of filters in the filterbank, default 20.
:param nfft: the FFT size. Default is 512.
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
:param lowfreq: lowest band edge of mel filters, default 0 Hz
:param highfreq: highest band edge of mel filters, default samplerate/2
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
"""
highfreq= highfreq or samplerate/2
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
# compute points evenly spaced in mels
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
# check kaldi/src/feat/Mel-computations.h
fbank = numpy.zeros([nfilt,nfft//2+1])
mel_freq_delta = (highmel-lowmel)/(nfilt+1)
for j in range(0,nfilt):
leftmel = lowmel+j*mel_freq_delta
centermel = lowmel+(j+1)*mel_freq_delta
rightmel = lowmel+(j+2)*mel_freq_delta
for i in range(0,nfft//2):
mel=hz2mel(i*samplerate/nfft)
if mel>leftmel and mel<rightmel:
if mel<centermel:
fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
else:
fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
return fbank
def lifter(cepstra, L=22):
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
magnitude of the high frequency DCT coeffs.
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
"""
if L > 0:
nframes,ncoeff = numpy.shape(cepstra)
n = numpy.arange(ncoeff)
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
return lift*cepstra
else:
# values of L <= 0, do nothing
return cepstra
def delta(feat, N):
"""Compute delta features from a feature vector sequence.
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
:param N: For each frame, calculate delta features based on preceding and following N frames
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
NUMFRAMES = len(feat)
denominator = 2 * sum([i**2 for i in range(1, N+1)])
delta_feat = numpy.empty_like(feat)
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(NUMFRAMES):
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
# Author: James Lyons 2012
from __future__ import division
import numpy
from python_speech_features import sigproc
from scipy.fftpack import dct
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
winfunc=lambda x:numpy.ones((x,))):
"""Compute MFCC features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param numcep: the number of cepstrum to return, default 13
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
winfunc=lambda x:numpy.ones((x,))):
"""Compute Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
pspec = sigproc.powspec(frames,nfft)
energy = numpy.sum(pspec,1) # this stores the total energy in each frame
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
return feat,energy
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
"""Compute log Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
return numpy.log(feat)
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
winfunc=lambda x:numpy.ones((x,))):
"""Compute Spectral Subband Centroid features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
pspec = sigproc.powspec(frames,nfft)
pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
return numpy.dot(pspec*R,fb.T) / feat
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 2595 * numpy.log10(1+hz/700.)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700*(10**(mel/2595.0)-1)
def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
:param nfilt: the number of filters in the filterbank, default 20.
:param nfft: the FFT size. Default is 512.
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
:param lowfreq: lowest band edge of mel filters, default 0 Hz
:param highfreq: highest band edge of mel filters, default samplerate/2
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
"""
highfreq= highfreq or samplerate/2
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
# compute points evenly spaced in mels
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
# our points are in Hz, but we use fft bins, so we have to convert
# from Hz to fft bin number
bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
fbank = numpy.zeros([nfilt,nfft//2+1])
for j in range(0,nfilt):
for i in range(int(bin[j]), int(bin[j+1])):
fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
for i in range(int(bin[j+1]), int(bin[j+2])):
fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
return fbank
def lifter(cepstra, L=22):
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
magnitude of the high frequency DCT coeffs.
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
"""
if L > 0:
nframes,ncoeff = numpy.shape(cepstra)
n = numpy.arange(ncoeff)
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
return lift*cepstra
else:
# values of L <= 0, do nothing
return cepstra
def delta(feat, N):
"""Compute delta features from a feature vector sequence.
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
:param N: For each frame, calculate delta features based on preceding and following N frames
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
NUMFRAMES = len(feat)
denominator = 2 * sum([i**2 for i in range(1, N+1)])
delta_feat = numpy.empty_like(feat)
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(NUMFRAMES):
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat
# This file includes routines for basic signal processing including framing and computing power spectra.
# Author: James Lyons 2012
import decimal
import numpy
import math
import logging
def round_half_up(number):
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
def rolling_window(a, window, step=1):
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + (( slen - frame_len) // frame_step)
# check kaldi/src/feat/feature-window.h
padsignal = sig[:(numframes-1)*frame_step+frame_len]
if wintype is 'povey':
win = numpy.empty(frame_len)
for i in range(frame_len):
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
else: # the hamming window
win = numpy.hamming(frame_len)
if stride_trick:
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(win, (numframes, 1))
frames = frames.astype(numpy.float32)
raw_frames = numpy.zeros(frames.shape)
for frm in range(frames.shape[0]):
frames[frm,:] = do_dither(frames[frm,:], dither) # dither
frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset
raw_frames[frm,:] = frames[frm,:]
frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize
return frames * win, raw_frames
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
"""Does overlap-add procedure to undo the action of framesig.
:param frames: the array of frames.
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:returns: a 1-D signal.
"""
frame_len = round_half_up(frame_len)
frame_step = round_half_up(frame_step)
numframes = numpy.shape(frames)[0]
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
padlen = (numframes - 1) * frame_step + frame_len
if siglen <= 0: siglen = padlen
rec_signal = numpy.zeros((padlen,))
window_correction = numpy.zeros((padlen,))
win = winfunc(frame_len)
for i in range(0, numframes):
window_correction[indices[i, :]] = window_correction[
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
rec_signal = rec_signal / window_correction
return rec_signal[0:siglen]
def magspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warn(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return numpy.absolute(complex_spec)
def powspec(frames, NFFT):
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
"""
return numpy.square(magspec(frames, NFFT))
def logpowspec(frames, NFFT, norm=1):
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
"""
ps = powspec(frames, NFFT);
ps[ps <= 1e-30] = 1e-30
lps = 10 * numpy.log10(ps)
if norm:
return lps - numpy.max(lps)
else:
return lps
def do_dither(signal, dither_value=1.0):
signal += numpy.random.normal(size=signal.shape) * dither_value
return signal
def do_remove_dc_offset(signal):
signal -= numpy.mean(signal)
return signal
def do_preemphasis(signal, coeff=0.97):
"""perform preemphasis on the input signal.
:param signal: The signal to filter.
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
:returns: the filtered signal.
"""
return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
# This file includes routines for basic signal processing including framing and computing power spectra.
# Author: James Lyons 2012
import decimal
import numpy
import math
import logging
def round_half_up(number):
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
def rolling_window(a, window, step=1):
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
padlen = int((numframes - 1) * frame_step + frame_len)
zeros = numpy.zeros((padlen - slen,))
padsignal = numpy.concatenate((sig, zeros))
if stride_trick:
win = winfunc(frame_len)
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(winfunc(frame_len), (numframes, 1))
return frames * win
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
"""Does overlap-add procedure to undo the action of framesig.
:param frames: the array of frames.
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:returns: a 1-D signal.
"""
frame_len = round_half_up(frame_len)
frame_step = round_half_up(frame_step)
numframes = numpy.shape(frames)[0]
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
padlen = (numframes - 1) * frame_step + frame_len
if siglen <= 0: siglen = padlen
rec_signal = numpy.zeros((padlen,))
window_correction = numpy.zeros((padlen,))
win = winfunc(frame_len)
for i in range(0, numframes):
window_correction[indices[i, :]] = window_correction[
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
rec_signal = rec_signal / window_correction
return rec_signal[0:siglen]
def magspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warn(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return numpy.absolute(complex_spec)
def powspec(frames, NFFT):
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
"""
return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))
def logpowspec(frames, NFFT, norm=1):
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
"""
ps = powspec(frames, NFFT);
ps[ps <= 1e-30] = 1e-30
lps = 10 * numpy.log10(ps)
if norm:
return lps - numpy.max(lps)
else:
return lps
def preemphasis(signal, coeff=0.95):
"""perform preemphasis on the input signal.
:param signal: The signal to filter.
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
:returns: the filtered signal.
"""
return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
# Makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
PAPER =
BUILDDIR = build
# Internal variables.
PAPEROPT_a4 = -D latex_paper_size=a4
PAPEROPT_letter = -D latex_paper_size=letter
ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
help:
@echo "Please use \`make <target>' where <target> is one of"
@echo " html to make standalone HTML files"
@echo " dirhtml to make HTML files named index.html in directories"
@echo " pickle to make pickle files"
@echo " json to make JSON files"
@echo " htmlhelp to make HTML files and a HTML help project"
@echo " qthelp to make HTML files and a qthelp project"
@echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
@echo " changes to make an overview of all changed/added/deprecated items"
@echo " linkcheck to check all external links for integrity"
@echo " doctest to run all doctests embedded in the documentation (if enabled)"
clean:
-rm -rf $(BUILDDIR)/*
html:
$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
pickle:
$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
@echo
@echo "Build finished; now you can process the pickle files."
json:
$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
@echo
@echo "Build finished; now you can process the JSON files."
htmlhelp:
$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
@echo
@echo "Build finished; now you can run HTML Help Workshop with the" \
".hhp project file in $(BUILDDIR)/htmlhelp."
qthelp:
$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
@echo
@echo "Build finished; now you can run "qcollectiongenerator" with the" \
".qhcp project file in $(BUILDDIR)/qthelp, like this:"
@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python_speech_features.qhcp"
@echo "To view the help file:"
@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python_speech_features.qhc"
latex:
$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
@echo
@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
"run these through (pdf)latex."
changes:
$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
@echo
@echo "The overview file is in $(BUILDDIR)/changes."
linkcheck:
$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
@echo
@echo "Link check complete; look for any errors in the above output " \
"or in $(BUILDDIR)/linkcheck/output.txt."
doctest:
$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
@echo "Testing of doctests in the sources finished, look at the " \
"results in $(BUILDDIR)/doctest/output.txt."
@ECHO OFF
REM Command file for Sphinx documentation
set SPHINXBUILD=sphinx-build
set BUILDDIR=build
set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
if NOT "%PAPER%" == "" (
set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
)
if "%1" == "" goto help
if "%1" == "help" (
:help
echo.Please use `make ^<target^>` where ^<target^> is one of
echo. html to make standalone HTML files
echo. dirhtml to make HTML files named index.html in directories
echo. pickle to make pickle files
echo. json to make JSON files
echo. htmlhelp to make HTML files and a HTML help project
echo. qthelp to make HTML files and a qthelp project
echo. latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter
echo. changes to make an overview over all changed/added/deprecated items
echo. linkcheck to check all external links for integrity
echo. doctest to run all doctests embedded in the documentation if enabled
goto end
)
if "%1" == "clean" (
for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
del /q /s %BUILDDIR%\*
goto end
)
if "%1" == "html" (
%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/html.
goto end
)
if "%1" == "dirhtml" (
%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
echo.
echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
goto end
)
if "%1" == "pickle" (
%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
echo.
echo.Build finished; now you can process the pickle files.
goto end
)
if "%1" == "json" (
%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
echo.
echo.Build finished; now you can process the JSON files.
goto end
)
if "%1" == "htmlhelp" (
%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
echo.
echo.Build finished; now you can run HTML Help Workshop with the ^
.hhp project file in %BUILDDIR%/htmlhelp.
goto end
)
if "%1" == "qthelp" (
%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
echo.
echo.Build finished; now you can run "qcollectiongenerator" with the ^
.qhcp project file in %BUILDDIR%/qthelp, like this:
echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python_speech_features.qhcp
echo.To view the help file:
echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python_speech_features.ghc
goto end
)
if "%1" == "latex" (
%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
echo.
echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
goto end
)
if "%1" == "changes" (
%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
echo.
echo.The overview file is in %BUILDDIR%/changes.
goto end
)
if "%1" == "linkcheck" (
%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
echo.
echo.Link check complete; look for any errors in the above output ^
or in %BUILDDIR%/linkcheck/output.txt.
goto end
)
if "%1" == "doctest" (
%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
echo.
echo.Testing of doctests in the sources finished, look at the ^
results in %BUILDDIR%/doctest/output.txt.
goto end
)
:end
# -*- coding: utf-8 -*-
#
# python_speech_features documentation build configuration file, created by
# sphinx-quickstart on Thu Oct 31 16:49:58 2013.
#
# This file is execfile()d with the current directory set to its containing dir.
#
# Note that not all possible configuration values are present in this
# autogenerated file.
#
# All configuration values have a default; values that are commented out
# serve to show the default.
import sys, os
import mock
MOCK_MODULES = ['numpy', 'scipy', 'scipy.fftpack']
for mod_name in MOCK_MODULES:
sys.modules[mod_name] = mock.Mock()
# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0,os.path.abspath('../..'))
# -- General configuration -----------------------------------------------------
# Add any Sphinx extension module names here, as strings. They can be extensions
# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
extensions = ['sphinx.ext.autodoc']
# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']
# The suffix of source filenames.
source_suffix = '.rst'
# The encoding of source files.
#source_encoding = 'utf-8'
# The master toctree document.
master_doc = 'index'
# General information about the project.
project = u'python_speech_features'
copyright = u'2013, James Lyons'
# The version info for the project you're documenting, acts as replacement for
# |version| and |release|, also used in various other places throughout the
# built documents.
#
# The short X.Y version.
version = '0.1.0'
# The full version, including alpha/beta/rc tags.
release = '0.1.0'
# The language for content autogenerated by Sphinx. Refer to documentation
# for a list of supported languages.
#language = None
# There are two options for replacing |today|: either, you set today to some
# non-false value, then it is used:
#today = ''
# Else, today_fmt is used as the format for a strftime call.
#today_fmt = '%B %d, %Y'
# List of documents that shouldn't be included in the build.
#unused_docs = []
# List of directories, relative to source directory, that shouldn't be searched
# for source files.
exclude_trees = []
# The reST default role (used for this markup: `text`) to use for all documents.
#default_role = None
# If true, '()' will be appended to :func: etc. cross-reference text.
#add_function_parentheses = True
# If true, the current module name will be prepended to all description
# unit titles (such as .. function::).
#add_module_names = True
# If true, sectionauthor and moduleauthor directives will be shown in the
# output. They are ignored by default.
#show_authors = False
# The name of the Pygments (syntax highlighting) style to use.
pygments_style = 'sphinx'
# A list of ignored prefixes for module index sorting.
#modindex_common_prefix = []
# -- Options for HTML output ---------------------------------------------------
# The theme to use for HTML and HTML Help pages. Major themes that come with
# Sphinx are currently 'default' and 'sphinxdoc'.
html_theme = 'default'
# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
#html_theme_options = {}
# Add any paths that contain custom themes here, relative to this directory.
#html_theme_path = []
# The name for this set of Sphinx documents. If None, it defaults to
# "<project> v<release> documentation".
#html_title = None
# A shorter title for the navigation bar. Default is the same as html_title.
#html_short_title = None
# The name of an image file (relative to this directory) to place at the top
# of the sidebar.
#html_logo = None
# The name of an image file (within the static path) to use as favicon of the
# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
# pixels large.
#html_favicon = None
# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
# using the given strftime format.
#html_last_updated_fmt = '%b %d, %Y'
# If true, SmartyPants will be used to convert quotes and dashes to
# typographically correct entities.
#html_use_smartypants = True
# Custom sidebar templates, maps document names to template names.
#html_sidebars = {}
# Additional templates that should be rendered to pages, maps page names to
# template names.
#html_additional_pages = {}
# If false, no module index is generated.
#html_use_modindex = True
# If false, no index is generated.
#html_use_index = True
# If true, the index is split into individual pages for each letter.
#html_split_index = False
# If true, links to the reST sources are added to the pages.
#html_show_sourcelink = True
# If true, an OpenSearch description file will be output, and all pages will
# contain a <link> tag referring to it. The value of this option must be the
# base URL from which the finished HTML is served.
#html_use_opensearch = ''
# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
#html_file_suffix = ''
# Output file base name for HTML help builder.
htmlhelp_basename = 'python_speech_featuresdoc'
# -- Options for LaTeX output --------------------------------------------------
# The paper size ('letter' or 'a4').
#latex_paper_size = 'letter'
# The font size ('10pt', '11pt' or '12pt').
#latex_font_size = '10pt'
# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title, author, documentclass [howto/manual]).
latex_documents = [
('index', 'python_speech_features.tex', u'python\\_speech\\_features Documentation',
u'James Lyons', 'manual'),
]
# The name of an image file (relative to this directory) to place at the top of
# the title page.
#latex_logo = None
# For "manual" documents, if this is true, then toplevel headings are parts,
# not chapters.
#latex_use_parts = False
# Additional stuff for the LaTeX preamble.
#latex_preamble = ''
# Documents to append as an appendix to all manuals.
#latex_appendices = []
# If false, no module index is generated.
#latex_use_modindex = True
autodoc_member_order = 'bysource'
.. python_speech_features documentation master file, created by
sphinx-quickstart on Thu Oct 31 16:49:58 2013.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to python_speech_features's documentation!
==================================================
This library provides common speech features for ASR including MFCCs and filterbank energies.
If you are not sure what MFCCs are, and would like to know more have a look at this MFCC tutorial:
http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/.
You will need numpy and scipy to run these files. The code for this project is available at https://github.com/jameslyons/python_speech_features .
Supported features:
- :py:meth:`python_speech_features.mfcc` - Mel Frequency Cepstral Coefficients
- :py:meth:`python_speech_features.fbank` - Filterbank Energies
- :py:meth:`python_speech_features.logfbank` - Log Filterbank Energies
- :py:meth:`python_speech_features.ssc` - Spectral Subband Centroids
To use MFCC features::
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
(rate,sig) = wav.read("file.wav")
mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig,rate)
print(fbank_feat[1:3,:])
From here you can write the features to a file etc.
Functions provided in python_speech_features module
-------------------------------------
.. automodule:: python_speech_features.base
:members:
Functions provided in sigproc module
------------------------------------
.. automodule:: python_speech_features.sigproc
:members:
Indices and tables
==================
* :ref:`genindex`
* :ref:`search`
#!/usr/bin/env python
from python_speech_features import mfcc
from python_speech_features import delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
(rate,sig) = wav.read("english.wav")
# note that generally nfilt=40 is used for speech recognition
fbank_feat = logfbank(sig,nfilt=23,lowfreq=20,dither=0,wintype='povey')
# the computed fbank coefficents of english.wav with dimension [110,23]
# [ 12.2865 12.6906 13.1765 15.714 16.064 15.7553 16.5746 16.9205 16.6472 16.1302 16.4576 16.7326 16.8864 17.7215 18.88 19.1377 19.1495 18.6683 18.3886 20.3506 20.2772 18.8248 18.1899
# 11.9198 13.146 14.7215 15.8642 17.4288 16.394 16.8238 16.1095 16.4297 16.6331 16.3163 16.5093 17.4981 18.3429 19.6555 19.6263 19.8435 19.0534 19.001 20.0287 19.7707 19.5852 19.1112
# ...
# ...
# the same with that using kaldi commands: compute-fbank-feats --dither=0.0
mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')
# the computed mfcc coefficents of english.wav with dimension [110,13]
# [ 17.1337 -23.3651 -7.41751 -7.73686 -21.3682 -8.93884 -3.70843 4.68346 -16.0676 12.782 -7.24054 8.25089 10.7292
# 17.1692 -23.3028 -5.61872 -4.0075 -23.287 -20.6101 -5.51584 -6.15273 -14.4333 8.13052 -0.0345329 2.06274 -0.564298
# ...
# ...
# the same with that using kaldi commands: compute-mfcc-feats --dither=0.0
Metadata-Version: 1.0
Name: python-speech-features
Version: 0.6
Summary: Python Speech Feature extraction
Home-page: https://github.com/jameslyons/python_speech_features
Author: James Lyons
Author-email: james.lyons0@gmail.com
License: MIT
Description: UNKNOWN
Platform: UNKNOWN
README.rst
setup.py
python_speech_features/__init__.py
python_speech_features/base.py
python_speech_features/base_orig.py
python_speech_features/sigproc.py
python_speech_features/sigproc_orig.py
python_speech_features.egg-info/PKG-INFO
python_speech_features.egg-info/SOURCES.txt
python_speech_features.egg-info/dependency_links.txt
python_speech_features.egg-info/top_level.txt
test/test_sigproc.py
\ No newline at end of file
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
# Author: James Lyons 2012
from __future__ import division
import numpy
from python_speech_features import sigproc
from scipy.fftpack import dct
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
ceplifter=22,useEnergy=True,wintype='povey'):
"""Compute MFCC features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param numcep: the number of cepstrum to return, default 13
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97,
wintype='hamming'):
"""Compute Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
winfunc=lambda x:numpy.ones((x,))
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
return feat,energy
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
"""Compute log Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
return numpy.log(feat)
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 1127 * numpy.log(1+hz/700.0)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700 * (numpy.exp(mel/1127.0)-1)
def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
:param nfilt: the number of filters in the filterbank, default 20.
:param nfft: the FFT size. Default is 512.
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
:param lowfreq: lowest band edge of mel filters, default 0 Hz
:param highfreq: highest band edge of mel filters, default samplerate/2
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
"""
highfreq= highfreq or samplerate/2
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
# compute points evenly spaced in mels
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
# check kaldi/src/feat/Mel-computations.h
fbank = numpy.zeros([nfilt,nfft//2+1])
mel_freq_delta = (highmel-lowmel)/(nfilt+1)
for j in range(0,nfilt):
leftmel = lowmel+j*mel_freq_delta
centermel = lowmel+(j+1)*mel_freq_delta
rightmel = lowmel+(j+2)*mel_freq_delta
for i in range(0,nfft//2):
mel=hz2mel(i*samplerate/nfft)
if mel>leftmel and mel<rightmel:
if mel<centermel:
fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
else:
fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
return fbank
def lifter(cepstra, L=22):
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
magnitude of the high frequency DCT coeffs.
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
"""
if L > 0:
nframes,ncoeff = numpy.shape(cepstra)
n = numpy.arange(ncoeff)
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
return lift*cepstra
else:
# values of L <= 0, do nothing
return cepstra
def delta(feat, N):
"""Compute delta features from a feature vector sequence.
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
:param N: For each frame, calculate delta features based on preceding and following N frames
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
NUMFRAMES = len(feat)
denominator = 2 * sum([i**2 for i in range(1, N+1)])
delta_feat = numpy.empty_like(feat)
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(NUMFRAMES):
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat
# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
# Author: James Lyons 2012
from __future__ import division
import numpy
from python_speech_features import sigproc
from scipy.fftpack import dct
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
winfunc=lambda x:numpy.ones((x,))):
"""Compute MFCC features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param numcep: the number of cepstrum to return, default 13
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
:param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
feat = numpy.log(feat)
feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
feat = lifter(feat,ceplifter)
if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
return feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
winfunc=lambda x:numpy.ones((x,))):
"""Compute Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
second return value is the energy in each frame (total energy, unwindowed)
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
pspec = sigproc.powspec(frames,nfft)
energy = numpy.sum(pspec,1) # this stores the total energy in each frame
energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
return feat,energy
def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
"""Compute log Mel-filterbank energy features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
return numpy.log(feat)
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
winfunc=lambda x:numpy.ones((x,))):
"""Compute Spectral Subband Centroid features from an audio signal.
:param signal: the audio signal from which to compute features. Should be an N*1 array
:param samplerate: the samplerate of the signal we are working with.
:param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
:param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
:param nfilt: the number of filters in the filterbank, default 26.
:param nfft: the FFT size. Default is 512.
:param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
:param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
:param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
:param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
:returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
"""
highfreq= highfreq or samplerate/2
signal = sigproc.preemphasis(signal,preemph)
frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
pspec = sigproc.powspec(frames,nfft)
pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
return numpy.dot(pspec*R,fb.T) / feat
def hz2mel(hz):
"""Convert a value in Hertz to Mels
:param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Mels. If an array was passed in, an identical sized array is returned.
"""
return 2595 * numpy.log10(1+hz/700.)
def mel2hz(mel):
"""Convert a value in Mels to Hertz
:param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
:returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
"""
return 700*(10**(mel/2595.0)-1)
def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
"""Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
:param nfilt: the number of filters in the filterbank, default 20.
:param nfft: the FFT size. Default is 512.
:param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
:param lowfreq: lowest band edge of mel filters, default 0 Hz
:param highfreq: highest band edge of mel filters, default samplerate/2
:returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
"""
highfreq= highfreq or samplerate/2
assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
# compute points evenly spaced in mels
lowmel = hz2mel(lowfreq)
highmel = hz2mel(highfreq)
melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
# our points are in Hz, but we use fft bins, so we have to convert
# from Hz to fft bin number
bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
fbank = numpy.zeros([nfilt,nfft//2+1])
for j in range(0,nfilt):
for i in range(int(bin[j]), int(bin[j+1])):
fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
for i in range(int(bin[j+1]), int(bin[j+2])):
fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
return fbank
def lifter(cepstra, L=22):
"""Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
magnitude of the high frequency DCT coeffs.
:param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
:param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
"""
if L > 0:
nframes,ncoeff = numpy.shape(cepstra)
n = numpy.arange(ncoeff)
lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
return lift*cepstra
else:
# values of L <= 0, do nothing
return cepstra
def delta(feat, N):
"""Compute delta features from a feature vector sequence.
:param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
:param N: For each frame, calculate delta features based on preceding and following N frames
:returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
"""
if N < 1:
raise ValueError('N must be an integer >= 1')
NUMFRAMES = len(feat)
denominator = 2 * sum([i**2 for i in range(1, N+1)])
delta_feat = numpy.empty_like(feat)
padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge') # padded version of feat
for t in range(NUMFRAMES):
delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
return delta_feat
# This file includes routines for basic signal processing including framing and computing power spectra.
# Author: James Lyons 2012
import decimal
import numpy
import math
import logging
def round_half_up(number):
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
def rolling_window(a, window, step=1):
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + (( slen - frame_len) // frame_step)
# check kaldi/src/feat/feature-window.h
padsignal = sig[:(numframes-1)*frame_step+frame_len]
if wintype is 'povey':
win = numpy.empty(frame_len)
for i in range(frame_len):
win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
else: # the hamming window
win = numpy.hamming(frame_len)
if stride_trick:
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(win, (numframes, 1))
frames = frames.astype(numpy.float32)
raw_frames = numpy.zeros(frames.shape)
for frm in range(frames.shape[0]):
frames[frm,:] = do_dither(frames[frm,:], dither) # dither
frames[frm,:] = do_remove_dc_offset(frames[frm,:]) # remove dc offset
raw_frames[frm,:] = frames[frm,:]
frames[frm,:] = do_preemphasis(frames[frm,:], preemph) # preemphasize
return frames * win, raw_frames
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
"""Does overlap-add procedure to undo the action of framesig.
:param frames: the array of frames.
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:returns: a 1-D signal.
"""
frame_len = round_half_up(frame_len)
frame_step = round_half_up(frame_step)
numframes = numpy.shape(frames)[0]
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
padlen = (numframes - 1) * frame_step + frame_len
if siglen <= 0: siglen = padlen
rec_signal = numpy.zeros((padlen,))
window_correction = numpy.zeros((padlen,))
win = winfunc(frame_len)
for i in range(0, numframes):
window_correction[indices[i, :]] = window_correction[
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
rec_signal = rec_signal / window_correction
return rec_signal[0:siglen]
def magspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warn(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return numpy.absolute(complex_spec)
def powspec(frames, NFFT):
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
"""
return numpy.square(magspec(frames, NFFT))
def logpowspec(frames, NFFT, norm=1):
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
"""
ps = powspec(frames, NFFT);
ps[ps <= 1e-30] = 1e-30
lps = 10 * numpy.log10(ps)
if norm:
return lps - numpy.max(lps)
else:
return lps
def do_dither(signal, dither_value=1.0):
signal += numpy.random.normal(size=signal.shape) * dither_value
return signal
def do_remove_dc_offset(signal):
signal -= numpy.mean(signal)
return signal
def do_preemphasis(signal, coeff=0.97):
"""perform preemphasis on the input signal.
:param signal: The signal to filter.
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
:returns: the filtered signal.
"""
return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
# This file includes routines for basic signal processing including framing and computing power spectra.
# Author: James Lyons 2012
import decimal
import numpy
import math
import logging
def round_half_up(number):
return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
def rolling_window(a, window, step=1):
# http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
strides = a.strides + (a.strides[-1],)
return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
"""Frame a signal into overlapping frames.
:param sig: the audio signal to frame.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:param stride_trick: use stride trick to compute the rolling window and window multiplication faster
:returns: an array of frames. Size is NUMFRAMES by frame_len.
"""
slen = len(sig)
frame_len = int(round_half_up(frame_len))
frame_step = int(round_half_up(frame_step))
if slen <= frame_len:
numframes = 1
else:
numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
padlen = int((numframes - 1) * frame_step + frame_len)
zeros = numpy.zeros((padlen - slen,))
padsignal = numpy.concatenate((sig, zeros))
if stride_trick:
win = winfunc(frame_len)
frames = rolling_window(padsignal, window=frame_len, step=frame_step)
else:
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
frames = padsignal[indices]
win = numpy.tile(winfunc(frame_len), (numframes, 1))
return frames * win
def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
"""Does overlap-add procedure to undo the action of framesig.
:param frames: the array of frames.
:param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
:param frame_len: length of each frame measured in samples.
:param frame_step: number of samples after the start of the previous frame that the next frame should begin.
:param winfunc: the analysis window to apply to each frame. By default no window is applied.
:returns: a 1-D signal.
"""
frame_len = round_half_up(frame_len)
frame_step = round_half_up(frame_step)
numframes = numpy.shape(frames)[0]
assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
indices = numpy.array(indices, dtype=numpy.int32)
padlen = (numframes - 1) * frame_step + frame_len
if siglen <= 0: siglen = padlen
rec_signal = numpy.zeros((padlen,))
window_correction = numpy.zeros((padlen,))
win = winfunc(frame_len)
for i in range(0, numframes):
window_correction[indices[i, :]] = window_correction[
indices[i, :]] + win + 1e-15 # add a little bit so it is never zero
rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
rec_signal = rec_signal / window_correction
return rec_signal[0:siglen]
def magspec(frames, NFFT):
"""Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
"""
if numpy.shape(frames)[1] > NFFT:
logging.warn(
'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
numpy.shape(frames)[1], NFFT)
complex_spec = numpy.fft.rfft(frames, NFFT)
return numpy.absolute(complex_spec)
def powspec(frames, NFFT):
"""Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
"""
return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))
def logpowspec(frames, NFFT, norm=1):
"""Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
:param frames: the array of frames. Each row is a frame.
:param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
:param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
:returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
"""
ps = powspec(frames, NFFT);
ps[ps <= 1e-30] = 1e-30
lps = 10 * numpy.log10(ps)
if norm:
return lps - numpy.max(lps)
else:
return lps
def preemphasis(signal, coeff=0.95):
"""perform preemphasis on the input signal.
:param signal: The signal to filter.
:param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
:returns: the filtered signal.
"""
return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
try:
from setuptools import setup #enables develop
except ImportError:
from distutils.core import setup
setup(name='python_speech_features',
version='0.6',
description='Python Speech Feature extraction',
author='James Lyons',
author_email='james.lyons0@gmail.com',
license='MIT',
url='https://github.com/jameslyons/python_speech_features',
packages=['python_speech_features'],
)
from python_speech_features import sigproc
import unittest
import numpy as np
import time
class test_case(unittest.TestCase):
def test_frame_sig(self):
n = 10000124
frame_len = 37
frame_step = 13
x = np.random.rand(n)
t0 = time.time()
y_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=False)
t1 = time.time()
y_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=True)
t_new = time.time() - t1
t_old = t1 - t0
self.assertTupleEqual(y_old.shape, y_new.shape)
np.testing.assert_array_equal(y_old, y_new)
self.assertLess(t_new, t_old)
print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
def test_rolling(self):
x = np.arange(10)
y = sigproc.rolling_window(x, window=4, step=3)
y_expected = np.array([[0, 1, 2, 3],
[3, 4, 5, 6],
[6, 7, 8, 9]]
)
y = np.testing.assert_array_equal(y, y_expected)
......@@ -24,7 +24,7 @@ from deepspeech.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 2000, "# of samples to for statistics.")
add_arg('num_samples', int, -1, "# of samples to for statistics.")
add_arg('specgram_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册