kaldi fbank and mfcc

48f4bda3 · Hui Zhang · 281d46da · 48f4bda3 · 48f4bda3 · 48f4bda3
40 changed files
--- a/.flake8
+++ b/.flake8
@@ -12,6 +12,7 @@ exclude =
    .git,
    # python cache
    __pycache__,
+    third_party/,
 # Provide a comma-separate list of glob patterns to include for checks.
 filename =
    *.py
@@ -46,4 +47,4 @@ select =
    E,
    W,
    F,
-    C
\ No newline at end of file
+    C
--- a/.notebook/dataloader_with_tokens_tokenids.ipynb
+++ b/.notebook/dataloader_with_tokens_tokenids.ipynb
@@ -83,37 +83,39 @@
     "text": [
      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
      "  and should_run_async(code)\n",
-      "WARNING:root:register user softmax to paddle, remove this when fixed!\n",
-      "WARNING:root:register user log_softmax to paddle, remove this when fixed!\n",
-      "WARNING:root:register user sigmoid to paddle, remove this when fixed!\n",
-      "WARNING:root:register user log_sigmoid to paddle, remove this when fixed!\n",
-      "WARNING:root:register user relu to paddle, remove this when fixed!\n",
-      "WARNING:root:override cat of paddle if exists or register, remove this when fixed!\n",
-      "WARNING:root:override item of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "WARNING:root:override long of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "WARNING:root:override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "WARNING:root:override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "WARNING:root:override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
-      "WARNING:root:override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
-      "WARNING:root:register user view to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user view_as to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user masked_fill to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user fill_ to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user repeat to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user softmax to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user sigmoid to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user relu to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user type_as to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user to to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user float to paddle.Tensor, remove this when fixed!\n",
-      "WARNING:root:register user glu to paddle.nn.functional, remove this when fixed!\n",
-      "WARNING:root:override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
-      "WARNING:root:register user Module to paddle.nn, remove this when fixed!\n",
-      "WARNING:root:register user ModuleList to paddle.nn, remove this when fixed!\n",
-      "WARNING:root:register user GLU to paddle.nn, remove this when fixed!\n",
-      "WARNING:root:register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
-      "WARNING:root:register user export to paddle.jit, remove this when fixed!\n"
+      "[WARNING 2021/04/16 06:32:09 __init__.py:93] register user softmax to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:97] register user log_softmax to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:101] register user sigmoid to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:105] register user log_sigmoid to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:109] register user relu to paddle, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:119] override cat of paddle if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:133] override item of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:144] override long of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:164] override new_full of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:179] override eq of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:185] override eq of paddle if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:195] override contiguous of paddle.Tensor if exists or register, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:212] override size of paddle.Tensor (`to_static` do not process `size` property, maybe some `paddle` api dependent on it), remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:223] register user view to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:233] register user view_as to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:259] register user masked_fill to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:277] register user masked_fill_ to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:288] register user fill_ to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:298] register user repeat to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:303] register user softmax to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:308] register user sigmoid to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:312] register user relu to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:322] register user type_as to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:337] register user to to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:346] register user float to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:356] register user tolist to paddle.Tensor, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:371] register user glu to paddle.nn.functional, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:422] override ctc_loss of paddle.nn.functional if exists, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:428] register user Module to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:434] register user ModuleList to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:450] register user GLU to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:483] register user ConstantPad2d to paddle.nn, remove this when fixed!\n",
+      "[WARNING 2021/04/16 06:32:09 __init__.py:489] register user export to paddle.jit, remove this when fixed!\n"
     ]
    },
    {
@@ -191,6 +193,84 @@
  {
   "cell_type": "code",
   "execution_count": 4,
+   "id": "wired-principal",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'num_samples': 5, 'beam_size': 500, 'num_proc_bsearch': 8, 'num_conv_layers': 2, 'num_rnn_layers': 3, 'rnn_layer_size': 2048, 'alpha': 2.5, 'beta': 0.3, 'cutoff_prob': 1.0, 'cutoff_top_n': 40, 'use_gru': False, 'use_gpu': True, 'share_rnn_weights': True, 'unit_type': 'char', 'spm_model_prefix': 'examples/aishell/s1/data/spm_bpe', 'infer_manifest': 'examples/aishell/s1/data/manifest.test', 'mean_std_path': '', 'vocab_path': 'examples/aishell/s1/data/vocab.txt', 'lang_model_path': 'models/lm/common_crawl_00.prune01111.trie.klm', 'model_path': 'examples/aishell/s1/checkpoints/step_final', 'decoding_method': 'ctc_beam_search', 'error_rate_type': 'wer', 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import argparse\n",
+    "import functools\n",
+    "from deepspeech.utils.utility import add_arguments, print_arguments\n",
+    "parser = argparse.ArgumentParser(description=__doc__)\n",
+    "add_arg = functools.partial(add_arguments, argparser=parser)\n",
+    "# yapf: disable\n",
+    "add_arg('num_samples',      int,    5,     \"# of samples to infer.\")\n",
+    "add_arg('beam_size',        int,    500,    \"Beam search width.\")\n",
+    "add_arg('num_proc_bsearch', int,    8,      \"# of CPUs for beam search.\")\n",
+    "add_arg('num_conv_layers',  int,    2,      \"# of convolution layers.\")\n",
+    "add_arg('num_rnn_layers',   int,    3,      \"# of recurrent layers.\")\n",
+    "add_arg('rnn_layer_size',   int,    2048,   \"# of recurrent cells per layer.\")\n",
+    "add_arg('alpha',            float,  2.5,    \"Coef of LM for beam search.\")\n",
+    "add_arg('beta',             float,  0.3,    \"Coef of WC for beam search.\")\n",
+    "add_arg('cutoff_prob',      float,  1.0,    \"Cutoff probability for pruning.\")\n",
+    "add_arg('cutoff_top_n',     int,    40,     \"Cutoff number for pruning.\")\n",
+    "add_arg('use_gru',          bool,   False,  \"Use GRUs instead of simple RNNs.\")\n",
+    "add_arg('use_gpu',          bool,   True,   \"Use GPU or not.\")\n",
+    "add_arg('share_rnn_weights',bool,   True,   \"Share input-hidden weights across \"\n",
+    "                                            \"bi-directional RNNs. Not for GRU.\")\n",
+    "add_arg('unit_type',    str,\n",
+    "        'char',\n",
+    "        \"Options: char, word, spm.\",\n",
+    "        choices=['char', 'word', 'spm'])\n",
+    "add_arg('spm_model_prefix',    str,\n",
+    "        'examples/aishell/s1/data/spm_bpe',\n",
+    "        \"spm model prefix.\",)\n",
+    "add_arg('infer_manifest',   str,\n",
+    "        'examples/aishell/s1/data/manifest.test',\n",
+    "        \"Filepath of manifest to infer.\")\n",
+    "add_arg('mean_std_path',    str,\n",
+    "        '',\n",
+    "        \"examples/aishell/s1/data/mean_std.npz, Filepath of normalizer's mean & std.\")\n",
+    "add_arg('vocab_path',       str,\n",
+    "        'examples/aishell/s1/data/vocab.txt',\n",
+    "        \"Filepath of vocabulary.\")\n",
+    "add_arg('lang_model_path',  str,\n",
+    "        'models/lm/common_crawl_00.prune01111.trie.klm',\n",
+    "        \"Filepath for language model.\")\n",
+    "add_arg('model_path',       str,\n",
+    "        'examples/aishell/s1/checkpoints/step_final',\n",
+    "        \"If None, the training starts from scratch, \"\n",
+    "        \"otherwise, it resumes from the pre-trained model.\")\n",
+    "add_arg('decoding_method',  str,\n",
+    "        'ctc_beam_search',\n",
+    "        \"Decoding method. Options: ctc_beam_search, ctc_greedy\",\n",
+    "        choices = ['ctc_beam_search', 'ctc_greedy'])\n",
+    "add_arg('error_rate_type',  str,\n",
+    "        'wer',\n",
+    "        \"Error rate type for evaluation.\",\n",
+    "        choices=['wer', 'cer'])\n",
+    "add_arg('specgram_type',    str,\n",
+    "        'fbank',\n",
+    "        \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
+    "        choices=['linear', 'mfcc', 'fbank'])\n",
+    "add_arg('feat_dim',  int,  80, \"mfcc or fbank feat dim.\")\n",
+    "add_arg('delta_delta',  bool,  False, \"delta delta\")\n",
+    "# yapf: enable\n",
+    "args = parser.parse_args([])\n",
+    "print(vars(args))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
   "id": "bearing-physics",
   "metadata": {},
   "outputs": [
@@ -259,7 +339,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 6,
   "id": "classified-melissa",
   "metadata": {},
   "outputs": [
@@ -268,7 +348,31 @@
     "output_type": "stream",
     "text": [
      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel/ipkernel.py:283: DeprecationWarning: `should_run_async` will not call `transform_cell` automatically in the future. Please pass the result to `transformed_cell` argument and any exception that happen during thetransform in `preprocessing_exc_tuple` in IPython 7.17 and above.\n",
-      "  and should_run_async(code)\n",
+      "  and should_run_async(code)\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "fbank\n",
+      "[232 387 331 ... 249 249 262] int16\n",
+      "fbank\n",
+      "[-138 -219 -192 ...  338  324  351] int16\n",
+      "fbank\n",
+      "[ 694 1175 1022 ...  553  514  627] int16\n",
+      "fbank\n",
+      "[-39 -79 -53 ... 139 172  99] int16\n",
+      "fbank\n",
+      "[-277 -480 -425 ...  758  767  739] int16\n",
+      "fbank\n",
+      "[ 399  693  609 ... 1291 1270 1291] int16\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/paddle/fluid/dataloader/dataloader_iter.py:354: DeprecationWarning: `np.object` is a deprecated alias for the builtin `object`. To silence this warning, use `object` by itself. Doing this will not modify any behavior and is safe. \n",
      "Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations\n",
      "  if arr.dtype == np.object:\n"
@@ -278,58 +382,106 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [[116, 104, 101,  32, 116, 119, 101, 110, 116, 105, 101, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
-      "        [119, 104, 101, 114, 101,  32, 105, 115,  32, 116, 104,  97, 116, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
-      "        [116, 101, 110,  32, 115, 101,  99, 111, 110, 100, 115, -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 ],\n",
-      "        [104, 101,  32, 100, 111, 101, 115, 110,  39, 116,  32, 119, 111, 114, 107,  32,  97, 116,  32,  97, 108, 108, -1 ],\n",
-      "        [119, 104, 101, 114, 101,  32, 105, 115,  32, 109, 121,  32,  98, 114, 111, 116, 104, 101, 114,  32, 110, 111, 119]])\n",
-      "test raw: the twenties\n",
-      "test raw: where is my brother now\n",
+      "fbank\n",
+      "[ -750 -1254 -1107 ...  2276  1889  2067] int16\n",
+      "fbank\n",
+      "[ -127  -199  -149 ... -5243 -5065 -5398] int16\n",
+      "fbank\n",
+      "[ 465  783  677 ...  980  903 1008] int16\n",
+      "fbank\n",
+      "[ 90 160 157 ...  -2 -16 -21] int16\n",
+      "fbank\n",
+      "[ 213  345  295 ... 2483 2246 2501] int16\n",
+      "fbank\n",
+      "[ -86 -159 -131 ...  270  258  290] int16\n",
+      "fbank\n",
+      "[-1023 -1714 -1505 ...  1532  1596  1575] int16\n",
+      "fbank\n",
+      "[-366 -602 -527 ...  374  370  379] int16\n",
+      "fbank\n",
+      "[ 761 1275 1127 ...  369  413  295] int16\n",
+      "fbank\n",
+      "[382 621 550 ... 161 161 174] int16\n",
+      "fbank\n",
+      "[ -28  -91 -120 ...   28   34   11] int16\n",
+      "fbank\n",
+      "[ -5  -5  -5 ... 268 294 341] int16\n",
+      "fbank\n",
+      "[240 417 684 ... 267 262 219] int16\n",
+      "fbank\n",
+      "[131 206 194 ... 383 320 343] int16\n",
+      "test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [[31069, 21487, 29233, 30340, 20320, -1   , -1   ],\n",
+      "        [20540, 24471, 19968, 25552, 30340, 26159, -1   ],\n",
+      "        [36825, 20010, 31243, 24230, 26159, 32654, 30340],\n",
+      "        [20108, 21040, 20108, -1   , -1   , -1   , -1   ],\n",
+      "        [21435, 34892, 25919, 21270, -1   , -1   , -1   ]])\n",
+      "fbank\n",
+      "[1155 1890 1577 ... 1092  989 1130] int16\n",
+      "fbank\n",
+      "[296 358 296 ... 140 140 168] int16\n",
+      "fbank\n",
+      "[-50 -91 -63 ... 104 104  86] int16\n",
+      "fbank\n",
+      "[-37 -66 -50 ... -31 -45 -52] int16\n",
+      "fbank\n",
+      "[-401 -652 -547 ... -339 -307 -344] int16\n",
+      "fbank\n",
+      "[-21 -47 -51 ...  94  81 107] int16\n",
+      "fbank\n",
+      "[ 533  887  755 ... 3074 2853 3254] int16\n",
+      "fbank\n",
+      "[  44   71   66 ... -628 -733 -601] int16\n",
+      "fbank\n",
+      "[ 50  86  79 ... 129 116 138] int16\n",
+      "fbank\n",
+      "[  92  146  126 ... -208 -193 -179] int16\n",
+      "test raw: 祝可爱的你\n",
+      "test raw: 去行政化\n",
      "audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [163, 173, 184, 190, 203])\n",
+      "       [184, 194, 196, 204, 207])\n",
      "test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [12, 13, 11, 22, 23])\n",
-      "audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [[[-51.32406616, -17.91388321,  0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n",
-      "         [-15.19027233, -20.52460480,  0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n",
-      "         [-22.80181694, -19.48889351,  0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n",
+      "       [5, 6, 7, 3, 4])\n",
+      "audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [[[12.25633812, 12.61639309, 10.36936474, ..., 13.02949619, 11.51365757, 10.59789085],\n",
+      "         [13.32148266, 13.41071606, 11.43800735, ..., 13.69783783, 12.83939362, 11.51259613],\n",
+      "         [12.62640572, 12.53621101, 10.97212505, ..., 13.33757591, 12.32293034, 10.75493717],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-15.38297653, -18.95307732,  0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n",
-      "         [-14.06289291, -12.69954872,  0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n",
-      "         [-19.78544235, -11.63046265,  0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n",
+      "        [[10.99619484, 11.35202599, 9.56922054 , ..., 9.94971657 , 9.88354111 , 9.55315971 ],\n",
+      "         [10.44461155, 9.81688595 , 5.62538481 , ..., 10.60468388, 10.94417381, 9.42646980 ],\n",
+      "         [10.23835754, 10.23407459, 7.99464273 , ..., 10.68097591, 9.91640091 , 10.04131031],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-22.65289879, -21.11938667,  0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n",
-      "         [-31.04699135, -21.68680763,  0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n",
-      "         [-18.16406441, -17.50658417,  0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n",
+      "        [[14.10299397, 14.50298119, 12.87738323, ..., 12.62796497, 12.69949627, 11.43171215],\n",
+      "         [13.85035992, 13.15289116, 10.66541386, ..., 13.34364223, 13.46972179, 11.02160740],\n",
+      "         [13.19866467, 13.23537827, 11.65760899, ..., 12.72559357, 12.42716217, 11.74562359],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-16.17608452, -15.22302818,  0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n",
-      "         [-19.40717316, -12.32932186,  0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n",
-      "         [-11.01850796, -13.20147514,  0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n",
+      "        [[12.85668373, 12.82431412, 11.68144703, ..., 14.10119247, 15.12791920, 13.68221378],\n",
+      "         [13.19507027, 13.40244961, 11.43618393, ..., 13.32919979, 13.68267441, 12.73429012],\n",
+      "         [13.02173328, 12.92082500, 11.44303989, ..., 12.77793121, 13.10915661, 11.77327728],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-16.55369759, -16.95514297,  0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n",
-      "         [-19.51947975, -14.86818218,  0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n",
-      "         [-15.26447582, -22.26662445,  0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n",
+      "        [[12.90771198, 13.40234852, 13.01435471, ..., 13.80359459, 14.08088684, 13.17883396],\n",
+      "         [14.06678009, 14.06943512, 12.52837276, ..., 13.66423225, 13.66300583, 13.60142994],\n",
+      "         [12.58743191, 12.94520760, 11.75190544, ..., 14.28828907, 14.08229160, 13.02433395],\n",
      "         ...,\n",
-      "         [-4.81728077 , -10.65084648,  0.00000000 , ...,  3.19982862 ,  8.42359638 ,  7.95100546 ],\n",
-      "         [-7.54755068 , -12.56441689,  0.00000000 , ...,  4.12789631 ,  6.98472023 ,  7.79936218 ],\n",
-      "         [-8.79256725 , -11.23776722,  0.00000000 , ...,  1.31829071 ,  1.30352044 ,  6.80789280 ]]])\n"
+      "         [16.20896912, 16.42283821, 14.94358730, ..., 12.91146755, 12.66766262, 11.76361752],\n",
+      "         [13.49324894, 14.14653301, 13.16490936, ..., 13.23435783, 13.45378494, 12.60386276],\n",
+      "         [15.56288910, 15.92445087, 14.90794277, ..., 13.43840790, 13.41075516, 12.55605984]]])\n"
     ]
    }
   ],
@@ -354,7 +506,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 7,
   "id": "minus-modern",
   "metadata": {},
   "outputs": [
@@ -362,58 +514,70 @@
     "name": "stdout",
     "output_type": "stream",
     "text": [
-      "test: Tensor(shape=[5, 23], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [[87, 37, 26,  1, 87, 97, 26, 61, 87, 38, 26, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
-      "        [97, 37, 26, 79, 26,  1, 38, 82,  1, 87, 37,  3, 87, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
-      "        [87, 26, 61,  1, 82, 26, 18, 64, 61, 25, 82, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1],\n",
-      "        [37, 26,  1, 25, 64, 26, 82, 61,  2, 87,  1, 97, 64, 79, 52,  1,  3, 87,  1,  3, 53, 53, -1],\n",
-      "        [97, 37, 26, 79, 26,  1, 38, 82,  1, 58, 102,  1, 17, 79, 64, 87, 37, 26, 79,  1, 61, 64, 97]])\n",
-      "test raw: W%\u001a\u0001Wa\u001a=W&\u001aR\n",
-      "test raw: a%\u001aO\u001a\u0001&R\u0001:f\u0001\u0011O@W%\u001aO\u0001=@a\n",
+      "fbank\n",
+      "[232 387 331 ... 249 249 262] int16\n",
+      "fbank\n",
+      "[-138 -219 -192 ...  338  324  351] int16\n",
+      "fbank\n",
+      "[ 694 1175 1022 ...  553  514  627] int16\n",
+      "fbank\n",
+      "[-39 -79 -53 ... 139 172  99] int16\n",
+      "fbank\n",
+      "[-277 -480 -425 ...  758  767  739] int16\n",
+      "fbank\n",
+      "test: Tensor(shape=[5, 7], dtype=int32, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [[2695,  505, 2332, 2553,  169, -1  , -1  ],\n",
+      "        [ 230, 1237,  2  , 1556, 2553, 1694, -1  ],\n",
+      "        [3703,  28 , 2739, 1172, 1694, 2966, 2553],\n",
+      "        [ 70 ,  355,  70 , -1  , -1  , -1  , -1  ],\n",
+      "        [ 477, 3363, 1621,  412, -1  , -1  , -1  ]])\n",
+      "[ 399  693  609 ... 1291 1270 1291] int16\n",
+      "test raw: ઇǹज৹©\n",
+      "test raw: ǝണٕƜ\n",
      "test len: Tensor(shape=[5], dtype=int64, place=CUDAPlace(0), stop_gradient=True,\n",
-      "       [12, 13, 11, 22, 23])\n",
-      "audio: Tensor(shape=[5, 203, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [[[-51.32406616, -17.91388321,  0.00000000 , ..., -26.66350746, -27.46039391, -27.22303963],\n",
-      "         [-15.19027233, -20.52460480,  0.00000000 , ..., -28.47811317, -26.87953568, -25.13592339],\n",
-      "         [-22.80181694, -19.48889351,  0.00000000 , ..., -29.96320724, -25.96619034, -24.57164192],\n",
+      "       [5, 6, 7, 3, 4])\n",
+      "audio: Tensor(shape=[5, 207, 80], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,\n",
+      "       [[[12.25794601, 12.61855793, 10.37306023, ..., 13.12571049, 11.53678799, 10.32210350],\n",
+      "         [13.32333183, 13.41336918, 11.44248962, ..., 13.65861225, 12.79308128, 11.31168747],\n",
+      "         [12.62584686, 12.53506088, 10.96861362, ..., 13.32526493, 12.41560936, 10.71458912],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-15.38297653, -18.95307732,  0.00000000 , ..., -15.22777271, -16.46900940, -12.32327461],\n",
-      "         [-14.06289291, -12.69954872,  0.00000000 , ..., -15.68012810, -16.92030334, -13.49134445],\n",
-      "         [-19.78544235, -11.63046265,  0.00000000 , ..., -14.35409069, -14.82787228, -15.72653484],\n",
+      "        [[11.00003052, 11.35529137, 9.56384087 , ..., 10.06063652, 10.16322994, 9.43149185 ],\n",
+      "         [10.44556236, 9.81155300 , 5.49400425 , ..., 10.84116268, 11.02734756, 9.42253590 ],\n",
+      "         [10.23620510, 10.23321152, 7.99466419 , ..., 10.93381882, 10.28395081, 10.00841141],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-22.65289879, -21.11938667,  0.00000000 , ..., -31.80981827, -30.58669853, -28.68988228],\n",
-      "         [-31.04699135, -21.68680763,  0.00000000 , ..., -29.90789604, -30.31726456, -30.99709320],\n",
-      "         [-18.16406441, -17.50658417,  0.00000000 , ..., -29.47821617, -29.77137375, -30.45121002],\n",
+      "        [[14.10379314, 14.50375748, 12.87825108, ..., 12.68065739, 12.62359715, 11.53773308],\n",
+      "         [13.84964657, 13.15079498, 10.67198086, ..., 13.24875164, 13.45796680, 10.97363472],\n",
+      "         [13.19808197, 13.23482990, 11.65900230, ..., 12.70375061, 12.41395664, 11.88668156],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-16.17608452, -15.22302818,  0.00000000 , ..., -8.82944202 , -7.88900328 , -6.10806322 ],\n",
-      "         [-19.40717316, -12.32932186,  0.00000000 , ..., -8.05214977 , -8.03145599 , -7.35137606 ],\n",
-      "         [-11.01850796, -13.20147514,  0.00000000 , ..., -9.65334892 , -8.96987629 , -9.13897228 ],\n",
+      "        [[12.85676289, 12.82410812, 11.67961884, ..., 14.12018299, 15.14850044, 13.80065727],\n",
+      "         [13.19532776, 13.40243340, 11.43492508, ..., 13.29144669, 13.70278549, 12.67841339],\n",
+      "         [13.02196407, 12.92111111, 11.43998623, ..., 12.71165752, 13.16518497, 11.92028046],\n",
      "         ...,\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ],\n",
-      "         [ 0.         ,  0.         ,  0.         , ...,  0.         ,  0.         ,  0.         ]],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ],\n",
+      "         [0.         , 0.         , 0.         , ..., 0.         , 0.         , 0.         ]],\n",
      "\n",
-      "        [[-16.55369759, -16.95514297,  0.00000000 , ..., -7.00301647 , -6.53273058 , -10.14600754],\n",
-      "         [-19.51947975, -14.86818218,  0.00000000 , ..., -6.82891273 , -6.22576237 , -9.42883873 ],\n",
-      "         [-15.26447582, -22.26662445,  0.00000000 , ..., -13.31693172, -11.05612659, -12.70977211],\n",
+      "        [[12.90661621, 13.40162563, 13.01394463, ..., 13.84056377, 14.11240959, 13.21227264],\n",
+      "         [14.06642914, 14.06922340, 12.52955723, ..., 13.55829811, 13.60157204, 13.50268650],\n",
+      "         [12.58881378, 12.94780254, 11.75758171, ..., 14.29055786, 14.12165928, 13.02695847],\n",
      "         ...,\n",
-      "         [-4.81728077 , -10.65084648,  0.00000000 , ...,  3.19982862 ,  8.42359638 ,  7.95100546 ],\n",
-      "         [-7.54755068 , -12.56441689,  0.00000000 , ...,  4.12789631 ,  6.98472023 ,  7.79936218 ],\n",
-      "         [-8.79256725 , -11.23776722,  0.00000000 , ...,  1.31829071 ,  1.30352044 ,  6.80789280 ]]])\n",
+      "         [16.20891571, 16.42290306, 14.94398117, ..., 12.86083794, 12.63515949, 11.67581463],\n",
+      "         [13.49345875, 14.14656067, 13.16498375, ..., 13.28024578, 13.40956783, 12.70357513],\n",
+      "         [15.56265163, 15.92387581, 14.90643024, ..., 13.45694065, 13.44703197, 12.81099033]]])\n",
      "audio len: Tensor(shape=[5], dtype=int64, place=CUDAPinnedPlace, stop_gradient=True,\n",
-      "       [163, 173, 184, 190, 203])\n"
+      "       [184, 194, 196, 204, 207])\n"
     ]
    }
   ],
@@ -464,6 +628,556 @@
   "metadata": {},
   "outputs": [],
   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "knowing-military",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "{'num_samples': 1, 'specgram_type': 'fbank', 'feat_dim': 80, 'delta_delta': False, 'stride_ms': 10.0, 'window_ms': 25.0, 'sample_rate': 16000, 'manifest_path': 'examples/aishell/s1/data/manifest.train', 'output_path': 'examples/aishell/s1/data/mean_std.npz'}\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import argparse\n",
+    "import functools\n",
+    "from deepspeech.utils.utility import add_arguments, print_arguments\n",
+    "parser = argparse.ArgumentParser(description=__doc__)\n",
+    "add_arg = functools.partial(add_arguments, argparser=parser)\n",
+    "\n",
+    "add_arg('num_samples',      int,    1,    \"# of samples to use for statistics.\")\n",
+    "add_arg('specgram_type',    str,   'fbank',\n",
+    "        \"Audio feature type. Options: linear, mfcc, fbank.\",\n",
+    "        choices=['linear', 'mfcc', 'fbank'])\n",
+    "add_arg('feat_dim',    int, 80, \"Audio feature dim.\")\n",
+    "add_arg('delta_delta',    bool, False,\"Audio feature with delta delta.\")\n",
+    "add_arg('stride_ms',    float, 10.0,  \"stride length in ms.\")\n",
+    "add_arg('window_ms',    float, 25.0,  \"window length in ms.\")\n",
+    "add_arg('sample_rate',    int, 16000,  \"target sample rate.\")\n",
+    "add_arg('manifest_path',    str,\n",
+    "        'examples/aishell/s1/data/manifest.train',\n",
+    "        \"Filepath of manifest to compute normalizer's mean and stddev.\")\n",
+    "add_arg('output_path',    str,\n",
+    "        'examples/aishell/s1/data/mean_std.npz',\n",
+    "        \"Filepath to write mean and stddev to (.npz).\")\n",
+    "args = parser.parse_args([])\n",
+    "print(vars(args))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "unnecessary-province",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "\n",
+    "from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline\n",
+    "from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer\n",
+    "from deepspeech.frontend.normalizer import FeatureNormalizer\n",
+    "from deepspeech.frontend.audio import AudioSegment\n",
+    "from deepspeech.frontend.utility import load_cmvn\n",
+    "from deepspeech.frontend.utility import read_manifest\n",
+    "\n",
+    "\n",
+    "\n",
+    "def mean(args):\n",
+    "    augmentation_pipeline = AugmentationPipeline('{}')\n",
+    "    audio_featurizer = AudioFeaturizer(\n",
+    "        specgram_type=args.specgram_type,\n",
+    "        feat_dim=args.feat_dim,\n",
+    "        delta_delta=args.delta_delta,\n",
+    "        stride_ms=args.stride_ms,\n",
+    "        window_ms=args.window_ms,\n",
+    "        n_fft=None,\n",
+    "        max_freq=None,\n",
+    "        target_sample_rate=args.sample_rate,\n",
+    "        use_dB_normalization=True,\n",
+    "        target_dB=-20,\n",
+    "        dither=0.0)\n",
+    "\n",
+    "    def augment_and_featurize(audio_segment):\n",
+    "        augmentation_pipeline.transform_audio(audio_segment)\n",
+    "        return audio_featurizer.featurize(audio_segment)\n",
+    "\n",
+    "    normalizer = FeatureNormalizer(\n",
+    "        mean_std_filepath=None,\n",
+    "        manifest_path=args.manifest_path,\n",
+    "        featurize_func=augment_and_featurize,\n",
+    "        num_samples=args.num_samples)\n",
+    "    normalizer.write_to_file(args.output_path)\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "interested-camping",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
+      "[54. 90. 77. ... 58. 58. 61.]\n",
+      "29746\n",
+      "fbank\n",
+      "[54 90 77 ... 58 58 61] int16\n",
+      "(184, 80) float64\n",
+      "[[10.61737914 10.07708936  5.32487528 ... 10.2481839   8.89699394\n",
+      "   7.80671114]\n",
+      " [11.0440077  10.3180721   6.30866128 ... 11.23730926 10.35838868\n",
+      "   8.83860079]\n",
+      " [10.26930555  9.99636567  7.3296638  ... 10.45131595  9.69295303\n",
+      "   7.96168491]\n",
+      " ...\n",
+      " [10.14497345  9.88674207  6.73801138 ... 10.21580627  9.00343472\n",
+      "   8.75616521]\n",
+      " [ 9.97745961  9.67949736  7.90660425 ... 10.22436653  9.59456493\n",
+      "   7.69287184]\n",
+      " [ 6.47357374  7.76335491  7.75765843 ...  9.96522077  9.6226365\n",
+      "   8.16007108]]\n",
+      "(184, 80) float64\n",
+      "[[10.61737914 10.07708936  5.32487528 ... 10.2481839   8.89699394\n",
+      "   7.80671114]\n",
+      " [11.0440077  10.3180721   6.30866128 ... 11.23730926 10.35838868\n",
+      "   8.83860079]\n",
+      " [10.26930555  9.99636567  7.3296638  ... 10.45131595  9.69295303\n",
+      "   7.96168491]\n",
+      " ...\n",
+      " [10.14497345  9.88674207  6.73801138 ... 10.21580627  9.00343472\n",
+      "   8.75616521]\n",
+      " [ 9.97745961  9.67949736  7.90660425 ... 10.22436653  9.59456493\n",
+      "   7.69287184]\n",
+      " [ 6.47357374  7.76335491  7.75765843 ...  9.96522077  9.6226365\n",
+      "   8.16007108]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "wav='/workspace/DeepSpeech-2.x/examples/aishell/s1/../../..//examples/dataset/aishell/data_aishell/wav/test/S0916/BAC009S0916W0426.wav'\n",
+    "test='祝可爱的你'\n",
+    "audio_featurizer = AudioFeaturizer(\n",
+    "        specgram_type=args.specgram_type,\n",
+    "        feat_dim=args.feat_dim,\n",
+    "        delta_delta=args.delta_delta,\n",
+    "        stride_ms=args.stride_ms,\n",
+    "        window_ms=args.window_ms,\n",
+    "        n_fft=None,\n",
+    "        max_freq=None,\n",
+    "        target_sample_rate=args.sample_rate,\n",
+    "        use_dB_normalization=False,\n",
+    "        target_dB=-20,\n",
+    "        dither=0.0)\n",
+    "samples = AudioSegment.from_file(wav)\n",
+    "print(samples._samples)\n",
+    "print(samples._samples * 2**15)\n",
+    "print(len(samples._samples))\n",
+    "feat = audio_featurizer.featurize(samples, False, False)\n",
+    "feat = feat.T\n",
+    "print(feat.shape, feat.dtype)\n",
+    "print(feat)\n",
+    "\n",
+    "from python_speech_features import logfbank\n",
+    "max_freq = args.sample_rate / 2\n",
+    "fbank_feat = logfbank(\n",
+    "            signal=samples.to('int16'),\n",
+    "            samplerate=args.sample_rate,\n",
+    "            winlen=0.001 * args.window_ms,\n",
+    "            winstep=0.001 * args.stride_ms,\n",
+    "            nfilt=args.feat_dim,\n",
+    "            nfft=512,\n",
+    "            lowfreq=20,\n",
+    "            highfreq=max_freq,\n",
+    "            preemph=0.97,\n",
+    "            dither=0.0,\n",
+    "            wintype='povey')\n",
+    "print(fbank_feat.shape, fbank_feat.dtype)\n",
+    "print(fbank_feat)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "numeric-analyst",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(184, 160)\n",
+      "[ 8.59522397  8.43148278  8.36414052  8.45487173  8.31761643  8.04843683\n",
+      "  8.01683696  7.6574614   7.95521932  8.22945157 10.20138275  9.0447775\n",
+      "  9.14763398  9.18184349  9.03801065  9.04852307  8.67706728  8.71894271\n",
+      "  9.54553655  9.19535135  8.76413076  8.47828946  8.52586143  8.49469288\n",
+      "  8.72461247  8.28562879  8.11581393  7.99922156  7.91023364  8.04142296\n",
+      "  7.89762773  7.76257636  8.32043745  8.01592886  8.34109665  8.90115454\n",
+      "  8.48246945  7.98658664  8.05745122  8.11384088  8.18864479  8.8091827\n",
+      " 11.8067711  13.25258218 14.44311795 13.90515283 14.00120623 13.99801252\n",
+      " 13.81595394 13.6379904  13.3574897  13.14933334 12.96518543 13.02601156\n",
+      " 12.70246737 12.54410834 12.15615068 11.86574681 11.67497882 10.79645481\n",
+      " 10.48150035 10.03758575 10.05637027  9.92891308 10.06923218 12.43382431\n",
+      " 12.71428321 14.33135052 13.94470959 14.29188291 14.11483993 14.03496606\n",
+      " 13.78167331 13.66701466 14.40308625 14.73934137 15.09569382 14.89565815\n",
+      " 15.10519995 14.94383582 15.03275563 15.42194679 15.29219967 15.41602274\n",
+      " 15.39242545 15.76836177 16.259222   16.47777231 17.03366795 17.46165793\n",
+      " 17.52596217 17.78844031 17.99878075 18.11446843 17.95761578 17.99900337\n",
+      " 17.86282737 17.7290163  17.47686504 17.43425516 17.07750485 16.64395242\n",
+      " 15.68217043 14.90058399 14.45645737 14.0405463  14.89549542 16.00405781\n",
+      " 16.27301689 16.37572895 16.31219037 16.31765447 16.44819716 16.36281089\n",
+      " 16.24932823 15.79302555 14.76361963 13.95761882 13.48917053 13.45543501\n",
+      " 13.00091327 13.13854248 13.74596395 13.86340629 14.00656109 13.77432101\n",
+      " 13.64267001 13.35742634 13.23042234 12.97916104 12.80694468 12.70005006\n",
+      " 13.2802483  13.22644525 13.14579624 13.02536594 13.36511022 11.37167205\n",
+      " 12.11598045 12.47619798 12.83885973 11.63880287 11.42083924 11.08747705\n",
+      " 11.04093403 11.11263149 10.74353319 10.58734669 10.46180738 10.34157335\n",
+      "  9.63131146  9.70582692  9.29059204  8.94583657  8.66065094  8.46799095\n",
+      "  8.25064103  8.30239167  8.19463371  8.12104567  8.02731234  8.06412715\n",
+      "  7.84889951  7.73090283  7.74119562  7.85444657  7.80717312  7.7129933\n",
+      "  7.84087442  7.77907788  7.60660865  7.55051479  7.458385    7.496416\n",
+      "  7.69519793  7.49086759  7.32199493  8.01617458  7.58525375  7.06661122\n",
+      "  6.94653756  7.19874283  7.28515661  7.17574078]\n",
+      "(184,)\n",
+      "(184,)\n",
+      "[1.48370471 1.52174523 1.46984238 1.67010478 1.88757689 1.68825992\n",
+      " 1.74270259 1.55497318 1.29200818 1.68446481 1.88133219 1.97138928\n",
+      " 2.15910096 2.3149476  1.9820247  2.07694378 1.93498835 2.01493974\n",
+      " 2.39156824 2.02396518 1.69586449 1.63808752 1.64020228 1.43573473\n",
+      " 1.93092656 1.37466294 1.34704929 1.59600739 1.03960441 1.45276496\n",
+      " 1.59360131 1.57466343 1.89491479 1.79333746 1.32701974 1.49441767\n",
+      " 1.51466756 1.63497989 1.42858074 1.51135396 1.61077201 1.81066387\n",
+      " 1.83367783 2.3507094  2.87885378 3.26231227 2.1313117  1.98557548\n",
+      " 1.99105426 2.26150533 2.34298751 2.44621608 2.39201042 2.41226503\n",
+      " 2.5142992  3.03777565 2.81592295 2.75117863 2.78324175 2.68819666\n",
+      " 2.8945782  2.84464168 2.680973   2.78397395 2.47996808 1.71829563\n",
+      " 1.60636949 1.65992483 1.38122631 1.74831825 2.16006884 1.68076185\n",
+      " 1.69329487 1.44929837 1.63763312 1.80101076 2.01166253 2.03254244\n",
+      " 1.9583913  2.04542255 2.00859694 2.16600883 2.16095629 1.97541122\n",
+      " 2.13807632 2.06386436 2.2154187  2.84205688 2.54862449 2.64321545\n",
+      " 2.6805773  2.52300146 2.53209001 2.54682059 2.4521937  2.43155532\n",
+      " 2.42571275 2.23421289 2.23164529 2.23597192 2.14215121 2.10406703\n",
+      " 2.07962874 1.88506161 1.80092372 1.61156092 1.77426835 1.98765563\n",
+      " 2.0356793  1.87964187 1.779513   1.87187681 1.76463632 1.70978684\n",
+      " 1.76471778 1.75604749 1.62792552 1.73929352 1.6887024  1.8677704\n",
+      " 2.17342368 2.08166072 2.14567453 2.15936953 2.18351006 2.41010388\n",
+      " 2.26101752 2.25468001 2.23739715 2.15395133 2.04547813 1.92038843\n",
+      " 1.85491264 1.91905927 2.16709365 1.99924152 2.1850471  2.55461622\n",
+      " 2.72476673 1.69682926 1.73249614 2.06992695 2.1210591  1.66854454\n",
+      " 1.63907505 1.32203822 1.38992558 1.2436937  1.17932877 1.02963653\n",
+      " 1.26085036 1.16997132 1.09339504 1.14188689 1.18675772 1.31859788\n",
+      " 1.21746591 1.3872131  1.26095274 1.34885761 1.46633543 1.64506975\n",
+      " 1.36013821 1.45574721 1.43766588 1.65119054 1.57163772 1.55082968\n",
+      " 1.29413316 1.38351736 1.64234673 1.57186432 1.45381083 1.71204761\n",
+      " 1.51828607 1.30639985 1.32928395 1.49004237 1.6057589  1.81815735\n",
+      " 1.67784678 1.72180861 1.60703743 1.64850255]\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = np.hstack([feat, feat])\n",
+    "print(a.shape)\n",
+    "m = np.mean(a, axis=1)\n",
+    "print(m)\n",
+    "print(m.shape)\n",
+    "std = np.std(a, axis=1)\n",
+    "print(std.shape)\n",
+    "print(std)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "nonprofit-potato",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "hispanic-ethics",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import torchaudio\n",
+    "import torchaudio.compliance.kaldi as kaldi\n",
+    "import torchaudio.sox_effects as sox_effects\n",
+    "from torch.nn.utils.rnn import pad_sequence\n",
+    "torchaudio.set_audio_backend(\"sox\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "id": "changing-calvin",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "torch.Size([1, 29746])\n",
+      "tensor([[54., 90., 77.,  ..., 58., 58., 61.]])\n",
+      "(184, 80)\n",
+      "[[10.617376  10.077089   5.3248763 ... 10.248186   8.896992   7.8067265]\n",
+      " [11.044004  10.318072   6.3086634 ... 11.237308  10.358393   8.838616 ]\n",
+      " [10.269302   9.9963665  7.3296647 ... 10.451319   9.692951   7.9617033]\n",
+      " ...\n",
+      " [10.14497    9.886743   6.738012  ... 10.215809   9.0034275  8.756177 ]\n",
+      " [ 9.977456   9.679498   7.9066052 ... 10.224365   9.594568   7.6928873]\n",
+      " [ 6.4735703  7.7633557  7.7576594 ...  9.965221   9.622637   8.160085 ]]\n",
+      "-----------\n",
+      "[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
+      "(184, 80)\n",
+      "[[-10.177039 -10.717326 -15.46954  ... -10.546229 -11.897424 -12.987689]\n",
+      " [ -9.750411 -10.476343 -14.485752 ...  -9.557108 -10.436023 -11.955799]\n",
+      " [-10.525113 -10.798049 -13.46475  ... -10.343097 -11.101464 -12.832712]\n",
+      " ...\n",
+      " [-10.649446 -10.907673 -14.056403 ... -10.578607 -11.790988 -12.038239]\n",
+      " [-10.816959 -11.114918 -12.88781  ... -10.570049 -11.199847 -13.101528]\n",
+      " [-14.320845 -13.03106  -13.036756 ... -10.829194 -11.171779 -12.634331]]\n",
+      "**************\n",
+      "[0.00164795 0.00274658 0.00234985 ... 0.00177002 0.00177002 0.00186157]\n",
+      "[54. 90. 77. ... 58. 58. 61.] float32\n",
+      "(184, 80)\n",
+      "[[10.617376  10.077089   5.3248763 ... 10.248186   8.896992   7.8067265]\n",
+      " [11.044004  10.318072   6.3086634 ... 11.237308  10.358393   8.838616 ]\n",
+      " [10.269302   9.9963665  7.3296647 ... 10.451319   9.692951   7.9617033]\n",
+      " ...\n",
+      " [10.14497    9.886743   6.738012  ... 10.215809   9.0034275  8.756177 ]\n",
+      " [ 9.977456   9.679498   7.9066052 ... 10.224365   9.594568   7.6928873]\n",
+      " [ 6.4735703  7.7633557  7.7576594 ...  9.965221   9.622637   8.160085 ]]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: UserWarning: torchaudio.backend.sox_backend.load_wav has been deprecated and will be removed from 0.9.0 release. Please use \"torchaudio.load\".\n",
+      "  \"\"\"Entry point for launching an IPython kernel.\n"
+     ]
+    }
+   ],
+   "source": [
+    "waveform, sample_rate = torchaudio.load_wav(wav)\n",
+    "print(waveform.shape)\n",
+    "print(waveform)\n",
+    "mat = kaldi.fbank(\n",
+    "                waveform,\n",
+    "                num_mel_bins=80,\n",
+    "                frame_length=25,\n",
+    "                frame_shift=10,\n",
+    "                dither=0,\n",
+    "                energy_floor=0.0,\n",
+    "                sample_frequency=sample_rate\n",
+    "            )\n",
+    "mat = mat.detach().numpy()\n",
+    "print(mat.shape)\n",
+    "print(mat)\n",
+    "\n",
+    "print('-----------')\n",
+    "print(samples._samples)\n",
+    "aud = torch.tensor(samples._samples).view(1, -1)\n",
+    "mat = kaldi.fbank(\n",
+    "                aud,\n",
+    "                num_mel_bins=80,\n",
+    "                frame_length=25,\n",
+    "                frame_shift=10,\n",
+    "                dither=0,\n",
+    "                energy_floor=0.0,\n",
+    "                sample_frequency=sample_rate\n",
+    "            )\n",
+    "mat = mat.detach().numpy()\n",
+    "print(mat.shape)\n",
+    "print(mat)\n",
+    "\n",
+    "print('**************')\n",
+    "print(samples._samples)\n",
+    "tmp = samples.to('int16').astype('float32')\n",
+    "print(tmp, tmp.dtype)\n",
+    "aud = torch.tensor(tmp).view(1, -1)\n",
+    "mat = kaldi.fbank(\n",
+    "                aud,\n",
+    "                num_mel_bins=80,\n",
+    "                frame_length=25,\n",
+    "                frame_shift=10,\n",
+    "                dither=0,\n",
+    "                energy_floor=0.0,\n",
+    "                sample_frequency=sample_rate\n",
+    "            )\n",
+    "mat = mat.detach().numpy()\n",
+    "print(mat.shape)\n",
+    "print(mat)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "buried-dependence",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "silver-printing",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "outer-space",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(29746,)\n",
+      "[54 90 77 ... 58 58 61]\n",
+      "(184, 80)\n",
+      "[[10.61737914 10.07708936  5.32487528 ... 10.2481839   8.89699394\n",
+      "   7.80671114]\n",
+      " [11.0440077  10.3180721   6.30866128 ... 11.23730926 10.35838868\n",
+      "   8.83860079]\n",
+      " [10.26930555  9.99636567  7.3296638  ... 10.45131595  9.69295303\n",
+      "   7.96168491]\n",
+      " ...\n",
+      " [10.14497345  9.88674207  6.73801138 ... 10.21580627  9.00343472\n",
+      "   8.75616521]\n",
+      " [ 9.97745961  9.67949736  7.90660425 ... 10.22436653  9.59456493\n",
+      "   7.69287184]\n",
+      " [ 6.47357374  7.76335491  7.75765843 ...  9.96522077  9.6226365\n",
+      "   8.16007108]]\n",
+      "(184, 13)\n",
+      "[[ 14.73775998 -13.30393391   5.85974818 ...  -3.42359739   2.82785335\n",
+      "    8.86862748]\n",
+      " [ 15.31274834 -13.33671651   4.06537223 ...   8.15970347   2.15934846\n",
+      "    6.78353115]\n",
+      " [ 13.82218765 -13.39296404   6.8304843  ...   2.55332563   8.86724453\n",
+      "   -0.05919222]\n",
+      " ...\n",
+      " [ 13.5837844  -13.42104892  11.21222354 ...   4.81477718   1.66627505\n",
+      "    5.59045842]\n",
+      " [ 13.75757034 -13.92626662  13.06074011 ...  -0.46694046   5.56214833\n",
+      "   12.0785146 ]\n",
+      " [ 11.92813809 -15.9169855    8.78372271 ...  -1.42014277  -3.25768086\n",
+      "    0.88337965]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "from python_speech_features import mfcc\n",
+    "from python_speech_features import delta\n",
+    "from python_speech_features import logfbank\n",
+    "import scipy.io.wavfile as iowav\n",
+    "\n",
+    "(rate,sig) = iowav.read(wav)\n",
+    "print(sig.shape)\n",
+    "print(sig)\n",
+    "\n",
+    "# note that generally nfilt=40 is used for speech recognition\n",
+    "fbank_feat = logfbank(sig,nfilt=80,lowfreq=20,dither=0,wintype='povey')\n",
+    "print(fbank_feat.shape)\n",
+    "print(fbank_feat)\n",
+    "\n",
+    "# the computed fbank coefficients of english.wav with dimension [110,23]\n",
+    "# [ 12.2865\t12.6906\t13.1765\t15.714\t16.064\t15.7553\t16.5746\t16.9205\t16.6472\t16.1302\t16.4576\t16.7326\t16.8864\t17.7215\t18.88\t19.1377\t19.1495\t18.6683\t18.3886\t20.3506\t20.2772\t18.8248\t18.1899\n",
+    "# 11.9198\t13.146\t14.7215\t15.8642\t17.4288\t16.394\t16.8238\t16.1095\t16.4297\t16.6331\t16.3163\t16.5093\t17.4981\t18.3429\t19.6555\t19.6263\t19.8435\t19.0534\t19.001\t20.0287\t19.7707\t19.5852\t19.1112\n",
+    "# ...\n",
+    "# ...\n",
+    "# the same as the output of the kaldi command: compute-fbank-feats --dither=0.0\n",
+    "\n",
+    "mfcc_feat = mfcc(sig,dither=0,useEnergy=True,wintype='povey')\n",
+    "print(mfcc_feat.shape)\n",
+    "print(mfcc_feat)\n",
+    "\n",
+    "# the computed mfcc coefficients of english.wav with dimension [110,13]\n",
+    "# [ 17.1337\t-23.3651\t-7.41751\t-7.73686\t-21.3682\t-8.93884\t-3.70843\t4.68346\t-16.0676\t12.782\t-7.24054\t8.25089\t10.7292\n",
+    "# 17.1692\t-23.3028\t-5.61872\t-4.0075\t-23.287\t-20.6101\t-5.51584\t-6.15273\t-14.4333\t8.13052\t-0.0345329\t2.06274\t-0.564298\n",
+    "# ...\n",
+    "# ...\n",
+    "# the same as the output of the kaldi command: compute-mfcc-feats --dither=0.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "sporting-school",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(184, 80)\n",
+      "[[-10.17703627 -10.71732606 -15.46954014 ... -10.54623152 -11.89742148\n",
+      "  -12.98770428]\n",
+      " [ -9.75040771 -10.47634331 -14.48575413 ...  -9.55710616 -10.43602673\n",
+      "  -11.95581463]\n",
+      " [-10.52510987 -10.79804975 -13.46475161 ... -10.34309947 -11.10146239\n",
+      "  -12.83273051]\n",
+      " ...\n",
+      " [-10.64944197 -10.90767335 -14.05640404 ... -10.57860915 -11.7909807\n",
+      "  -12.03825021]\n",
+      " [-10.8169558  -11.11491806 -12.88781116 ... -10.57004889 -11.19985048\n",
+      "  -13.10154358]\n",
+      " [-14.32084168 -13.03106051 -13.03675699 ... -10.82919465 -11.17177892\n",
+      "  -12.63434434]]\n",
+      "(184, 13)\n",
+      "[[ -6.05665544 -13.30393391   5.85974818 ...  -3.42359739   2.82785335\n",
+      "    8.86862748]\n",
+      " [ -5.48166707 -13.33671651   4.06537223 ...   8.15970347   2.15934846\n",
+      "    6.78353115]\n",
+      " [ -6.97222776 -13.39296404   6.8304843  ...   2.55332563   8.86724453\n",
+      "   -0.05919222]\n",
+      " ...\n",
+      " [ -7.21063102 -13.42104892  11.21222354 ...   4.81477718   1.66627505\n",
+      "    5.59045842]\n",
+      " [ -7.03684508 -13.92626662  13.06074011 ...  -0.46694046   5.56214833\n",
+      "   12.0785146 ]\n",
+      " [ -8.86627732 -15.9169855    8.78372271 ...  -1.42014277  -3.25768086\n",
+      "    0.88337965]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "fbank_feat = logfbank(samples._samples,nfilt=80,lowfreq=20,dither=0,wintype='povey')\n",
+    "print(fbank_feat.shape)\n",
+    "print(fbank_feat)\n",
+    "\n",
+    "mfcc_feat = mfcc(samples._samples,dither=0,useEnergy=True,wintype='povey')\n",
+    "print(mfcc_feat.shape)\n",
+    "print(mfcc_feat)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "restricted-license",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "specialized-threat",
+   "metadata": {},
+   "outputs": [],
+   "source": []
  }
 ],
 "metadata": {

--- a/.notebook/python_test.ipynb
+++ b/.notebook/python_test.ipynb
@@ -637,7 +637,7 @@
  {
   "cell_type": "code",
   "execution_count": 59,
-   "id": "engaged-offense",
+   "id": "first-release",
   "metadata": {},
   "outputs": [
    {
@@ -660,7 +660,7 @@
  {
   "cell_type": "code",
   "execution_count": 35,
-   "id": "level-fairy",
+   "id": "convertible-roulette",
   "metadata": {},
   "outputs": [
    {
@@ -705,7 +705,7 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "id": "beautiful-geometry",
+   "id": "cutting-fleece",
   "metadata": {},
   "outputs": [
    {
@@ -728,7 +728,7 @@
  {
   "cell_type": "code",
   "execution_count": 4,
-   "id": "african-trustee",
+   "id": "historical-diving",
   "metadata": {},
   "outputs": [
    {
@@ -748,7 +748,7 @@
  {
   "cell_type": "code",
   "execution_count": 5,
-   "id": "ready-wages",
+   "id": "similar-spice",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -758,7 +758,7 @@
  {
   "cell_type": "code",
   "execution_count": 6,
-   "id": "distinguished-printer",
+   "id": "grand-influence",
   "metadata": {},
   "outputs": [
    {
@@ -776,7 +776,7 @@
  {
   "cell_type": "code",
   "execution_count": 7,
-   "id": "precious-limit",
+   "id": "wireless-hypothetical",
   "metadata": {},
   "outputs": [
    {
@@ -809,7 +809,7 @@
  {
   "cell_type": "code",
   "execution_count": 17,
-   "id": "chemical-convenience",
+   "id": "designed-fluid",
   "metadata": {},
   "outputs": [
    {
@@ -839,7 +839,7 @@
  {
   "cell_type": "code",
   "execution_count": 18,
-   "id": "round-remark",
+   "id": "cultural-friendship",
   "metadata": {},
   "outputs": [
    {
@@ -871,7 +871,7 @@
  {
   "cell_type": "code",
   "execution_count": 19,
-   "id": "smaller-shower",
+   "id": "fossil-lotus",
   "metadata": {},
   "outputs": [
    {
@@ -903,7 +903,7 @@
  {
   "cell_type": "code",
   "execution_count": 31,
-   "id": "integrated-block",
+   "id": "constitutional-poker",
   "metadata": {},
   "outputs": [
    {
@@ -935,7 +935,7 @@
  {
   "cell_type": "code",
   "execution_count": 32,
-   "id": "favorite-failure",
+   "id": "threaded-strap",
   "metadata": {},
   "outputs": [
    {
@@ -966,7 +966,7 @@
  {
   "cell_type": "code",
   "execution_count": 20,
-   "id": "boolean-saint",
+   "id": "infectious-welcome",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -977,7 +977,7 @@
  {
   "cell_type": "code",
   "execution_count": 46,
-   "id": "senior-hospital",
+   "id": "musical-anatomy",
   "metadata": {},
   "outputs": [
    {
@@ -997,7 +997,7 @@
  {
   "cell_type": "code",
   "execution_count": 30,
-   "id": "consolidated-incident",
+   "id": "lucky-paraguay",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1007,7 +1007,7 @@
  {
   "cell_type": "code",
   "execution_count": 31,
-   "id": "pursuant-paragraph",
+   "id": "annual-christmas",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1017,7 +1017,7 @@
  {
   "cell_type": "code",
   "execution_count": 47,
-   "id": "mexican-apollo",
+   "id": "infectious-seeker",
   "metadata": {},
   "outputs": [
    {
@@ -1038,7 +1038,7 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "id": "encouraging-integration",
+   "id": "pregnant-conditioning",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1049,7 +1049,7 @@
  {
   "cell_type": "code",
   "execution_count": 56,
-   "id": "trying-auckland",
+   "id": "logical-happiness",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1059,7 +1059,7 @@
  {
   "cell_type": "code",
   "execution_count": 58,
-   "id": "national-edward",
+   "id": "rocky-plastic",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1069,7 +1069,7 @@
  {
   "cell_type": "code",
   "execution_count": 60,
-   "id": "aerial-campaign",
+   "id": "focused-compensation",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1079,7 +1079,7 @@
  {
   "cell_type": "code",
   "execution_count": 66,
-   "id": "instant-violence",
+   "id": "centered-repository",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1089,7 +1089,7 @@
  {
   "cell_type": "code",
   "execution_count": 95,
-   "id": "medical-globe",
+   "id": "inner-invite",
   "metadata": {},
   "outputs": [
    {
@@ -1110,7 +1110,7 @@
  {
   "cell_type": "code",
   "execution_count": 81,
-   "id": "three-contrast",
+   "id": "russian-chosen",
   "metadata": {},
   "outputs": [
    {
@@ -1131,7 +1131,7 @@
  {
   "cell_type": "code",
   "execution_count": 11,
-   "id": "cross-atlas",
+   "id": "equal-particle",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1161,7 +1161,7 @@
  {
   "cell_type": "code",
   "execution_count": 12,
-   "id": "empirical-defense",
+   "id": "tracked-purse",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1172,7 +1172,7 @@
  {
   "cell_type": "code",
   "execution_count": 14,
-   "id": "rocky-listening",
+   "id": "steady-mileage",
   "metadata": {},
   "outputs": [
    {
@@ -1201,7 +1201,7 @@
  {
   "cell_type": "code",
   "execution_count": 13,
-   "id": "surrounded-absolute",
+   "id": "regulated-google",
   "metadata": {},
   "outputs": [
    {
@@ -1230,7 +1230,7 @@
  {
   "cell_type": "code",
   "execution_count": 15,
-   "id": "differential-surgery",
+   "id": "homeless-forge",
   "metadata": {},
   "outputs": [
    {
@@ -1260,7 +1260,7 @@
  {
   "cell_type": "code",
   "execution_count": 29,
-   "id": "durable-powell",
+   "id": "exciting-blocking",
   "metadata": {},
   "outputs": [
    {
@@ -1290,7 +1290,7 @@
  {
   "cell_type": "code",
   "execution_count": 30,
-   "id": "young-continuity",
+   "id": "through-botswana",
   "metadata": {},
   "outputs": [
    {
@@ -1308,7 +1308,7 @@
  {
   "cell_type": "code",
   "execution_count": 22,
-   "id": "geological-sarah",
+   "id": "cellular-violence",
   "metadata": {},
   "outputs": [
    {
@@ -1343,7 +1343,7 @@
  {
   "cell_type": "code",
   "execution_count": 23,
-   "id": "possible-angle",
+   "id": "undefined-parade",
   "metadata": {},
   "outputs": [
    {
@@ -1376,7 +1376,7 @@
  {
   "cell_type": "code",
   "execution_count": 33,
-   "id": "novel-sucking",
+   "id": "special-delicious",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1386,7 +1386,7 @@
  {
   "cell_type": "code",
   "execution_count": 34,
-   "id": "fixed-wallet",
+   "id": "seasonal-consensus",
   "metadata": {},
   "outputs": [
    {
@@ -1428,7 +1428,7 @@
  {
   "cell_type": "code",
   "execution_count": 35,
-   "id": "north-seattle",
+   "id": "dress-distinction",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1438,7 +1438,7 @@
  {
   "cell_type": "code",
   "execution_count": 38,
-   "id": "above-western",
+   "id": "rental-anthony",
   "metadata": {},
   "outputs": [
    {
@@ -1471,7 +1471,7 @@
  {
   "cell_type": "code",
   "execution_count": 41,
-   "id": "choice-diabetes",
+   "id": "separated-restriction",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1481,7 +1481,7 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "id": "white-vessel",
+   "id": "painted-variable",
   "metadata": {},
   "outputs": [
    {
@@ -1504,7 +1504,7 @@
  {
   "cell_type": "code",
   "execution_count": 5,
-   "id": "treated-freedom",
+   "id": "satellite-insider",
   "metadata": {},
   "outputs": [
    {
@@ -1523,7 +1523,7 @@
  {
   "cell_type": "code",
   "execution_count": 7,
-   "id": "convinced-safety",
+   "id": "developed-thirty",
   "metadata": {},
   "outputs": [
    {
@@ -1543,7 +1543,7 @@
  {
   "cell_type": "code",
   "execution_count": 8,
-   "id": "blond-bunny",
+   "id": "official-bench",
   "metadata": {},
   "outputs": [
    {
@@ -1560,10 +1560,97 @@
    "print(sorted_val_scores)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ranking-camera",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "b'\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x14\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x02\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x1e\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
+      "[ 1 20  2 30]\n",
+      "[[ 1 20]\n",
+      " [ 2 30]]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: tostring() is deprecated. Use tobytes() instead.\n",
+      "  \"\"\"Entry point for launching an IPython kernel.\n",
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:3: DeprecationWarning: The binary mode of fromstring is deprecated, as it behaves surprisingly on unicode inputs. Use frombuffer instead\n",
+      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = scores.tostring()\n",
+    "print(a)\n",
+    "b = np.fromstring(a, scores.dtype)\n",
+    "print(b)\n",
+    "print(scores)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "breeding-proxy",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "numpy.int16"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.int16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "coordinate-hungary",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dtype = np.dtype('int16')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "specified-jackson",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "int16\n",
+      "16\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(dtype)\n",
+    "dtype is np.int16\n",
+    "print(np.iinfo(dtype).bits)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "utility-monroe",
+   "id": "activated-insight",
   "metadata": {},
   "outputs": [],
   "source": []
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,6 +3,7 @@
    hooks:
    -   id: yapf
        files: \.py$
+        exclude: (?=third_party).*(\.py)$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    sha: a11d9314b22d8f8c7556443875b731ef05965464
    hooks:
@@ -15,6 +16,7 @@
    -   id: trailing-whitespace
        files: \.md$
    -   id: requirements-txt-fixer
+        exclude: (?=third_party).*$
    -   id: check-yaml
    -   id: check-json
    -   id: pretty-format-json
@@ -27,6 +29,7 @@
        -  --ignore=E501,E228,E226,E261,E266,E128,E402,W503
        -  --builtins=G,request
        -  --jobs=1
+        exclude: (?=third_party).*(\.py)$
 -   repo : https://github.com/Lucas-C/pre-commit-hooks
    sha: v1.0.1
    hooks:
@@ -51,8 +54,9 @@
        entry: python .pre-commit-hooks/copyright-check.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
-        #exclude: (?=decoders/swig).*(\.cpp|\.h)$
+        exclude: (?=third_party).*(\.cpp|\.h|\.py)$
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
    hooks:
      - id: reorder-python-imports
+        exclude: (?=third_party).*(\.py)$
--- a/deepspeech/frontend/audio.py
+++ b/deepspeech/frontend/audio.py
@@ -298,6 +298,18 @@ class AudioSegment(object):
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples.tostring()

+    def to(self, dtype='int16'):
+        """Create a `dtype` audio content.
+        
+        :param dtype: Data type for export samples. Options: 'int16', 'int32',
+                      'float32', 'float64'. Default is 'int16'.
+        :type dtype: str
+        :return: np.ndarray containing `dtype` audio content.
+        :rtype: np.ndarray
+        """
+        samples = self._convert_samples_from_float32(self._samples, dtype)
+        return samples
+
    def gain_db(self, gain):
        """Apply gain in decibels to samples.


--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@@ -64,6 +64,7 @@ class SpecAugmentor(AugmentorBase):
        self.n_freq_masks = n_freq_masks
        self.n_time_masks = n_time_masks
        self.p = p
+        #logger.info(f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}")

        # adaptive SpecAugment
        self.adaptive_number_ratio = adaptive_number_ratio

--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -56,7 +56,8 @@ class AudioFeaturizer(object):
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
-                 target_dB=-20):
+                 target_dB=-20,
+                 dither=1.0):
        self._specgram_type = specgram_type
        # mfcc and fbank using `feat_dim`
        self._feat_dim = feat_dim
@@ -69,6 +70,7 @@ class AudioFeaturizer(object):
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB
        self._fft_point = n_fft
+        self._dither = dither

    def featurize(self,
                  audio_segment,
@@ -101,8 +103,7 @@ class AudioFeaturizer(object):
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
-        return self._compute_specgram(audio_segment.samples,
-                                      audio_segment.sample_rate)
+        return self._compute_specgram(audio_segment)

    @property
    def feature_size(self):
@@ -125,9 +126,11 @@ class AudioFeaturizer(object):
                             "Supported values: linear." % self._specgram_type)
        return feat_dim

-    def _compute_specgram(self, samples, sample_rate):
+    def _compute_specgram(self, audio_segment):
        """Extract various audio features."""
+        sample_rate = audio_segment.sample_rate
        if self._specgram_type == 'linear':
+            samples = audio_segment.samples
            return self._compute_linear_specgram(
                samples,
                sample_rate,
@@ -135,6 +138,7 @@ class AudioFeaturizer(object):
                window_ms=self._window_ms,
                max_freq=self._max_freq)
        elif self._specgram_type == 'mfcc':
+            samples = audio_segment.to('int16')
            return self._compute_mfcc(
                samples,
                sample_rate,
@@ -142,8 +146,10 @@ class AudioFeaturizer(object):
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
+                dither=self._dither,
                delta_delta=self._delta_delta)
        elif self._specgram_type == 'fbank':
+            samples = audio_segment.to('int16')
            return self._compute_fbank(
                samples,
                sample_rate,
@@ -151,6 +157,7 @@ class AudioFeaturizer(object):
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
+                dither=self._dither,
                delta_delta=self._delta_delta)
        else:
            raise ValueError("Unknown specgram_type %s. "
@@ -233,17 +240,18 @@ class AudioFeaturizer(object):
                      sample_rate,
                      feat_dim=13,
                      stride_ms=10.0,
-                      window_ms=20.0,
+                      window_ms=25.0,
                      max_freq=None,
+                      dither=1.0,
                      delta_delta=True):
        """Compute mfcc from samples.

        Args:
-            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
+            samples (np.ndarray, np.int16): the audio signal from which to compute features.
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int): the number of cepstrum to return, default 13.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
-            window_ms (float, optional): window length in ms. Defaults to 20.0.
+            window_ms (float, optional): window length in ms. Defaults to 25.0.
            max_freq ([type], optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
            delta_delta (bool, optional): Whether with delta delta. Defaults to False.

@@ -270,14 +278,16 @@ class AudioFeaturizer(object):
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            numcep=feat_dim,
-            nfilt=2 * feat_dim,
-            nfft=None,
-            lowfreq=0,
+            nfilt=23,
+            nfft=512,
+            lowfreq=20,
            highfreq=max_freq,
+            dither=dither,
+            remove_dc_offset=True,
            preemph=0.97,
            ceplifter=22,
-            appendEnergy=True,
-            winfunc=lambda x: np.ones((x, )))
+            useEnergy=True,
+            winfunc='povey')
        mfcc_feat = np.transpose(mfcc_feat)
        if delta_delta:
            mfcc_feat = self._concat_delta_delta(mfcc_feat)
@@ -286,15 +296,16 @@ class AudioFeaturizer(object):
    def _compute_fbank(self,
                       samples,
                       sample_rate,
-                       feat_dim=26,
+                       feat_dim=40,
                       stride_ms=10.0,
-                       window_ms=20.0,
+                       window_ms=25.0,
                       max_freq=None,
+                       dither=1.0,
                       delta_delta=False):
        """Compute logfbank from samples.
        
        Args:
-            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
+            samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int): the number of cepstrum to return, default 13.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
@@ -325,9 +336,13 @@ class AudioFeaturizer(object):
            winstep=0.001 * stride_ms,
            nfilt=feat_dim,
            nfft=512,
-            lowfreq=0,
+            lowfreq=20,
            highfreq=max_freq,
-            preemph=0.97, )
+            dither=dither,
+            remove_dc_offset=True,
+            preemph=0.97,
+            wintype='povey')
+
        fbank_feat = np.transpose(fbank_feat)
        if delta_delta:
            fbank_feat = self._concat_delta_delta(fbank_feat)

--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -82,13 +82,16 @@ class FeatureNormalizer(object):
    def _read_mean_std_from_file(self, filepath, eps=1e-20):
        """Load mean and std from file."""
        mean, std = load_cmvn(filepath, filetype='npz')
-        self._mean = mean
-        self._istd = 1.0 / std
+        self._mean = mean.T
+        self._istd = 1.0 / std.T

    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
        """Compute mean and std from randomly sampled instances."""
        manifest = read_manifest(manifest_path)
-        sampled_manifest = self._rng.sample(manifest, num_samples)
+        if num_samples == -1:
+            sampled_manifest = manifest
+        else:
+            sampled_manifest = self._rng.sample(manifest, num_samples)
        features = []
        for instance in sampled_manifest:
            features.append(

--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -36,10 +36,12 @@ fi
 # compute mean and stddev for normalizer
 python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
 --manifest_path="data/manifest.train.raw" \
--num_samples=2000 \
 --specgram_type="fbank" \
 --feat_dim=80 \
 --delta_delta=false \
+--stride_ms=10.0 \
+--window_ms=25.0 \
+--sample_rate=16000 \
 --output_path="data/mean_std.npz"

 if [ $? -ne 0 ]; then

--- a/examples/tiny/s1/conf/augmentation.json
+++ b/examples/tiny/s1/conf/augmentation.json
 [
+  {
+    "type": "speed",
+    "params": {
+      "min_speed_rate": 0.9,
+      "max_speed_rate": 1.1,
+      "num_rates": 3
+    },
+    "prob": 0.0
+  },
  {
    "type": "shift",
    "params": {
@@ -6,5 +15,20 @@
      "max_shift_ms": 5
    },
    "prob": 1.0
+  },
+  {
+    "type": "specaug",
+    "params": {
+      "F": 10,
+      "T": 50,
+      "n_freq_masks": 2,
+      "n_time_masks": 2,
+      "p": 1.0,
+      "W": 80,
+      "adaptive_number_ratio": 0,
+      "adaptive_size_ratio": 0,
+      "max_n_time_masks": 20
+    },
+    "prob": 1.0
  }
 ]
--- a/setup.sh
+++ b/setup.sh
@@ -54,4 +54,14 @@ if [ $? != 0 ]; then
   exit -1
 fi

+
+# install kaldi-compatible feature
+pushd third_party/python_kaldi_features/
+python setup.py install
+if [ $? != 0 ]; then
+   error_msg "Please check why kaldi feature install error!"
+   exit -1
+fi
+popd
+
 info_msg "Install all dependencies successfully."
--- a/third_party/README.md
+++ b/third_party/README.md
+
+* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
+commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
+ref: https://zhuanlan.zhihu.com/p/55371926
--- a/third_party/python_kaldi_features/LICENSE
+++ b/third_party/python_kaldi_features/LICENSE
+The MIT License (MIT)
+
+Copyright (c) 2013 James Lyons
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/third_party/python_kaldi_features/MANIFEST
+++ b/third_party/python_kaldi_features/MANIFEST
+# file GENERATED by distutils, do NOT edit
+setup.py
+python_speech_features\__init__.py
+python_speech_features\base.py
+python_speech_features\sigproc.py
--- a/third_party/python_kaldi_features/README.rst
+++ b/third_party/python_kaldi_features/README.rst
+
+
+forked from `<https://github.com/jameslyons/python_speech_features>`_
+
+See the README there for usage.
+
+It has been modified to produce the same results as the compute-mfcc-feats and compute-fbank-feats commands in Kaldi (check their default parameters first).
+ 
+-------------------------------
+
+The compute-mfcc-feats pipeline:
+
+src/featbin/Compute-mfcc-feats.cc
+    
+    Mfcc mfcc(mfcc_opts)  --> src/feat/Feature-mfcc.h
+    
+                                 struct MfccOptions
+                                 
+                                 typedef OfflineFeatureTpl<MfccComputer> Mfcc --> src/feat/Feature-common.h
+           
+                                 MfccComputer()  --> src/feat/Feature-mfcc.cc
+                                 
+                                                         ComputeDctMatrix()  --> src/matrix/Matrix-functions.cc
+                                                         
+                                                         ComputeLifterCoeffs()  --> src/feat/Mel-computations.cc
+  
+    
+    for each utterance:
+    mfcc.ComputeFeatures()
+
+src/feat/Feature-common-inl.h
+
+    OfflineFeatureTpl<F>::ComputeFeatures()
+    
+        Compute()
+        
+            ExtractWindow()  --> src/feat/Feature-window.cc
+                                     
+                                     ProcessWindow()
+                                         
+                                         Dither, remove_dc_offset, log_energy_pre_window, Preemphasize, window
+            
+            computer_.Compute() --> src/feat/Feature-mfcc.cc
+               
+                                      MfccComputer::Compute()
+                                      
+                                          const MelBanks &mel_banks --> Mel-computations.cc
+                                          
+                                          srfft_
+                                        
+                                          ComputePowerSpectrum()
+                                          
+                                          mel_banks.Compute()
+                                          
+                                          mel_energies_.ApplyLog()
+                                          
+                                          dct, cepstral_lifter
+                                          
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py
+from .base import *
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/base.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
+         ceplifter=22,useEnergy=True,wintype='povey'):
+    """Compute MFCC features from an audio signal.
+
+    Pipeline: Mel-filterbank energies from `fbank` -> natural log -> orthonormal
+    DCT-II along the filter axis -> keep the first `numcep` coefficients -> lifter.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstrum to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 23.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: forwarded unchanged through `fbank` to `sigproc.framesig`. Default is 1.0.
+        NOTE(review): presumably the dithering strength, as in Kaldi's `dither` option -- confirm in sigproc.
+    :param remove_dc_offset: forwarded unchanged through `fbank` to `sigproc.framesig`. Default is True.
+        NOTE(review): presumably subtracts the per-frame mean -- confirm in sigproc.
+    :param preemph: apply preemphasis filter with preemph as coefficient. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the
+        per-frame energy returned by `fbank`. Default is True.
+    :param wintype: name of the analysis window, forwarded through `fbank` to `sigproc.framesig`.
+        Default is 'povey'.
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    # all framing options are passed positionally, so their order must match fbank's signature
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
+    feat = numpy.log(feat)
+    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
+    feat = lifter(feat,ceplifter)
+    if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
+    return feat
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, 
+          wintype='hamming'):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    Note that the defaults here (lowfreq=0, wintype='hamming') differ from the
+    defaults of `mfcc` and `logfbank` in this module.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+        (also used when a falsy value such as 0 is passed).
+    :param dither: forwarded unchanged to `sigproc.framesig`. Default is 1.0.
+        NOTE(review): presumably the dithering strength -- confirm in sigproc.
+    :param remove_dc_offset: forwarded unchanged to `sigproc.framesig`. Default is True.
+    :param preemph: preemphasis coefficient, forwarded to `sigproc.framesig`. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded to `sigproc.framesig`. Default is 'hamming'.
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
+        second return value is the energy in each frame, computed as the sum of squares of the `raw_frames`
+        returned by `sigproc.framesig` (presumably the frames before windowing -- confirm in sigproc).
+        Exact zeros in either output are replaced by float eps so that a later log is finite.
+    """
+    highfreq= highfreq or samplerate/2
+    frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
+    pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
+    energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
+    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
+    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
+
+    return feat,energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
+    """Compute log Mel-filterbank energy features from an audio signal.
+
+    Thin wrapper: calls `fbank` with the same arguments and returns the natural
+    log of its filterbank energies; the per-frame energy is discarded.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: forwarded unchanged to `fbank`. Default is 1.0.
+    :param remove_dc_offset: forwarded unchanged to `fbank`. Default is True.
+    :param preemph: preemphasis coefficient, forwarded to `fbank`. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded to `fbank`. Default is 'hamming'.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
+    return numpy.log(feat)
+
+def hz2mel(hz):
+    """Convert a value in Hertz to Mels, using mel = 1127 * ln(1 + hz / 700).
+
+    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
+    """
+    return 1127 * numpy.log(1+hz/700.0)
+
+
+def mel2hz(mel):
+    """Convert a value in Mels to Hertz, using hz = 700 * (exp(mel / 1127) - 1).
+
+    This is the inverse of `hz2mel`.
+
+    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
+    """
+    return 700 * (numpy.exp(mel/1127.0)-1)
+
+def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
+    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
+
+    Each filter is a triangle in the mel domain: the weight of an FFT bin is computed
+    from the bin's mel value relative to the filter's left/center/right mel edges.
+
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+        (also used when a falsy value such as 0 is passed)
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    :raises AssertionError: if highfreq is greater than samplerate/2.
+    """
+    highfreq= highfreq or samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # compute points evenly spaced in mels
+    lowmel = hz2mel(lowfreq)
+    highmel = hz2mel(highfreq)
+
+    # check kaldi/src/feat/Mel-computations.h    
+    fbank = numpy.zeros([nfilt,nfft//2+1])
+    # nfilt triangles need nfilt+2 equally spaced mel edges, i.e. nfilt+1 intervals
+    mel_freq_delta = (highmel-lowmel)/(nfilt+1)
+    for j in range(0,nfilt):
+        leftmel = lowmel+j*mel_freq_delta
+        centermel = lowmel+(j+1)*mel_freq_delta
+        rightmel = lowmel+(j+2)*mel_freq_delta
+        # only bins 0 .. nfft//2 - 1 are visited, so the last column (bin nfft//2) stays zero
+        for i in range(0,nfft//2):
+            mel=hz2mel(i*samplerate/nfft)
+            if mel>leftmel and mel<rightmel:
+                if mel<centermel:
+                    fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
+                else:
+                    fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
+    return fbank
+
+def lifter(cepstra, L=22):
+    """Apply a cepstral lifter to the matrix of cepstra. This has the effect of increasing the
+    magnitude of the high frequency DCT coeffs.
+
+    Coefficient n of every frame is scaled by 1 + (L/2) * sin(pi * n / L).
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    :returns: the liftered cepstra (a new array), or `cepstra` itself unchanged when L <= 0.
+    """
+    if L > 0:
+        nframes,ncoeff = numpy.shape(cepstra)
+        n = numpy.arange(ncoeff)
+        lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
+        return lift*cepstra
+    else:
+        # values of L <= 0, do nothing
+        return cepstra
+
+def delta(feat, N):
+    """Compute delta features from a feature vector sequence.
+
+    For frame t the delta is sum_{n=-N..N} n * feat[t+n] / (2 * sum_{n=1..N} n^2),
+    where frames outside the sequence are taken to be copies of the first/last frame.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    :raises ValueError: if N is smaller than 1.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    NUMFRAMES = len(feat)
+    denominator = 2 * sum([i**2 for i in range(1, N+1)])
+    # output has the same shape and dtype as the input
+    delta_feat = numpy.empty_like(feat)
+    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')   # padded version of feat
+    for t in range(NUMFRAMES):
+        delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator   # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
+    return delta_feat
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
+         winfunc=lambda x:numpy.ones((x,))):
+    """Compute MFCC features from an audio signal.
+
+    Pipeline: Mel-filterbank energies from `fbank` -> natural log -> orthonormal
+    DCT-II along the filter axis -> keep the first `numcep` coefficients -> lifter.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstrum to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    # arguments are passed positionally, so their order must match fbank's signature
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
+    feat = numpy.log(feat)
+    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
+    feat = lifter(feat,ceplifter)
+    if appendEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
+    return feat
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+          winfunc=lambda x:numpy.ones((x,))):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+        (also used when a falsy value such as 0 is passed).
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
+        second return value is the energy in each frame, computed as the sum of that frame's power spectrum.
+        Exact zeros in either output are replaced by float eps so that a later log is finite.
+    """
+    highfreq= highfreq or samplerate/2
+    # pre-emphasis is applied to the whole signal before it is cut into frames
+    signal = sigproc.preemphasis(signal,preemph)
+    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
+    pspec = sigproc.powspec(frames,nfft)
+    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
+    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
+    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
+
+    return feat,energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
+    """Compute log Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
+    return numpy.log(feat)
+
+def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+        winfunc=lambda x:numpy.ones((x,))):
+    """Compute Spectral Subband Centroid features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    highfreq= highfreq or samplerate/2
+    signal = sigproc.preemphasis(signal,preemph)
+    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, winfunc)
+    pspec = sigproc.powspec(frames,nfft)
+    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
+    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))
+
+    return numpy.dot(pspec*R,fb.T) / feat
+
+def hz2mel(hz):
+    """Convert a value in Hertz to Mels
+
+    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
+    """
+    return 2595 * numpy.log10(1+hz/700.)
+
+def mel2hz(mel):
+    """Convert a value in Mels to Hertz
+
+    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
+    """
+    return 700*(10**(mel/2595.0)-1)
+
+def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
+    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
+
+    :param nfilt: the number of filters in the filterbank, default 20.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    """
+    highfreq= highfreq or samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # compute points evenly spaced in mels
+    lowmel = hz2mel(lowfreq)
+    highmel = hz2mel(highfreq)
+    melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
+    # our points are in Hz, but we use fft bins, so we have to convert
+    #  from Hz to fft bin number
+    bin = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
+
+    fbank = numpy.zeros([nfilt,nfft//2+1])
+    for j in range(0,nfilt):
+        for i in range(int(bin[j]), int(bin[j+1])):
+            fbank[j,i] = (i - bin[j]) / (bin[j+1]-bin[j])
+        for i in range(int(bin[j+1]), int(bin[j+2])):
+            fbank[j,i] = (bin[j+2]-i) / (bin[j+2]-bin[j+1])
+    return fbank
+
+def lifter(cepstra, L=22):
+    """Apply a cepstral lifter the the matrix of cepstra. This has the effect of increasing the
+    magnitude of the high frequency DCT coeffs.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    """
+    if L > 0:
+        nframes,ncoeff = numpy.shape(cepstra)
+        n = numpy.arange(ncoeff)
+        lift = 1 + (L/2.)*numpy.sin(numpy.pi*n/L)
+        return lift*cepstra
+    else:
+        # values of L <= 0, do nothing
+        return cepstra
+
+def delta(feat, N):
+    """Compute delta features from a feature vector sequence.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    NUMFRAMES = len(feat)
+    denominator = 2 * sum([i**2 for i in range(1, N+1)])
+    delta_feat = numpy.empty_like(feat)
+    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')   # padded version of feat
+    for t in range(NUMFRAMES):
+        delta_feat[t] = numpy.dot(numpy.arange(-N, N+1), padded[t : t+2*N+1]) / denominator   # [t : t+2*N+1] == [(N+t)-N : (N+t)+N+1]
+    return delta_feat
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
+
+
+def rolling_window(a, window, step=1):
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
+    strides = a.strides + (a.strides[-1],)
+    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
+
+
+def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
+    """Frame a signal into overlapping frames.
+
+    :param sig: the audio signal to frame.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
+    :returns: an array of frames. Size is NUMFRAMES by frame_len.
+    """
+    slen = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+    if slen <= frame_len:
+        numframes = 1
+    else:
+        numframes = 1 + (( slen - frame_len) // frame_step)
+
+    # check kaldi/src/feat/feature-window.h
+    padsignal = sig[:(numframes-1)*frame_step+frame_len]
+    if wintype is 'povey':
+        win = numpy.empty(frame_len)
+        for i in range(frame_len):
+            win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85     
+    else: # the hamming window
+        win = numpy.hamming(frame_len)
+        
+    if stride_trick:
+        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
+    else:
+        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
+            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+        indices = numpy.array(indices, dtype=numpy.int32)
+        frames = padsignal[indices]
+        win = numpy.tile(win, (numframes, 1))
+        
+    frames = frames.astype(numpy.float32)
+    raw_frames = numpy.zeros(frames.shape)
+    for frm in range(frames.shape[0]):
+        frames[frm,:] = do_dither(frames[frm,:], dither)        # dither
+        frames[frm,:] = do_remove_dc_offset(frames[frm,:])      # remove dc offset
+        raw_frames[frm,:] = frames[frm,:]
+        frames[frm,:] = do_preemphasis(frames[frm,:], preemph)    # preemphasize
+
+    return frames * win, raw_frames
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Does overlap-add procedure to undo the action of framesig.
+
+    :param frames: the array of frames.
+    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :returns: a 1-D signal.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    numframes = numpy.shape(frames)[0]
+    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
+        numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+    indices = numpy.array(indices, dtype=numpy.int32)
+    padlen = (numframes - 1) * frame_step + frame_len
+
+    if siglen <= 0: siglen = padlen
+
+    rec_signal = numpy.zeros((padlen,))
+    window_correction = numpy.zeros((padlen,))
+    win = winfunc(frame_len)
+
+    for i in range(0, numframes):
+        window_correction[indices[i, :]] = window_correction[
+                                               indices[i, :]] + win + 1e-15  # add a little bit so it is never zero
+        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
+
+    rec_signal = rec_signal / window_correction
+    return rec_signal[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
+    """
+    if numpy.shape(frames)[1] > NFFT:
+        logging.warn(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            numpy.shape(frames)[1], NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
+    """
+    return numpy.square(magspec(frames, NFFT))
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
+    """
+    ps = powspec(frames, NFFT);
+    ps[ps <= 1e-30] = 1e-30
+    lps = 10 * numpy.log10(ps)
+    if norm:
+        return lps - numpy.max(lps)
+    else:
+        return lps
+
+def do_dither(signal, dither_value=1.0):
+    signal += numpy.random.normal(size=signal.shape) * dither_value
+    return signal
+    
+def do_remove_dc_offset(signal):
+    signal -= numpy.mean(signal)
+    return signal
+
+def do_preemphasis(signal, coeff=0.97):
+    """perform preemphasis on the input signal.
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
+    :returns: the filtered signal.
+    """
+    return numpy.append((1-coeff)*signal[0], signal[1:] - coeff * signal[:-1])
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    return int(decimal.Decimal(number).quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP))
+
+
+def rolling_window(a, window, step=1):
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    shape = a.shape[:-1] + (a.shape[-1] - window + 1, window)
+    strides = a.strides + (a.strides[-1],)
+    return numpy.lib.stride_tricks.as_strided(a, shape=shape, strides=strides)[::step]
+
+
+def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
+    """Frame a signal into overlapping frames.
+
+    :param sig: the audio signal to frame.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
+    :returns: an array of frames. Size is NUMFRAMES by frame_len.
+    """
+    slen = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+    if slen <= frame_len:
+        numframes = 1
+    else:
+        numframes = 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
+
+    padlen = int((numframes - 1) * frame_step + frame_len)
+
+    zeros = numpy.zeros((padlen - slen,))
+    padsignal = numpy.concatenate((sig, zeros))
+    if stride_trick:
+        win = winfunc(frame_len)
+        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
+    else:
+        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
+            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+        indices = numpy.array(indices, dtype=numpy.int32)
+        frames = padsignal[indices]
+        win = numpy.tile(winfunc(frame_len), (numframes, 1))
+
+    return frames * win
+
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Does overlap-add procedure to undo the action of framesig.
+
+    :param frames: the array of frames.
+    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :returns: a 1-D signal.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    numframes = numpy.shape(frames)[0]
+    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
+        numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+    indices = numpy.array(indices, dtype=numpy.int32)
+    padlen = (numframes - 1) * frame_step + frame_len
+
+    if siglen <= 0: siglen = padlen
+
+    rec_signal = numpy.zeros((padlen,))
+    window_correction = numpy.zeros((padlen,))
+    win = winfunc(frame_len)
+
+    for i in range(0, numframes):
+        window_correction[indices[i, :]] = window_correction[
+                                               indices[i, :]] + win + 1e-15  # add a little bit so it is never zero
+        rec_signal[indices[i, :]] = rec_signal[indices[i, :]] + frames[i, :]
+
+    rec_signal = rec_signal / window_correction
+    return rec_signal[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
+    """
+    if numpy.shape(frames)[1] > NFFT:
+        logging.warn(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            numpy.shape(frames)[1], NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
+    """
+    return 1.0 / NFFT * numpy.square(magspec(frames, NFFT))
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
+    """
+    ps = powspec(frames, NFFT);
+    ps[ps <= 1e-30] = 1e-30
+    lps = 10 * numpy.log10(ps)
+    if norm:
+        return lps - numpy.max(lps)
+    else:
+        return lps
+
+
+def preemphasis(signal, coeff=0.95):
+    """perform preemphasis on the input signal.
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
+    :returns: the filtered signal.
+    """
+    return numpy.append(signal[0], signal[1:] - coeff * signal[:-1])
--- a/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg
+++ b/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg
--- a/third_party/python_kaldi_features/docs/Makefile
+++ b/third_party/python_kaldi_features/docs/Makefile
+# Makefile for Sphinx documentation
+#
+# Every build target below runs $(SPHINXBUILD) with a different builder (-b)
+# and writes into its own subdirectory of $(BUILDDIR).
+
+# You can set these variables from the command line.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# Internal variables.
+# $(PAPEROPT_$(PAPER)) expands to PAPEROPT_a4 or PAPEROPT_letter depending on
+# PAPER, and to nothing while PAPER is empty.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
+
+# First target in the file, so a bare `make` prints this overview.
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html      to make standalone HTML files"
+	@echo "  dirhtml   to make HTML files named index.html in directories"
+	@echo "  pickle    to make pickle files"
+	@echo "  json      to make JSON files"
+	@echo "  htmlhelp  to make HTML files and a HTML help project"
+	@echo "  qthelp    to make HTML files and a qthelp project"
+	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  changes   to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck to check all external links for integrity"
+	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
+
+# The leading '-' tells make to carry on even if rm reports an error.
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python_speech_features.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python_speech_features.qhc"
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
+	      "run these through (pdf)latex."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
--- a/third_party/python_kaldi_features/docs/make.bat
+++ b/third_party/python_kaldi_features/docs/make.bat
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+REM The first command-line argument (%1) selects the target to run.
+
+REM Build settings: the Sphinx executable, the output directory and the
+REM options shared by every sphinx-build call.
+set SPHINXBUILD=sphinx-build
+set BUILDDIR=build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
+REM When PAPER is set, pass it on as the LaTeX paper size.
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+)
+
+REM Without a target, jump to the help text.
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html      to make standalone HTML files
+	echo.  dirhtml   to make HTML files named index.html in directories
+	echo.  pickle    to make pickle files
+	echo.  json      to make JSON files
+	echo.  htmlhelp  to make HTML files and a HTML help project
+	echo.  qthelp    to make HTML files and a qthelp project
+	echo.  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  changes   to make an overview over all changed/added/deprecated items
+	echo.  linkcheck to check all external links for integrity
+	echo.  doctest   to run all doctests embedded in the documentation if enabled
+	goto end
+)
+
+REM clean: remove every subdirectory and every file below BUILDDIR.
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+REM Each target below runs the matching Sphinx builder (-b) into its own
+REM subdirectory of BUILDDIR and then jumps to the common exit label.
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+REM qthelp: build the Qt help project and print the follow-up commands.
+REM The collection file shown in the last hint has the .qhc extension.
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python_speech_features.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python_speech_features.qhc
+	goto end
+)
+
+REM latex: write the LaTeX sources into BUILDDIR/latex.
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+REM changes: run the changes builder into BUILDDIR/changes.
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+REM linkcheck: run the linkcheck builder into BUILDDIR/linkcheck.
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+REM doctest: run the doctest builder into BUILDDIR/doctest.
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+REM Common exit point: every target jumps here when it is done.
+:end
--- a/third_party/python_kaldi_features/docs/source/conf.py
+++ b/third_party/python_kaldi_features/docs/source/conf.py
+# -*- coding: utf-8 -*-
+#
+# python_speech_features documentation build configuration file, created by
+# sphinx-quickstart on Thu Oct 31 16:49:58 2013.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys, os
+
+import mock
+ 
+MOCK_MODULES = ['numpy', 'scipy', 'scipy.fftpack']
+for mod_name in MOCK_MODULES:
+  sys.modules[mod_name] = mock.Mock()
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0,os.path.abspath('../..'))
+
+# -- General configuration -----------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'python_speech_features'
+copyright = u'2013, James Lyons'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.1.0'
+# The full version, including alpha/beta/rc tags.
+release = '0.1.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directory, that shouldn't be searched
+# for source files.
+exclude_trees = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  Major themes that come with
+# Sphinx are currently 'default' and 'sphinxdoc'.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_use_modindex = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'python_speech_featuresdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'python_speech_features.tex', u'python\\_speech\\_features Documentation',
+   u'James Lyons', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
+
+autodoc_member_order = 'bysource'
--- a/third_party/python_kaldi_features/docs/source/index.rst
+++ b/third_party/python_kaldi_features/docs/source/index.rst
+.. python_speech_features documentation master file, created by
+   sphinx-quickstart on Thu Oct 31 16:49:58 2013.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to python_speech_features's documentation!
+==================================================
+
+This library provides common speech features for ASR including MFCCs and filterbank energies.
+If you are not sure what MFCCs are and would like to know more, have a look at this MFCC tutorial:
+http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/.
+
+You will need numpy and scipy to run these files. The code for this project is available at https://github.com/jameslyons/python_speech_features .
+
+Supported features:
+
+- :py:meth:`python_speech_features.mfcc` - Mel Frequency Cepstral Coefficients
+- :py:meth:`python_speech_features.fbank` - Filterbank Energies
+- :py:meth:`python_speech_features.logfbank` - Log Filterbank Energies
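+- :py:meth:`python_speech_features.base_orig.ssc` - Spectral Subband Centroids (only defined in the original ``base_orig`` module, not in the Kaldi-compatible ``base`` module)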
+
+To use MFCC features::
+
+    from python_speech_features import mfcc
+    from python_speech_features import logfbank
+    import scipy.io.wavfile as wav
+    
+    (rate,sig) = wav.read("file.wav")
+    mfcc_feat = mfcc(sig,rate)
+    fbank_feat = logfbank(sig,rate)
+    
+    print(fbank_feat[1:3,:])
+
+From here you can write the features to a file etc.
+
+Functions provided in python_speech_features module
+---------------------------------------------------
+   
+.. automodule:: python_speech_features.base
+    :members:
+    
+
+Functions provided in sigproc module
+------------------------------------
+.. automodule:: python_speech_features.sigproc
+    :members:
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
+
--- a/third_party/python_kaldi_features/english.wav
+++ b/third_party/python_kaldi_features/english.wav
--- a/third_party/python_kaldi_features/example.py
+++ b/third_party/python_kaldi_features/example.py
+#!/usr/bin/env python
+
+from python_speech_features import mfcc
+from python_speech_features import delta
+from python_speech_features import logfbank
+import scipy.io.wavfile as wav
+
+# english.wav is looked up relative to the current working directory
+(rate,sig) = wav.read("english.wav")
+
+# note that generally nfilt=40 is used for speech recognition
+# the sample rate read from the file is passed on explicitly; otherwise the
+# feature functions silently fall back to their 16000 Hz default
+fbank_feat = logfbank(sig,samplerate=rate,nfilt=23,lowfreq=20,dither=0,wintype='povey')
+
+# the computed fbank coefficients of english.wav with dimension [110,23]
+# [ 12.2865	12.6906	13.1765	15.714	16.064	15.7553	16.5746	16.9205	16.6472	16.1302	16.4576	16.7326	16.8864	17.7215	18.88	19.1377	19.1495	18.6683	18.3886	20.3506	20.2772	18.8248	18.1899
+# 11.9198	13.146	14.7215	15.8642	17.4288	16.394	16.8238	16.1095	16.4297	16.6331	16.3163	16.5093	17.4981	18.3429	19.6555	19.6263	19.8435	19.0534	19.001	20.0287	19.7707	19.5852	19.1112
+# ...
+# ...
+# the same as the output of the kaldi command: compute-fbank-feats --dither=0.0
+
+
+mfcc_feat = mfcc(sig,samplerate=rate,dither=0,useEnergy=True,wintype='povey')
+
+# the computed mfcc coefficients of english.wav with dimension [110,13]
+# [ 17.1337	-23.3651	-7.41751	-7.73686	-21.3682	-8.93884	-3.70843	4.68346	-16.0676	12.782	-7.24054	8.25089	10.7292
+# 17.1692	-23.3028	-5.61872	-4.0075	-23.287	-20.6101	-5.51584	-6.15273	-14.4333	8.13052	-0.0345329	2.06274	-0.564298
+# ...
+# ...
+# the same as the output of the kaldi command: compute-mfcc-feats --dither=0.0
+
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO
+Metadata-Version: 1.0
+Name: python-speech-features
+Version: 0.6
+Summary: Python Speech Feature extraction
+Home-page: https://github.com/jameslyons/python_speech_features
+Author: James Lyons
+Author-email: james.lyons0@gmail.com
+License: MIT
+Description: UNKNOWN
+Platform: UNKNOWN
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt
+README.rst
+setup.py
+python_speech_features/__init__.py
+python_speech_features/base.py
+python_speech_features/base_orig.py
+python_speech_features/sigproc.py
+python_speech_features/sigproc_orig.py
+python_speech_features.egg-info/PKG-INFO
+python_speech_features.egg-info/SOURCES.txt
+python_speech_features.egg-info/dependency_links.txt
+python_speech_features.egg-info/top_level.txt
+test/test_sigproc.py
\ No newline at end of file
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt
+
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt
+python_speech_features
--- a/third_party/python_kaldi_features/python_speech_features/__init__.py
+++ b/third_party/python_kaldi_features/python_speech_features/__init__.py
+from .base import *
--- a/third_party/python_kaldi_features/python_speech_features/base.py
+++ b/third_party/python_kaldi_features/python_speech_features/base.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
+         ceplifter=22,useEnergy=True,wintype='povey'):
+    """Compute MFCC features from an audio signal.
+
+    The log of the Mel-filterbank energies returned by :func:`fbank` is passed through an
+    orthonormal type-2 DCT, truncated to the first ``numcep`` coefficients and liftered.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstrum to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 23.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: dithering amount, forwarded unchanged to sigproc.framesig. Default is 1.0.
+    :param remove_dc_offset: DC-offset removal switch, forwarded unchanged to sigproc.framesig. Default is True.
+    :param preemph: preemphasis coefficient, forwarded unchanged to sigproc.framesig. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the frame energy returned by fbank. Default is True.
+    :param wintype: name of the analysis window, forwarded unchanged to sigproc.framesig. Default is 'povey'.
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    feat,energy = fbank(signal,samplerate=samplerate,winlen=winlen,winstep=winstep,
+                        nfilt=nfilt,nfft=nfft,lowfreq=lowfreq,highfreq=highfreq,
+                        dither=dither,remove_dc_offset=remove_dc_offset,preemph=preemph,
+                        wintype=wintype)
+    feat = numpy.log(feat)
+    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
+    feat = lifter(feat,ceplifter)
+    if useEnergy:
+        # replace first cepstral coefficient with log of frame energy
+        feat[:,0] = numpy.log(energy)
+    return feat
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, 
+          wintype='hamming'):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: dithering amount, forwarded unchanged to sigproc.framesig. Default is 1.0.
+    :param remove_dc_offset: DC-offset removal switch, forwarded unchanged to sigproc.framesig. Default is True.
+    :param preemph: preemphasis coefficient, forwarded unchanged to sigproc.framesig. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded unchanged to sigproc.framesig. Default is 'hamming'.
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
+        second return value is the energy in each frame, i.e. the sum of squares of the second
+        (raw, unwindowed) frame array returned by sigproc.framesig.
+    """
+    highfreq= highfreq or samplerate/2
+    eps = numpy.finfo(float).eps  # substituted for exact zeros so that a later log() stays finite
+    frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
+    pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
+    energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
+    energy = numpy.where(energy == 0,eps,energy) # if energy is zero, we get problems with log
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
+    feat = numpy.where(feat == 0,eps,feat) # if feat is zero, we get problems with log
+
+    return feat,energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
+    """Compute log Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: dithering amount, forwarded unchanged to fbank. Default is 1.0.
+    :param remove_dc_offset: DC-offset removal switch, forwarded unchanged to fbank. Default is True.
+    :param preemph: preemphasis coefficient, forwarded unchanged to fbank. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded unchanged to fbank. Default is 'hamming'.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    # only the filterbank energies are needed here; the per-frame energy is discarded
+    feat,_ = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
+    return numpy.log(feat)
+
+def hz2mel(hz):
+    """Map a frequency in Hertz onto the Mel scale using the natural-log formula.
+
+    :param hz: a value in Hz. A numpy array is converted element-wise.
+    :returns: the corresponding value in Mels; an array input yields an array of the same size.
+    """
+    ratio = hz/700.0
+    return numpy.log(ratio+1) * 1127
+
+
+def mel2hz(mel):
+    """Map a Mel-scale value back to Hertz (inverse of the natural-log Mel formula).
+
+    :param mel: a value in Mels. A numpy array is converted element-wise.
+    :returns: the corresponding value in Hertz; an array input yields an array of the same size.
+    """
+    growth = numpy.exp(mel/1127.0)
+    return (growth-1) * 700
+
+def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
+    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
+
+    The triangles are laid out on the Mel axis: filter j rises from lowmel+j*delta to
+    lowmel+(j+1)*delta and falls back to zero at lowmel+(j+2)*delta. Only bins
+    0 .. nfft/2-1 are ever written, so the last column (bin nfft/2) stays zero.
+
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    """
+    highfreq= highfreq or samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # compute points evenly spaced in mels
+    lowmel = hz2mel(lowfreq)
+    highmel = hz2mel(highfreq)
+    mel_freq_delta = (highmel-lowmel)/(nfilt+1)
+
+    # the mel value of an FFT bin does not depend on the filter index,
+    # so it is computed once per bin instead of once per (filter, bin) pair
+    bin_mels = [hz2mel(i*samplerate/nfft) for i in range(nfft//2)]
+
+    # check kaldi/src/feat/Mel-computations.h
+    fbank = numpy.zeros([nfilt,nfft//2+1])
+    for j in range(nfilt):
+        leftmel = lowmel+j*mel_freq_delta
+        centermel = lowmel+(j+1)*mel_freq_delta
+        rightmel = lowmel+(j+2)*mel_freq_delta
+        for i,mel in enumerate(bin_mels):
+            # bins exactly on the outer edges get weight zero (strict comparison)
+            if mel>leftmel and mel<rightmel:
+                if mel<centermel:
+                    fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
+                else:
+                    fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
+    return fbank
+
+def lifter(cepstra, L=22):
+    """Apply a sinusoidal cepstral lifter to a matrix of cepstra, which boosts the
+    magnitude of the higher-order DCT coefficients.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    :returns: the liftered matrix, or ``cepstra`` itself (unchanged) when the lifter is disabled.
+    """
+    if not L > 0:
+        # values of L <= 0, do nothing
+        return cepstra
+    _,ncoeff = numpy.shape(cepstra)
+    coeff_index = numpy.arange(ncoeff)
+    weights = 1 + (L/2.)*numpy.sin(numpy.pi*coeff_index/L)
+    return weights*cepstra
+
+def delta(feat, N):
+    """Compute delta (regression) features over a sequence of feature vectors.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    :raises ValueError: if N is smaller than 1.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    span = 2*N + 1                      # frames covered by one regression window
+    weights = numpy.arange(-N, N+1)     # regression weights -N .. N
+    denominator = 2 * sum(k**2 for k in range(1, N+1))
+    # repeat the first and last frame N times so every frame has a full window
+    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')
+    delta_feat = numpy.empty_like(feat)
+    for t in range(len(feat)):
+        # rows t .. t+2N of the padded array are centred on original frame t
+        delta_feat[t] = numpy.dot(weights, padded[t : t+span]) / denominator
+    return delta_feat
--- a/third_party/python_kaldi_features/python_speech_features/base_orig.py
+++ b/third_party/python_kaldi_features/python_speech_features/base_orig.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
+         winfunc=lambda x:numpy.ones((x,))):
+    """Compute MFCC features from an audio signal (original, pre-Kaldi implementation).
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstrum to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    fb_energies,frame_energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,
+                                     lowfreq,highfreq,preemph,winfunc)
+    # log filterbank energies -> orthonormal DCT-II -> keep the first numcep coefficients
+    cepstra = dct(numpy.log(fb_energies), type=2, axis=1, norm='ortho')[:,:numcep]
+    cepstra = lifter(cepstra,ceplifter)
+    if appendEnergy:
+        # replace first cepstral coefficient with log of frame energy
+        cepstra[:,0] = numpy.log(frame_energy)
+    return cepstra
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+          winfunc=lambda x:numpy.ones((x,))):
+    """Compute Mel-filterbank energy features from an audio signal (original, pre-Kaldi implementation).
+
+    NOTE(review): this module imports ``sigproc``, whose ``framesig`` in this tree takes
+    ``dither`` as its 4th positional argument and returns a tuple; the call below still
+    passes ``winfunc`` in that position and uses the result as a single array. This
+    function presumably expects the original signal-processing module -- confirm.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
+        second return value is the energy in each frame (sum of the power spectrum of the frame)
+    """
+    highfreq = highfreq if highfreq else samplerate/2
+    eps = numpy.finfo(float).eps  # stand-in for exact zeros, which would break a later log()
+
+    emphasized = sigproc.preemphasis(signal,preemph)
+    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
+    pspec = sigproc.powspec(frames,nfft)
+
+    # total energy in each frame
+    energy = numpy.sum(pspec,1)
+    energy = numpy.where(energy == 0,eps,energy)
+
+    # filterbank energies
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    feat = numpy.dot(pspec,fb.T)
+    feat = numpy.where(feat == 0,eps,feat)
+
+    return feat,energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
+    """Compute log Mel-filterbank energy features from an audio signal (original, pre-Kaldi implementation).
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    # fbank returns (filterbank energies, frame energy); only the former is used
+    fb_energies,_ = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
+    log_energies = numpy.log(fb_energies)
+    return log_energies
+
+def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+        winfunc=lambda x:numpy.ones((x,))):
+    """Compute Spectral Subband Centroid features from an audio signal.
+
+    NOTE(review): this module imports ``sigproc``, whose ``framesig`` in this tree takes
+    ``dither`` as its 4th positional argument and returns a tuple; the call below still
+    passes ``winfunc`` in that position and uses the result as a single array. This
+    function presumably expects the original signal-processing module -- confirm.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    highfreq = highfreq if highfreq else samplerate/2
+
+    emphasized = sigproc.preemphasis(signal,preemph)
+    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
+    pspec = sigproc.powspec(frames,nfft)
+    # if things are all zeros we get problems
+    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec)
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    band_energy = numpy.dot(pspec,fb.T) # the filterbank energies (denominator of the centroid)
+
+    # per-bin frequency ramp from 1 to samplerate/2, repeated for every frame
+    num_frames = numpy.size(pspec,0)
+    num_bins = numpy.size(pspec,1)
+    R = numpy.tile(numpy.linspace(1,samplerate/2,num_bins),(num_frames,1))
+
+    weighted_energy = numpy.dot(pspec*R,fb.T)
+    return weighted_energy / band_energy
+
+def hz2mel(hz):
+    """Map a frequency in Hertz onto the Mel scale using the log10 formula.
+
+    :param hz: a value in Hz. A numpy array is converted element-wise.
+    :returns: the corresponding value in Mels; an array input yields an array of the same size.
+    """
+    ratio = hz/700.
+    return numpy.log10(ratio+1) * 2595
+
+def mel2hz(mel):
+    """Map a Mel-scale value back to Hertz (inverse of the log10 Mel formula).
+
+    :param mel: a value in Mels. A numpy array is converted element-wise.
+    :returns: the corresponding value in Hertz; an array input yields an array of the same size.
+    """
+    growth = 10**(mel/2595.0)
+    return (growth-1) * 700
+
+def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Build a Mel-filterbank of triangular filters. Each row is one filter and each
+    column one fft bin, so the result has size nfilt * (nfft/2 + 1).
+
+    :param nfilt: the number of filters in the filterbank, default 20.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    """
+    highfreq = highfreq if highfreq else samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # nfilt+2 edge points evenly spaced in mels between the two band edges
+    mel_edges = numpy.linspace(hz2mel(lowfreq),hz2mel(highfreq),nfilt+2)
+    # the edges are in Hz after conversion, but the filters are indexed by fft bin
+    edge_bins = numpy.floor((nfft+1)*mel2hz(mel_edges)/samplerate)
+
+    filters = numpy.zeros([nfilt,nfft//2+1])
+    for row in range(nfilt):
+        left = edge_bins[row]
+        center = edge_bins[row+1]
+        right = edge_bins[row+2]
+        # rising slope: left edge up to (not including) the center bin
+        for k in range(int(left), int(center)):
+            filters[row,k] = (k - left) / (center-left)
+        # falling slope: center bin up to (not including) the right edge
+        for k in range(int(center), int(right)):
+            filters[row,k] = (right-k) / (right-center)
+    return filters
+
+def lifter(cepstra, L=22):
+    """Apply a sinusoidal cepstral lifter to a matrix of cepstra, which boosts the
+    magnitude of the higher-order DCT coefficients.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    :returns: the liftered matrix, or ``cepstra`` itself (unchanged) when the lifter is disabled.
+    """
+    if L > 0:
+        (_, ncoeff) = numpy.shape(cepstra)
+        half_l = L/2.
+        phase = numpy.pi*numpy.arange(ncoeff)/L
+        return (1 + half_l*numpy.sin(phase))*cepstra
+    # values of L <= 0, do nothing
+    return cepstra
+
+def delta(feat, N):
+    """Compute delta (regression) features over a sequence of feature vectors.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    :raises ValueError: if N is smaller than 1.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    num_frames = len(feat)
+    taps = numpy.arange(-N, N+1)
+    denominator = 2 * sum([step**2 for step in range(1, N+1)])
+    # edge-pad with N copies of the first/last frame so every window is complete
+    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')
+    result = numpy.empty_like(feat)
+    for t in range(num_frames):
+        window = padded[t : t+2*N+1]   # == padded[(N+t)-N : (N+t)+N+1]
+        result[t] = numpy.dot(taps, window) / denominator
+    return result
--- a/third_party/python_kaldi_features/python_speech_features/sigproc.py
+++ b/third_party/python_kaldi_features/python_speech_features/sigproc.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    """Round ``number`` to the nearest integer using decimal.ROUND_HALF_UP and return it as an int."""
+    whole_unit = decimal.Decimal('1')
+    rounded = decimal.Decimal(number).quantize(whole_unit, rounding=decimal.ROUND_HALF_UP)
+    return int(rounded)
+
+
+def rolling_window(a, window, step=1):
+    """Return overlapping windows of length ``window`` taken along the last axis of ``a``.
+
+    The windows are built with numpy's as_strided, and ``[::step]`` then keeps every
+    ``step``-th entry along the first axis of that result.
+    """
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    num_windows = a.shape[-1] - window + 1
+    view_shape = a.shape[:-1] + (num_windows, window)
+    view_strides = a.strides + (a.strides[-1],)
+    windows = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
+    return windows[::step]
+
+
+def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
+    """Frame a signal into overlapping frames and apply the per-frame processing chain.
+
+    Each frame is dithered, optionally has its DC offset removed, is stored as a "raw"
+    frame, then preemphasised and multiplied by the analysis window. Trailing samples
+    that do not fill a complete frame are dropped.
+
+    :param sig: the audio signal to frame.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param dither: value handed to do_dither for every frame. Default is 1.0.
+    :param preemph: coefficient handed to do_preemphasis for every frame. Default is 0.97.
+    :param remove_dc_offset: if true, do_remove_dc_offset is applied to every frame. Default is True.
+    :param wintype: 'povey' selects the Povey window; any other value selects the Hamming window.
+    :param stride_trick: use stride trick to compute the rolling window faster
+    :returns: 2 values. The first is the array of processed, windowed frames (NUMFRAMES by frame_len).
+        The second holds the same frames before preemphasis and windowing.
+    """
+    slen = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+    if slen <= frame_len:
+        numframes = 1
+    else:
+        numframes = 1 + (( slen - frame_len) // frame_step)
+
+    # check kaldi/src/feat/feature-window.h
+    padsignal = sig[:(numframes-1)*frame_step+frame_len]
+    # compare by value: an identity test ('is') against a string literal only works
+    # by accident of interning and silently selects the Hamming window otherwise
+    if wintype == 'povey':
+        win = numpy.empty(frame_len)
+        for i in range(frame_len):
+            win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
+    else: # the hamming window
+        win = numpy.hamming(frame_len)
+
+    if stride_trick:
+        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
+    else:
+        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
+            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+        indices = numpy.array(indices, dtype=numpy.int32)
+        frames = padsignal[indices]
+        win = numpy.tile(win, (numframes, 1))
+
+    # astype makes a copy, so the in-place updates below never touch the caller's signal
+    frames = frames.astype(numpy.float32)
+    raw_frames = numpy.zeros(frames.shape)
+    for frm in range(frames.shape[0]):
+        frames[frm,:] = do_dither(frames[frm,:], dither)        # dither
+        if remove_dc_offset:
+            frames[frm,:] = do_remove_dc_offset(frames[frm,:])  # remove dc offset
+        raw_frames[frm,:] = frames[frm,:]                       # snapshot before preemphasis/windowing
+        frames[frm,:] = do_preemphasis(frames[frm,:], preemph)    # preemphasize
+
+    return frames * win, raw_frames
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Rebuild a 1-D signal from overlapping frames by overlap-add.
+
+    :param frames: the array of frames.
+    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :returns: a 1-D signal.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    frame_shape = numpy.shape(frames)
+    numframes = frame_shape[0]
+    assert frame_shape[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    # row i holds the sample positions covered by frame i
+    within_frame = numpy.tile(numpy.arange(0, frame_len), (numframes, 1))
+    frame_starts = numpy.tile(numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+    indices = numpy.array(within_frame + frame_starts, dtype=numpy.int32)
+
+    padlen = (numframes - 1) * frame_step + frame_len
+    if siglen <= 0:
+        siglen = padlen
+
+    win = winfunc(frame_len)
+    rec_signal = numpy.zeros((padlen,))
+    window_correction = numpy.zeros((padlen,))
+
+    for i in range(numframes):
+        positions = indices[i, :]
+        # add a little bit so it is never zero
+        window_correction[positions] = window_correction[positions] + win + 1e-15
+        rec_signal[positions] = rec_signal[positions] + frames[i, :]
+
+    # normalise by the accumulated window weight at each sample
+    rec_signal = rec_signal / window_correction
+    return rec_signal[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames.
+
+    :param frames: the array of frames (N x D). Each row is a frame.
+    :param NFFT: the FFT length to use. Frames shorter than NFFT are zero-padded;
+        frames longer than NFFT are truncated and a warning is logged.
+    :returns: an N x (NFFT/2+1) array. Each row is the magnitude spectrum of the corresponding frame.
+    """
+    frame_len = numpy.shape(frames)[1]
+    if frame_len > NFFT:
+        # logging.warn is a deprecated alias that recent Python releases no
+        # longer provide; logging.warning is the supported spelling.
+        logging.warning(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            frame_len, NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum of each frame as its squared magnitude spectrum.
+
+    No 1/NFFT scaling is applied to the result.
+
+    :param frames: the array of frames (N x D). Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: an N x (NFFT/2+1) array. Each row is the power spectrum of the corresponding frame.
+    """
+    magnitude = magspec(frames, NFFT)
+    return numpy.square(magnitude)
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum (10 * log10 of the power) of each frame.
+
+    :param frames: the array of frames (N x D). Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: if truthy, the global maximum (across all frames) is subtracted so the largest value is 0.
+    :returns: an N x (NFFT/2+1) array. Each row is the log power spectrum of the corresponding frame.
+    """
+    power = powspec(frames, NFFT)
+    # Floor the power so the logarithm below never sees values at or below 1e-30.
+    power[power <= 1e-30] = 1e-30
+    log_power = 10 * numpy.log10(power)
+    if not norm:
+        return log_power
+    return log_power - numpy.max(log_power)
+
+def do_dither(signal, dither_value=1.0):
+    """Add scaled Gaussian noise to a signal in place.
+
+    :param signal: array to dither; it is modified in place.
+    :param dither_value: factor applied to the standard-normal noise samples.
+    :returns: the same array object that was passed in, after the noise has been added.
+    """
+    noise = numpy.random.normal(size=signal.shape)
+    scaled_noise = noise * dither_value
+    signal += scaled_noise
+    return signal
+    
+def do_remove_dc_offset(signal):
+    """Subtract the mean of a signal from every sample, in place.
+
+    :param signal: array to centre; it is modified in place.
+    :returns: the same array object that was passed in, now with its mean removed.
+    """
+    offset = numpy.mean(signal)
+    signal -= offset
+    return signal
+
+def do_preemphasis(signal, coeff=0.97):
+    """Perform preemphasis on the input signal.
+
+    The first output sample is ``(1 - coeff) * signal[0]``; every later sample
+    is ``signal[n] - coeff * signal[n - 1]``.
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.97.
+    :returns: the filtered signal as a new array.
+    """
+    first = (1 - coeff) * signal[0]
+    rest = signal[1:] - coeff * signal[:-1]
+    return numpy.append(first, rest)
--- a/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py
+++ b/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    """Round a number to the nearest integer using decimal's ROUND_HALF_UP mode.
+
+    :param number: the value to round.
+    :returns: the rounded value as an int.
+    """
+    as_decimal = decimal.Decimal(number)
+    whole = as_decimal.quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)
+    return int(whole)
+
+
+def rolling_window(a, window, step=1):
+    """Return sliding windows over the last axis of an array, built with as_strided.
+
+    Technique: http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+
+    :param a: the input array.
+    :param window: number of samples in each window.
+    :param step: keep every step-th window along the window axis.
+    :returns: the strided result, with the windows laid out along a new last axis of length `window`.
+    """
+    last_len = a.shape[-1]
+    last_stride = a.strides[-1]
+    view_shape = a.shape[:-1] + (last_len - window + 1, window)
+    view_strides = a.strides + (last_stride,)
+    windows = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
+    return windows[::step]
+
+
+def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
+    """Split a signal into overlapping, windowed frames.
+
+    The signal is zero-padded at the end so that the last frame is complete.
+
+    :param sig: the audio signal to frame.
+    :param frame_len: length of each frame in samples (rounded half-up to an integer).
+    :param frame_step: hop in samples between the starts of consecutive frames (rounded half-up).
+    :param winfunc: callable returning the analysis window for a given length. Defaults to all ones.
+    :param stride_trick: if true, build the frames with rolling_window instead of an explicit index matrix.
+    :returns: an array of frames. Size is NUMFRAMES by frame_len.
+    """
+    sig_len = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+
+    # One frame always exists; add as many hops as are needed to reach the end.
+    numframes = 1
+    if sig_len > frame_len:
+        numframes += int(math.ceil((1.0 * sig_len - frame_len) / frame_step))
+
+    padded_len = int((numframes - 1) * frame_step + frame_len)
+    tail = numpy.zeros((padded_len - sig_len,))
+    padded = numpy.concatenate((sig, tail))
+
+    if stride_trick:
+        # A single window row broadcasts across every frame.
+        window = winfunc(frame_len)
+        framed = rolling_window(padded, window=frame_len, step=frame_step)
+        return framed * window
+
+    # Row i of `positions` holds the sample indices that make up frame i.
+    within_frame = numpy.arange(0, frame_len)
+    frame_starts = numpy.arange(0, numframes * frame_step, frame_step)
+    positions = (within_frame[numpy.newaxis, :] + frame_starts[:, numpy.newaxis]).astype(numpy.int32)
+    framed = padded[positions]
+    window = numpy.tile(winfunc(frame_len), (numframes, 1))
+    return framed * window
+
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Undo framing by overlap-adding the frames back into a 1-D signal.
+
+    Frames are accumulated at their original offsets and the sum is divided by
+    the accumulated window weight at each sample.
+
+    :param frames: 2-D array of frames, one frame per row.
+    :param siglen: desired output length in samples; a value <= 0 keeps the full reconstructed length.
+    :param frame_len: length of each frame in samples (rounded half-up to an integer).
+    :param frame_step: hop in samples between the starts of consecutive frames (rounded half-up).
+    :param winfunc: callable returning the analysis window for a given length. Defaults to all ones.
+    :returns: the reconstructed 1-D signal, truncated to siglen samples.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    frame_count = numpy.shape(frames)[0]
+    frame_width = numpy.shape(frames)[1]
+    assert frame_width == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    # sample_idx[i] lists the output positions that frame i contributes to.
+    offsets = numpy.arange(0, frame_len)
+    starts = numpy.arange(0, frame_count * frame_step, frame_step)
+    sample_idx = (offsets[numpy.newaxis, :] + starts[:, numpy.newaxis]).astype(numpy.int32)
+
+    out_len = (frame_count - 1) * frame_step + frame_len
+    if siglen <= 0:
+        siglen = out_len
+
+    summed = numpy.zeros((out_len,))
+    weights = numpy.zeros((out_len,))
+    window = winfunc(frame_len)
+
+    for i in range(0, frame_count):
+        target = sample_idx[i, :]
+        # The small constant keeps the later division away from zero.
+        weights[target] = weights[target] + window + 1e-15
+        summed[target] = summed[target] + frames[i, :]
+
+    return (summed / weights)[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames.
+
+    :param frames: the array of frames (N x D). Each row is a frame.
+    :param NFFT: the FFT length to use. Frames shorter than NFFT are zero-padded;
+        frames longer than NFFT are truncated and a warning is logged.
+    :returns: an N x (NFFT/2+1) array. Each row is the magnitude spectrum of the corresponding frame.
+    """
+    frame_len = numpy.shape(frames)[1]
+    if frame_len > NFFT:
+        # logging.warn is a deprecated alias that recent Python releases no
+        # longer provide; logging.warning is the supported spelling.
+        logging.warning(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            frame_len, NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum of each frame: the squared magnitude spectrum scaled by 1/NFFT.
+
+    :param frames: the array of frames (N x D). Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: an N x (NFFT/2+1) array. Each row is the power spectrum of the corresponding frame.
+    """
+    scale = 1.0 / NFFT
+    squared_magnitude = numpy.square(magspec(frames, NFFT))
+    return scale * squared_magnitude
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum (10 * log10 of the power) of each frame.
+
+    :param frames: the array of frames (N x D). Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: if truthy, the global maximum (across all frames) is subtracted so the largest value is 0.
+    :returns: an N x (NFFT/2+1) array. Each row is the log power spectrum of the corresponding frame.
+    """
+    power = powspec(frames, NFFT)
+    # Floor the power so the logarithm below never sees values at or below 1e-30.
+    power[power <= 1e-30] = 1e-30
+    log_power = 10 * numpy.log10(power)
+    if not norm:
+        return log_power
+    return log_power - numpy.max(log_power)
+
+
+def preemphasis(signal, coeff=0.95):
+    """Apply a first-order preemphasis filter to the input signal.
+
+    The first sample is passed through unchanged; every later sample is
+    ``signal[n] - coeff * signal[n - 1]``.
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
+    :returns: the filtered signal as a new array.
+    """
+    first_sample = signal[0]
+    differenced = signal[1:] - coeff * signal[:-1]
+    return numpy.append(first_sample, differenced)
--- a/third_party/python_kaldi_features/requirements.txt
+++ b/third_party/python_kaldi_features/requirements.txt
+mock
+scipy
+numpy
--- a/third_party/python_kaldi_features/setup.py
+++ b/third_party/python_kaldi_features/setup.py
+# Packaging script for the python_speech_features package.
+try:
+    # setuptools is preferred because it enables the `develop` command.
+    from setuptools import setup
+except ImportError:
+    # Fall back to distutils when setuptools cannot be imported.
+    from distutils.core import setup
+
+# Distribution metadata, passed to setup() as keyword arguments.
+_PACKAGE_INFO = dict(
+    name='python_speech_features',
+    version='0.6',
+    description='Python Speech Feature extraction',
+    author='James Lyons',
+    author_email='james.lyons0@gmail.com',
+    license='MIT',
+    url='https://github.com/jameslyons/python_speech_features',
+    packages=['python_speech_features'],
+)
+
+setup(**_PACKAGE_INFO)
--- a/third_party/python_kaldi_features/test/test_sigproc.py
+++ b/third_party/python_kaldi_features/test/test_sigproc.py
+from python_speech_features import sigproc
+import unittest
+import numpy as np
+import time
+
+
+class test_case(unittest.TestCase):
+    """Unit tests for the framing helpers in python_speech_features.sigproc."""
+
+    def test_frame_sig(self):
+        """framesig must give identical frames with and without the stride trick.
+
+        The stride-trick path is also expected to be the faster of the two.
+        NOTE(review): this assumes framesig returns a single array (``.shape``
+        is read from the result) -- confirm against the sigproc module under test.
+        """
+        n = 10000124
+        frame_len = 37
+        frame_step = 13
+        x = np.random.rand(n)
+        # perf_counter is a monotonic, high-resolution clock, so the measured
+        # intervals cannot be distorted by wall-clock adjustments.
+        t0 = time.perf_counter()
+        y_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=False)
+        t1 = time.perf_counter()
+        y_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step, stride_trick=True)
+        t_new = time.perf_counter() - t1
+        t_old = t1 - t0
+        self.assertTupleEqual(y_old.shape, y_new.shape)
+        np.testing.assert_array_equal(y_old, y_new)
+        self.assertLess(t_new, t_old)
+        print('new run time %3.2f < %3.2f sec' % (t_new, t_old))
+
+    def test_rolling(self):
+        """rolling_window with window=4, step=3 over 0..9 yields three overlapping rows."""
+        x = np.arange(10)
+        y = sigproc.rolling_window(x, window=4, step=3)
+        y_expected = np.array([[0, 1, 2, 3],
+                               [3, 4, 5, 6],
+                               [6, 7, 8, 9]]
+                              )
+        # assert_array_equal returns None, so its result is not bound to a name.
+        np.testing.assert_array_equal(y, y_expected)
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -24,7 +24,7 @@ from deepspeech.utils.utility import print_arguments
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('num_samples',      int,    2000,    "# of samples to for statistics.")
+add_arg('num_samples',      int,    -1,    "# of samples to for statistics.")
 add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc, fbank.",