kaldi fbank and mfcc

48f4bda3 · Hui Zhang · 281d46da · 48f4bda3 · 48f4bda3 · 48f4bda3
40 changed file
--- a/.flake8
+++ b/.flake8
@@ -12,6 +12,7 @@ exclude =
    .git,
    # python cache
    __pycache__,
+    third_party/,
 # Provide a comma-separate list of glob patterns to include for checks.
 filename =
    *.py
@@ -46,4 +47,4 @@ select =
    E,
    W,
    F,
-    C
\ No newline at end of file
+    C
--- a/.notebook/dataloader_with_tokens_tokenids.ipynb
+++ b/.notebook/dataloader_with_tokens_tokenids.ipynb
--- a/.notebook/python_test.ipynb
+++ b/.notebook/python_test.ipynb
@@ -637,7 +637,7 @@
  {
   "cell_type": "code",
   "execution_count": 59,
-   "id": "engaged-offense",
+   "id": "first-release",
   "metadata": {},
   "outputs": [
    {
@@ -660,7 +660,7 @@
  {
   "cell_type": "code",
   "execution_count": 35,
-   "id": "level-fairy",
+   "id": "convertible-roulette",
   "metadata": {},
   "outputs": [
    {
@@ -705,7 +705,7 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "id": "beautiful-geometry",
+   "id": "cutting-fleece",
   "metadata": {},
   "outputs": [
    {
@@ -728,7 +728,7 @@
  {
   "cell_type": "code",
   "execution_count": 4,
-   "id": "african-trustee",
+   "id": "historical-diving",
   "metadata": {},
   "outputs": [
    {
@@ -748,7 +748,7 @@
  {
   "cell_type": "code",
   "execution_count": 5,
-   "id": "ready-wages",
+   "id": "similar-spice",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -758,7 +758,7 @@
  {
   "cell_type": "code",
   "execution_count": 6,
-   "id": "distinguished-printer",
+   "id": "grand-influence",
   "metadata": {},
   "outputs": [
    {
@@ -776,7 +776,7 @@
  {
   "cell_type": "code",
   "execution_count": 7,
-   "id": "precious-limit",
+   "id": "wireless-hypothetical",
   "metadata": {},
   "outputs": [
    {
@@ -809,7 +809,7 @@
  {
   "cell_type": "code",
   "execution_count": 17,
-   "id": "chemical-convenience",
+   "id": "designed-fluid",
   "metadata": {},
   "outputs": [
    {
@@ -839,7 +839,7 @@
  {
   "cell_type": "code",
   "execution_count": 18,
-   "id": "round-remark",
+   "id": "cultural-friendship",
   "metadata": {},
   "outputs": [
    {
@@ -871,7 +871,7 @@
  {
   "cell_type": "code",
   "execution_count": 19,
-   "id": "smaller-shower",
+   "id": "fossil-lotus",
   "metadata": {},
   "outputs": [
    {
@@ -903,7 +903,7 @@
  {
   "cell_type": "code",
   "execution_count": 31,
-   "id": "integrated-block",
+   "id": "constitutional-poker",
   "metadata": {},
   "outputs": [
    {
@@ -935,7 +935,7 @@
  {
   "cell_type": "code",
   "execution_count": 32,
-   "id": "favorite-failure",
+   "id": "threaded-strap",
   "metadata": {},
   "outputs": [
    {
@@ -966,7 +966,7 @@
  {
   "cell_type": "code",
   "execution_count": 20,
-   "id": "boolean-saint",
+   "id": "infectious-welcome",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -977,7 +977,7 @@
  {
   "cell_type": "code",
   "execution_count": 46,
-   "id": "senior-hospital",
+   "id": "musical-anatomy",
   "metadata": {},
   "outputs": [
    {
@@ -997,7 +997,7 @@
  {
   "cell_type": "code",
   "execution_count": 30,
-   "id": "consolidated-incident",
+   "id": "lucky-paraguay",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1007,7 +1007,7 @@
  {
   "cell_type": "code",
   "execution_count": 31,
-   "id": "pursuant-paragraph",
+   "id": "annual-christmas",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1017,7 +1017,7 @@
  {
   "cell_type": "code",
   "execution_count": 47,
-   "id": "mexican-apollo",
+   "id": "infectious-seeker",
   "metadata": {},
   "outputs": [
    {
@@ -1038,7 +1038,7 @@
  {
   "cell_type": "code",
   "execution_count": 1,
-   "id": "encouraging-integration",
+   "id": "pregnant-conditioning",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1049,7 +1049,7 @@
  {
   "cell_type": "code",
   "execution_count": 56,
-   "id": "trying-auckland",
+   "id": "logical-happiness",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1059,7 +1059,7 @@
  {
   "cell_type": "code",
   "execution_count": 58,
-   "id": "national-edward",
+   "id": "rocky-plastic",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1069,7 +1069,7 @@
  {
   "cell_type": "code",
   "execution_count": 60,
-   "id": "aerial-campaign",
+   "id": "focused-compensation",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1079,7 +1079,7 @@
  {
   "cell_type": "code",
   "execution_count": 66,
-   "id": "instant-violence",
+   "id": "centered-repository",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1089,7 +1089,7 @@
  {
   "cell_type": "code",
   "execution_count": 95,
-   "id": "medical-globe",
+   "id": "inner-invite",
   "metadata": {},
   "outputs": [
    {
@@ -1110,7 +1110,7 @@
  {
   "cell_type": "code",
   "execution_count": 81,
-   "id": "three-contrast",
+   "id": "russian-chosen",
   "metadata": {},
   "outputs": [
    {
@@ -1131,7 +1131,7 @@
  {
   "cell_type": "code",
   "execution_count": 11,
-   "id": "cross-atlas",
+   "id": "equal-particle",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1161,7 +1161,7 @@
  {
   "cell_type": "code",
   "execution_count": 12,
-   "id": "empirical-defense",
+   "id": "tracked-purse",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1172,7 +1172,7 @@
  {
   "cell_type": "code",
   "execution_count": 14,
-   "id": "rocky-listening",
+   "id": "steady-mileage",
   "metadata": {},
   "outputs": [
    {
@@ -1201,7 +1201,7 @@
  {
   "cell_type": "code",
   "execution_count": 13,
-   "id": "surrounded-absolute",
+   "id": "regulated-google",
   "metadata": {},
   "outputs": [
    {
@@ -1230,7 +1230,7 @@
  {
   "cell_type": "code",
   "execution_count": 15,
-   "id": "differential-surgery",
+   "id": "homeless-forge",
   "metadata": {},
   "outputs": [
    {
@@ -1260,7 +1260,7 @@
  {
   "cell_type": "code",
   "execution_count": 29,
-   "id": "durable-powell",
+   "id": "exciting-blocking",
   "metadata": {},
   "outputs": [
    {
@@ -1290,7 +1290,7 @@
  {
   "cell_type": "code",
   "execution_count": 30,
-   "id": "young-continuity",
+   "id": "through-botswana",
   "metadata": {},
   "outputs": [
    {
@@ -1308,7 +1308,7 @@
  {
   "cell_type": "code",
   "execution_count": 22,
-   "id": "geological-sarah",
+   "id": "cellular-violence",
   "metadata": {},
   "outputs": [
    {
@@ -1343,7 +1343,7 @@
  {
   "cell_type": "code",
   "execution_count": 23,
-   "id": "possible-angle",
+   "id": "undefined-parade",
   "metadata": {},
   "outputs": [
    {
@@ -1376,7 +1376,7 @@
  {
   "cell_type": "code",
   "execution_count": 33,
-   "id": "novel-sucking",
+   "id": "special-delicious",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1386,7 +1386,7 @@
  {
   "cell_type": "code",
   "execution_count": 34,
-   "id": "fixed-wallet",
+   "id": "seasonal-consensus",
   "metadata": {},
   "outputs": [
    {
@@ -1428,7 +1428,7 @@
  {
   "cell_type": "code",
   "execution_count": 35,
-   "id": "north-seattle",
+   "id": "dress-distinction",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1438,7 +1438,7 @@
  {
   "cell_type": "code",
   "execution_count": 38,
-   "id": "above-western",
+   "id": "rental-anthony",
   "metadata": {},
   "outputs": [
    {
@@ -1471,7 +1471,7 @@
  {
   "cell_type": "code",
   "execution_count": 41,
-   "id": "choice-diabetes",
+   "id": "separated-restriction",
   "metadata": {},
   "outputs": [],
   "source": [
@@ -1481,7 +1481,7 @@
  {
   "cell_type": "code",
   "execution_count": 3,
-   "id": "white-vessel",
+   "id": "painted-variable",
   "metadata": {},
   "outputs": [
    {
@@ -1504,7 +1504,7 @@
  {
   "cell_type": "code",
   "execution_count": 5,
-   "id": "treated-freedom",
+   "id": "satellite-insider",
   "metadata": {},
   "outputs": [
    {
@@ -1523,7 +1523,7 @@
  {
   "cell_type": "code",
   "execution_count": 7,
-   "id": "convinced-safety",
+   "id": "developed-thirty",
   "metadata": {},
   "outputs": [
    {
@@ -1543,7 +1543,7 @@
  {
   "cell_type": "code",
   "execution_count": 8,
-   "id": "blond-bunny",
+   "id": "official-bench",
   "metadata": {},
   "outputs": [
    {
@@ -1560,10 +1560,97 @@
    "print(sorted_val_scores)"
   ]
  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "ranking-camera",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "b'\\x01\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x14\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x02\\x00\\x00\\x00\\x00\\x00\\x00\\x00\\x1e\\x00\\x00\\x00\\x00\\x00\\x00\\x00'\n",
+      "[ 1 20  2 30]\n",
+      "[[ 1 20]\n",
+      " [ 2 30]]\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:1: DeprecationWarning: tostring() is deprecated. Use tobytes() instead.\n",
+      "  \"\"\"Entry point for launching an IPython kernel.\n",
+      "/workspace/DeepSpeech-2.x/tools/venv/lib/python3.7/site-packages/ipykernel_launcher.py:3: DeprecationWarning: The binary mode of fromstring is deprecated, as it behaves surprisingly on unicode inputs. Use frombuffer instead\n",
+      "  This is separate from the ipykernel package so we can avoid doing imports until\n"
+     ]
+    }
+   ],
+   "source": [
+    "a = scores.tostring()\n",
+    "print(a)\n",
+    "b = np.fromstring(a, scores.dtype)\n",
+    "print(b)\n",
+    "print(scores)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "breeding-proxy",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "numpy.int16"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "np.int16"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "coordinate-hungary",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dtype = np.dtype('int16')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "specified-jackson",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "int16\n",
+      "16\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(dtype)\n",
+    "dtype is np.int16\n",
+    "print(np.iinfo(dtype).bits)"
+   ]
+  },
  {
   "cell_type": "code",
   "execution_count": null,
-   "id": "utility-monroe",
+   "id": "activated-insight",
   "metadata": {},
   "outputs": [],
   "source": []
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,6 +3,7 @@
    hooks:
    -   id: yapf
        files: \.py$
+        exclude: (?=third_party).*(\.py)$
 -   repo: https://github.com/pre-commit/pre-commit-hooks
    sha: a11d9314b22d8f8c7556443875b731ef05965464
    hooks:
@@ -15,6 +16,7 @@
    -   id: trailing-whitespace
        files: \.md$
    -   id: requirements-txt-fixer
+        exclude: (?=third_party).*$
    -   id: check-yaml
    -   id: check-json
    -   id: pretty-format-json
@@ -27,6 +29,7 @@
        -  --ignore=E501,E228,E226,E261,E266,E128,E402,W503
        -  --builtins=G,request
        -  --jobs=1
+        exclude: (?=third_party).*(\.py)$
 -   repo : https://github.com/Lucas-C/pre-commit-hooks
    sha: v1.0.1
    hooks:
@@ -51,8 +54,9 @@
        entry: python .pre-commit-hooks/copyright-check.hook
        language: system
        files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py)$
-        #exclude: (?=decoders/swig).*(\.cpp|\.h)$
+        exclude: (?=third_party).*(\.cpp|\.h|\.py)$
 -   repo: https://github.com/asottile/reorder_python_imports
    rev: v2.4.0
    hooks:
      - id: reorder-python-imports
+        exclude: (?=third_party).*(\.py)$
--- a/deepspeech/frontend/audio.py
+++ b/deepspeech/frontend/audio.py
@@ -298,6 +298,18 @@ class AudioSegment(object):
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples.tostring()

+    def to(self, dtype='int16'):
+        """Return the audio samples converted to `dtype`.
+
+        The conversion is delegated to `_convert_samples_from_float32`, and
+        the converted array is returned as-is (it is not serialized to bytes).
+
+        :param dtype: Data type for export samples. Options: 'int16', 'int32',
+                      'float32', 'float64'. Default is 'int16'.
+        :type dtype: str
+        :return: np.ndarray containing `dtype` audio content.
+        :rtype: np.ndarray
+        """
+        samples = self._convert_samples_from_float32(self._samples, dtype)
+        return samples
+
    def gain_db(self, gain):
        """Apply gain in decibels to samples.


--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@@ -64,6 +64,7 @@ class SpecAugmentor(AugmentorBase):
        self.n_freq_masks = n_freq_masks
        self.n_time_masks = n_time_masks
        self.p = p
+        #logger.info(f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}")

        # adaptive SpecAugment
        self.adaptive_number_ratio = adaptive_number_ratio

--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -56,7 +56,8 @@ class AudioFeaturizer(object):
                 max_freq=None,
                 target_sample_rate=16000,
                 use_dB_normalization=True,
-                 target_dB=-20):
+                 target_dB=-20,
+                 dither=1.0):
        self._specgram_type = specgram_type
        # mfcc and fbank using `feat_dim`
        self._feat_dim = feat_dim
@@ -69,6 +70,7 @@ class AudioFeaturizer(object):
        self._use_dB_normalization = use_dB_normalization
        self._target_dB = target_dB
        self._fft_point = n_fft
+        self._dither = dither

    def featurize(self,
                  audio_segment,
@@ -101,8 +103,7 @@ class AudioFeaturizer(object):
        if self._use_dB_normalization:
            audio_segment.normalize(target_db=self._target_dB)
        # extract spectrogram
-        return self._compute_specgram(audio_segment.samples,
-                                      audio_segment.sample_rate)
+        return self._compute_specgram(audio_segment)

    @property
    def feature_size(self):
@@ -125,9 +126,11 @@ class AudioFeaturizer(object):
                             "Supported values: linear." % self._specgram_type)
        return feat_dim

-    def _compute_specgram(self, samples, sample_rate):
+    def _compute_specgram(self, audio_segment):
        """Extract various audio features."""
+        sample_rate = audio_segment.sample_rate
        if self._specgram_type == 'linear':
+            samples = audio_segment.samples
            return self._compute_linear_specgram(
                samples,
                sample_rate,
@@ -135,6 +138,7 @@ class AudioFeaturizer(object):
                window_ms=self._window_ms,
                max_freq=self._max_freq)
        elif self._specgram_type == 'mfcc':
+            samples = audio_segment.to('int16')
            return self._compute_mfcc(
                samples,
                sample_rate,
@@ -142,8 +146,10 @@ class AudioFeaturizer(object):
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
+                dither=self._dither,
                delta_delta=self._delta_delta)
        elif self._specgram_type == 'fbank':
+            samples = audio_segment.to('int16')
            return self._compute_fbank(
                samples,
                sample_rate,
@@ -151,6 +157,7 @@ class AudioFeaturizer(object):
                stride_ms=self._stride_ms,
                window_ms=self._window_ms,
                max_freq=self._max_freq,
+                dither=self._dither,
                delta_delta=self._delta_delta)
        else:
            raise ValueError("Unknown specgram_type %s. "
@@ -233,17 +240,18 @@ class AudioFeaturizer(object):
                      sample_rate,
                      feat_dim=13,
                      stride_ms=10.0,
-                      window_ms=20.0,
+                      window_ms=25.0,
                      max_freq=None,
+                      dither=1.0,
                      delta_delta=True):
        """Compute mfcc from samples.

        Args:
-            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
+            samples (np.ndarray, np.int16): the audio signal from which to compute features.
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int): the number of cepstrum to return, default 13.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
-            window_ms (float, optional): window length in ms. Defaults to 20.0.
+            window_ms (float, optional): window length in ms. Defaults to 25.0.
            max_freq ([type], optional): highest band edge of mel filters. In Hz, default is samplerate/2. Defaults to None.
            delta_delta (bool, optional): Whether with delta delta. Defaults to False.

@@ -270,14 +278,16 @@ class AudioFeaturizer(object):
            winlen=0.001 * window_ms,
            winstep=0.001 * stride_ms,
            numcep=feat_dim,
-            nfilt=2 * feat_dim,
-            nfft=None,
-            lowfreq=0,
+            nfilt=23,
+            nfft=512,
+            lowfreq=20,
            highfreq=max_freq,
+            dither=dither,
+            remove_dc_offset=True,
            preemph=0.97,
            ceplifter=22,
-            appendEnergy=True,
-            winfunc=lambda x: np.ones((x, )))
+            useEnergy=True,
+            winfunc='povey')
        mfcc_feat = np.transpose(mfcc_feat)
        if delta_delta:
            mfcc_feat = self._concat_delta_delta(mfcc_feat)
@@ -286,15 +296,16 @@ class AudioFeaturizer(object):
    def _compute_fbank(self,
                       samples,
                       sample_rate,
-                       feat_dim=26,
+                       feat_dim=40,
                       stride_ms=10.0,
-                       window_ms=20.0,
+                       window_ms=25.0,
                       max_freq=None,
+                       dither=1.0,
                       delta_delta=False):
        """Compute logfbank from samples.
        
        Args:
-            samples (np.ndarray): the audio signal from which to compute features. Should be an N*1 array
+            samples (np.ndarray, np.int16): the audio signal from which to compute features. Should be an N*1 array
            sample_rate (float): the sample rate of the signal we are working with, in Hz.
            feat_dim (int): the number of cepstrum to return, default 13.
            stride_ms (float, optional): stride length in ms. Defaults to 10.0.
@@ -325,9 +336,13 @@ class AudioFeaturizer(object):
            winstep=0.001 * stride_ms,
            nfilt=feat_dim,
            nfft=512,
-            lowfreq=0,
+            lowfreq=20,
            highfreq=max_freq,
-            preemph=0.97, )
+            dither=dither,
+            remove_dc_offset=True,
+            preemph=0.97,
+            wintype='povey')
+
        fbank_feat = np.transpose(fbank_feat)
        if delta_delta:
            fbank_feat = self._concat_delta_delta(fbank_feat)

--- a/deepspeech/frontend/normalizer.py
+++ b/deepspeech/frontend/normalizer.py
@@ -82,13 +82,16 @@ class FeatureNormalizer(object):
    def _read_mean_std_from_file(self, filepath, eps=1e-20):
        """Load mean and std from file."""
        mean, std = load_cmvn(filepath, filetype='npz')
-        self._mean = mean
-        self._istd = 1.0 / std
+        self._mean = mean.T
+        self._istd = 1.0 / std.T

    def _compute_mean_std(self, manifest_path, featurize_func, num_samples):
        """Compute mean and std from randomly sampled instances."""
        manifest = read_manifest(manifest_path)
-        sampled_manifest = self._rng.sample(manifest, num_samples)
+        if num_samples == -1:
+            sampled_manifest = manifest
+        else:
+            sampled_manifest = self._rng.sample(manifest, num_samples)
        features = []
        for instance in sampled_manifest:
            features.append(

--- a/examples/aishell/s0/local/data.sh
+++ b/examples/aishell/s0/local/data.sh
@@ -36,10 +36,12 @@ fi
 # compute mean and stddev for normalizer
 python3 ${MAIN_ROOT}/utils/compute_mean_std.py \
 --manifest_path="data/manifest.train.raw" \
--num_samples=2000 \
 --specgram_type="fbank" \
 --feat_dim=80 \
 --delta_delta=false \
+--stride_ms=10.0 \
+--window_ms=25.0 \
+--sample_rate=16000 \
 --output_path="data/mean_std.npz"

 if [ $? -ne 0 ]; then

--- a/examples/tiny/s1/conf/augmentation.json
+++ b/examples/tiny/s1/conf/augmentation.json
 [
+  {
+    "type": "speed",
+    "params": {
+      "min_speed_rate": 0.9,
+      "max_speed_rate": 1.1,
+      "num_rates": 3
+    },
+    "prob": 0.0
+  },
  {
    "type": "shift",
    "params": {
@@ -6,5 +15,20 @@
      "max_shift_ms": 5
    },
    "prob": 1.0
+  },
+  {
+    "type": "specaug",
+    "params": {
+      "F": 10,
+      "T": 50,
+      "n_freq_masks": 2,
+      "n_time_masks": 2,
+      "p": 1.0,
+      "W": 80,
+      "adaptive_number_ratio": 0,
+      "adaptive_size_ratio": 0,
+      "max_n_time_masks": 20
+    },
+    "prob": 1.0
  }
 ]
--- a/setup.sh
+++ b/setup.sh
@@ -54,4 +54,14 @@ if [ $? != 0 ]; then
   exit -1
 fi

+
+# install kaldi-compatible feature
+pushd third_party/python_kaldi_features/
+python setup.py install
+if [ $? != 0 ]; then
+   error_msg "Please check why kaldi feature install error!"
+   exit -1
+fi
+popd
+
 info_msg "Install all dependencies successfully."
--- a/third_party/README.md
+++ b/third_party/README.md
+
+* [python_kaldi_features](https://github.com/ZitengWang/python_kaldi_features)
+commit: fc1bd6240c2008412ab64dc25045cd872f5e126c
+ref: https://zhuanlan.zhihu.com/p/55371926
--- a/third_party/python_kaldi_features/LICENSE
+++ b/third_party/python_kaldi_features/LICENSE
+The MIT License (MIT)
+
+Copyright (c) 2013 James Lyons
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+the Software, and to permit persons to whom the Software is furnished to do so,
+subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
--- a/third_party/python_kaldi_features/MANIFEST
+++ b/third_party/python_kaldi_features/MANIFEST
+# file GENERATED by distutils, do NOT edit
+setup.py
+python_speech_features\__init__.py
+python_speech_features\base.py
+python_speech_features\sigproc.py
--- a/third_party/python_kaldi_features/README.rst
+++ b/third_party/python_kaldi_features/README.rst
+
+
+forked from `<https://github.com/jameslyons/python_speech_features>`_
+
+See the README in that repository for usage.
+
+It has been modified to produce the same results as Kaldi's compute-mfcc-feats and compute-fbank-feats commands (check their default parameters first).
+ 
+-------------------------------
+
+The compute-mfcc-feats pipeline:
+
+src/featbin/Compute-mfcc-feats.cc
+    
+    Mfcc mfcc(mfcc_opts)  --> src/feat/Feature-mfcc.h
+    
+                                 struct MfccOptions
+                                 
+                                 typedef OfflineFeatureTpl<MfccComputer> Mfcc --> src/feat/Feature-common.h
+           
+                                 MfccComputer()  --> src/feat/Feature-mfcc.cc
+                                 
+                                                         ComputeDctMatrix()  --> src/matrix/Matrix-functions.cc
+                                                         
+                                                         ComputeLifterCoeffs()  --> src/feat/Mel-computations.cc
+  
+    
+    for each utterance:
+    mfcc.ComputeFeatures()
+
+src/feat/Feature-common-inl.h
+
+    OfflineFeatureTpl<F>::ComputeFeatures()
+    
+        Compute()
+        
+            ExtractWindow()  --> src/feat/Feature-window.cc
+                                     
+                                     ProcessWindow()
+                                         
+                                         Dither, remove_dc_offset, log_energy_pre_window, Preemphasize, window
+            
+            computer_.Compute() --> src/feat/Feature-mfcc.cc
+               
+                                      MfccComputer::Compute()
+                                      
+                                          const MelBanks &mel_banks --> Mel-computations.cc
+                                          
+                                          srfft_
+                                        
+                                          ComputePowerSpectrum()
+                                          
+                                          mel_banks.Compute()
+                                          
+                                          mel_energies_.ApplyLog()
+                                          
+                                          dct, cepstral_lifter
+                                          
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/__init__.py
+from .base import *
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/base.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
+         ceplifter=22,useEnergy=True,wintype='povey'):
+    """Compute MFCC features from an audio signal.
+
+    The filterbank energies from `fbank` are log-compressed, transformed with a
+    type-2 orthonormal DCT, truncated to `numcep` coefficients and liftered.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstrum to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 23.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: dither setting, forwarded unchanged to `fbank` (and from there to `sigproc.framesig`). Default is 1.0.
+    :param remove_dc_offset: DC-offset removal flag, forwarded unchanged to `fbank` (and from there to `sigproc.framesig`). Default is True.
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy. Default is True.
+    :param wintype: name of the analysis window, forwarded unchanged to `fbank`. Default is 'povey'.
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither,remove_dc_offset,preemph,wintype)
+    feat = numpy.log(feat)
+    feat = dct(feat, type=2, axis=1, norm='ortho')[:,:numcep]
+    feat = lifter(feat,ceplifter)
+    if useEnergy: feat[:,0] = numpy.log(energy) # replace first cepstral coefficient with log of frame energy
+    return feat
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, 
+          wintype='hamming'):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: dither setting, forwarded unchanged to `sigproc.framesig`. Default is 1.0.
+    :param remove_dc_offset: DC-offset removal flag, forwarded unchanged to `sigproc.framesig`. Default is True.
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded unchanged to `sigproc.framesig`. Default is 'hamming'.
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
+        second return value is the energy in each frame, computed as the sum of squares of the raw frames returned by `sigproc.framesig`.
+        Zeros in either output are replaced by float epsilon so callers can take the log safely.
+    """
+    highfreq= highfreq or samplerate/2
+    frames,raw_frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate, dither, preemph, remove_dc_offset, wintype)
+    pspec = sigproc.powspec(frames,nfft) # nearly the same until this part
+    energy = numpy.sum(raw_frames**2,1) # this stores the raw energy in each frame
+    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
+    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log
+
+    return feat,energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
+    """Compute log Mel-filterbank energy features from an audio signal.
+
+    Calls `fbank` with the same arguments and returns the natural log of its
+    filterbank energies; the per-frame energy returned by `fbank` is discarded.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: dither setting, forwarded unchanged to `fbank`. Default is 1.0.
+    :param remove_dc_offset: DC-offset removal flag, forwarded unchanged to `fbank`. Default is True.
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded unchanged to `fbank`. Default is 'hamming'.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    feat,energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,dither, remove_dc_offset,preemph,wintype)
+    return numpy.log(feat)
+
+def hz2mel(hz):
+    """Convert a value in Hertz to Mels
+
+    Uses the natural-log form ``mel = 1127 * ln(1 + hz / 700)``.
+
+    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
+    """
+    return 1127 * numpy.log(1+hz/700.0)
+
+
+def mel2hz(mel):
+    """Map a Mel value back to Hertz using ``700 * (exp(mel/1127) - 1)``.
+
+    :param mel: a value in Mels. A numpy array is converted element-wise.
+    :returns: the value in Hertz. An array input gives an array of the same size.
+    """
+    scaled = mel / 1127.0
+    return 700 * (numpy.exp(scaled) - 1)
+
+def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Compute a Mel-filterbank whose triangular filters are linear on the Mel axis.
+
+    The filters are stored in the rows, the columns correspond to fft bins. The filters
+    are returned as an array of size nfilt * (nfft/2 + 1). Only bins 0 .. nfft/2 - 1 are
+    evaluated, so the last column (bin nfft/2) is always zero.
+
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    """
+    highfreq= highfreq or samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # band edges of the whole filterbank, expressed in mels
+    lowmel = hz2mel(lowfreq)
+    highmel = hz2mel(highfreq)
+
+    # check kaldi/src/feat/Mel-computations.h
+    fbank = numpy.zeros([nfilt,nfft//2+1])
+    mel_freq_delta = (highmel-lowmel)/(nfilt+1)
+    # The mel value of an fft bin does not depend on the filter index, so it is
+    # computed once per bin here instead of once per (filter, bin) pair.
+    bin_mels = [hz2mel(i*samplerate/nfft) for i in range(0,nfft//2)]
+    for j in range(0,nfilt):
+        # consecutive filters overlap by half: each spans two mel steps
+        leftmel = lowmel+j*mel_freq_delta
+        centermel = lowmel+(j+1)*mel_freq_delta
+        rightmel = lowmel+(j+2)*mel_freq_delta
+        for i,mel in enumerate(bin_mels):
+            if leftmel < mel < rightmel:
+                if mel < centermel:
+                    # rising edge of the triangle
+                    fbank[j,i]=(mel-leftmel)/(centermel-leftmel)
+                else:
+                    # falling edge of the triangle
+                    fbank[j,i]=(rightmel-mel)/(rightmel-centermel)
+    return fbank
+
+def lifter(cepstra, L=22):
+    """Scale the columns of a cepstra matrix with a sinusoidal lifter.
+
+    Coefficient ``n`` of every frame is multiplied by ``1 + (L/2)*sin(pi*n/L)``.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    :returns: the liftered cepstra, or ``cepstra`` itself when the lifter is disabled.
+    """
+    if not L > 0:
+        # values of L <= 0, do nothing
+        return cepstra
+    _, ncoeff = numpy.shape(cepstra)
+    weights = 1 + (L/2.)*numpy.sin(numpy.pi*numpy.arange(ncoeff)/L)
+    return weights*cepstra
+
+def delta(feat, N):
+    """Compute delta (regression) features over a window of N frames on each side.
+
+    The sequence is extended by repeating its first and last frame N times, so the
+    output has as many rows as the input.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    :raises ValueError: if N < 1.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    num_frames = len(feat)
+    denominator = 2 * sum(k * k for k in range(1, N+1))
+    delta_feat = numpy.empty_like(feat)
+    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')
+    taps = numpy.arange(-N, N+1)   # regression weights -N .. N
+    for t in range(num_frames):
+        # rows t .. t+2N of the padded array are frames t-N .. t+N of the input
+        window = padded[t : t+2*N+1]
+        delta_feat[t] = numpy.dot(taps, window) / denominator
+    return delta_feat
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/base_orig.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
+         winfunc=lambda x:numpy.ones((x,))):
+    """Compute MFCC features from an audio signal.
+
+    Pipeline: ``fbank`` energies -> natural log -> orthonormal type-2 DCT along the
+    filter axis -> keep the first ``numcep`` coefficients -> ``lifter``.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstrum to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    fb_energies, frame_energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph,winfunc)
+    log_energies = numpy.log(fb_energies)
+    cepstra = dct(log_energies, type=2, axis=1, norm='ortho')[:,:numcep]
+    cepstra = lifter(cepstra,ceplifter)
+    if appendEnergy:
+        # replace first cepstral coefficient with log of frame energy
+        cepstra[:,0] = numpy.log(frame_energy)
+    return cepstra
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+          winfunc=lambda x:numpy.ones((x,))):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    The signal is pre-emphasised, framed and windowed, turned into a power spectrum and
+    projected onto the Mel filterbank. Exact zeros in either output are replaced by the
+    float machine epsilon so that a later logarithm stays finite.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
+        second return value is the energy in each frame (sum of the frame's power spectrum)
+    """
+    highfreq= highfreq or samplerate/2
+    eps = numpy.finfo(float).eps
+
+    emphasized = sigproc.preemphasis(signal,preemph)
+    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
+    power = sigproc.powspec(frames,nfft)
+
+    # total energy per frame; zeros would break a later log
+    total = numpy.sum(power,1)
+    energy = numpy.where(total == 0,eps,total)
+
+    # filterbank energies; zeros would break a later log
+    filters = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    banded = numpy.dot(power,filters.T)
+    feat = numpy.where(banded == 0,eps,banded)
+
+    return feat,energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
+    """Return the natural logarithm of the Mel-filterbank energies of an audio signal.
+
+    The arguments are forwarded positionally to ``fbank``, which therefore runs with its
+    default window function. The per-frame energy it returns is discarded.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    fb_energies, _frame_energy = fbank(signal,samplerate,winlen,winstep,nfilt,nfft,lowfreq,highfreq,preemph)
+    return numpy.log(fb_energies)
+
+def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+        winfunc=lambda x:numpy.ones((x,))):
+    """Compute Spectral Subband Centroid features from an audio signal.
+
+    For every frame and filter the result is the filter-weighted sum of
+    (power * frequency) divided by the filter-weighted sum of power.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    highfreq= highfreq or samplerate/2
+    emphasized = sigproc.preemphasis(signal,preemph)
+    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
+    pspec = sigproc.powspec(frames,nfft)
+    # if things are all zeros we get problems (division by zero below)
+    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec)
+
+    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
+    band_energy = numpy.dot(pspec,fb.T) # the filterbank energies (denominator)
+
+    # one frequency value per spectrum bin, repeated for every frame
+    num_frames = numpy.size(pspec,0)
+    num_bins = numpy.size(pspec,1)
+    bin_freqs = numpy.linspace(1,samplerate/2,num_bins)
+    R = numpy.tile(bin_freqs,(num_frames,1))
+
+    weighted = numpy.dot(pspec*R,fb.T)
+    return weighted / band_energy
+
+def hz2mel(hz):
+    """Map a frequency in Hertz onto the Mel scale using ``2595 * log10(1 + hz/700)``.
+
+    :param hz: a value in Hz. A numpy array is converted element-wise.
+    :returns: the value in Mels. An array input gives an array of the same size.
+    """
+    ratio = hz/700.
+    return 2595 * numpy.log10(1 + ratio)
+
+def mel2hz(mel):
+    """Map a Mel value back to Hertz using ``700 * (10**(mel/2595) - 1)``.
+
+    :param mel: a value in Mels. A numpy array is converted element-wise.
+    :returns: the value in Hertz. An array input gives an array of the same size.
+    """
+    exponent = mel/2595.0
+    return 700*(10**exponent - 1)
+
+def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
+    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
+
+    Filter edges are spaced evenly in mels, converted back to Hz and floored to fft bin
+    numbers; each filter rises linearly from its left edge to its centre and falls
+    linearly to its right edge.
+
+    :param nfilt: the number of filters in the filterbank, default 20.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    """
+    highfreq= highfreq or samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # nfilt+2 edge points evenly spaced in mels
+    lowmel = hz2mel(lowfreq)
+    highmel = hz2mel(highfreq)
+    melpoints = numpy.linspace(lowmel,highmel,nfilt+2)
+    # the edges are in Hz after mel2hz, but the filters are indexed by fft bin,
+    # so convert from Hz to fft bin number
+    edges = numpy.floor((nfft+1)*mel2hz(melpoints)/samplerate)
+
+    fbank = numpy.zeros([nfilt,nfft//2+1])
+    for j in range(0,nfilt):
+        left, center, right = edges[j], edges[j+1], edges[j+2]
+        # rising slope: left edge up to (but excluding) the centre bin
+        for i in range(int(left), int(center)):
+            fbank[j,i] = (i - left) / (center - left)
+        # falling slope: centre bin up to (but excluding) the right edge
+        for i in range(int(center), int(right)):
+            fbank[j,i] = (right - i) / (right - center)
+    return fbank
+
+def lifter(cepstra, L=22):
+    """Apply a sinusoidal cepstral lifter to a matrix of cepstra.
+
+    Column ``n`` is multiplied by ``1 + (L/2)*sin(pi*n/L)``.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    :returns: the liftered cepstra, or ``cepstra`` itself when the lifter is disabled.
+    """
+    if not L > 0:
+        # values of L <= 0, do nothing
+        return cepstra
+    _, ncoeff = numpy.shape(cepstra)
+    coeff_index = numpy.arange(ncoeff)
+    gain = 1 + (L/2.)*numpy.sin(numpy.pi*coeff_index/L)
+    return gain*cepstra
+
+def delta(feat, N):
+    """Compute delta (regression) features over a window of N frames on each side.
+
+    The sequence is extended by repeating its first and last frame N times, so the
+    output has as many rows as the input.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    :raises ValueError: if N < 1.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    num_frames = len(feat)
+    denominator = 2 * sum(k * k for k in range(1, N+1))
+    delta_feat = numpy.empty_like(feat)
+    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')
+    taps = numpy.arange(-N, N+1)   # regression weights -N .. N
+    for t in range(num_frames):
+        # rows t .. t+2N of the padded array are frames t-N .. t+N of the input
+        window = padded[t : t+2*N+1]
+        delta_feat[t] = numpy.dot(taps, window) / denominator
+    return delta_feat
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    """Round ``number`` to an int using ``decimal.ROUND_HALF_UP`` (ties go away from zero)."""
+    exact = decimal.Decimal(number)
+    rounded = exact.quantize(decimal.Decimal('1'), rounding=decimal.ROUND_HALF_UP)
+    return int(rounded)
+
+
+def rolling_window(a, window, step=1):
+    """Return every ``step``-th length-``window`` sliding window over the last axis of ``a``.
+
+    The windows are built with ``as_strided`` from ``a``'s own strides.
+    """
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    num_windows = a.shape[-1] - window + 1
+    view_shape = a.shape[:-1] + (num_windows, window)
+    view_strides = a.strides + (a.strides[-1],)
+    windows = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
+    return windows[::step]
+
+
+def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
+    """Frame a signal into overlapping frames and condition every frame.
+
+    Each frame is dithered, optionally has its mean removed, is pre-emphasised and is
+    finally multiplied by the analysis window. Samples at the end of the signal that do
+    not fill a whole frame are dropped; the signal is not zero-padded.
+    NOTE(review): a signal shorter than ``frame_len`` is still treated as one frame but is
+    not padded here - behaviour for that case should be confirmed against the callers.
+
+    :param sig: the audio signal to frame.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param dither: scale of the Gaussian noise added to every frame. Default is 1.0.
+    :param preemph: preemphasis coefficient applied per frame. Default is 0.97.
+    :param remove_dc_offset: if true, subtract the mean of each frame after dithering. Default is True.
+    :param wintype: 'povey' selects a Hann window raised to the power 0.85; any other value selects a Hamming window.
+    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
+    :returns: 2 values. The first is the array of windowed frames, size NUMFRAMES by frame_len. The second holds
+        the same frames after dithering and DC removal but before preemphasis and windowing.
+    """
+    slen = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+    if slen <= frame_len:
+        numframes = 1
+    else:
+        numframes = 1 + (( slen - frame_len) // frame_step)
+
+    # check kaldi/src/feat/feature-window.h
+    padsignal = sig[:(numframes-1)*frame_step+frame_len]
+    # compare by value: an identity test against a string literal is unreliable
+    if wintype == 'povey':
+        win = numpy.empty(frame_len)
+        for i in range(frame_len):
+            win[i] = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*i))**0.85
+    else: # the hamming window
+        win = numpy.hamming(frame_len)
+
+    if stride_trick:
+        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
+    else:
+        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
+            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+        indices = numpy.array(indices, dtype=numpy.int32)
+        frames = padsignal[indices]
+        win = numpy.tile(win, (numframes, 1))
+
+    # astype copies, so the in-place per-frame edits below never touch the input signal
+    frames = frames.astype(numpy.float32)
+    raw_frames = numpy.zeros(frames.shape)
+    for frm in range(frames.shape[0]):
+        frames[frm,:] = do_dither(frames[frm,:], dither)        # dither
+        if remove_dc_offset:
+            frames[frm,:] = do_remove_dc_offset(frames[frm,:])      # remove dc offset
+        # snapshot taken before preemphasis and windowing
+        raw_frames[frm,:] = frames[frm,:]
+        frames[frm,:] = do_preemphasis(frames[frm,:], preemph)    # preemphasize
+
+    return frames * win, raw_frames
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Does overlap-add procedure to undo the action of framesig.
+
+    Frames are summed at their original sample positions and the sum is divided by the
+    accumulated window values at each position.
+
+    :param frames: the array of frames.
+    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :returns: a 1-D signal.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    numframes = numpy.shape(frames)[0]
+    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    # indices[i, k] is the position in the output signal of sample k of frame i
+    within_frame = numpy.arange(0, frame_len)
+    frame_starts = numpy.arange(0, numframes * frame_step, frame_step)
+    indices = numpy.tile(within_frame, (numframes, 1)) + numpy.tile(frame_starts, (frame_len, 1)).T
+    indices = numpy.array(indices, dtype=numpy.int32)
+    padlen = (numframes - 1) * frame_step + frame_len
+
+    if siglen <= 0:
+        siglen = padlen
+
+    rec_signal = numpy.zeros((padlen,))
+    window_correction = numpy.zeros((padlen,))
+    win = winfunc(frame_len)
+
+    for i in range(0, numframes):
+        positions = indices[i, :]
+        # add a little bit so it is never zero
+        window_correction[positions] = window_correction[positions] + win + 1e-15
+        rec_signal[positions] = rec_signal[positions] + frames[i, :]
+
+    rec_signal = rec_signal / window_correction
+    return rec_signal[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    A warning is logged when the frames are longer than NFFT, because ``rfft`` then
+    truncates every frame to its first NFFT samples.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
+    """
+    frame_len = numpy.shape(frames)[1]
+    if frame_len > NFFT:
+        # logging.warn is a deprecated alias that newer Python versions no longer provide
+        logging.warning(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            frame_len, NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    The result is the squared magnitude spectrum; no 1/NFFT scaling is applied.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
+    """
+    magnitude = magspec(frames, NFFT)
+    return numpy.square(magnitude)
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum (in dB, ``10*log10``) of each frame in frames.
+
+    Power values at or below 1e-30 are raised to 1e-30 before the logarithm is taken.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
+    """
+    power = powspec(frames, NFFT)
+    power[power <= 1e-30] = 1e-30   # floor, so log10 never sees zero
+    log_power = 10 * numpy.log10(power)
+    return log_power - numpy.max(log_power) if norm else log_power
+
+def do_dither(signal, dither_value=1.0):
+    """Add Gaussian noise scaled by ``dither_value`` to ``signal`` in place and return it."""
+    noise = numpy.random.normal(size=signal.shape)
+    signal += noise * dither_value
+    return signal
+    
+def do_remove_dc_offset(signal):
+    """Subtract the mean of ``signal`` from it in place and return it."""
+    dc_level = numpy.mean(signal)
+    signal -= dc_level
+    return signal
+
+def do_preemphasis(signal, coeff=0.97):
+    """perform preemphasis on the input signal.
+
+    Every sample has ``coeff`` times its predecessor subtracted; the first sample, which
+    has no predecessor, is scaled by ``(1 - coeff)`` instead.
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.97.
+    :returns: the filtered signal.
+    """
+    first = (1-coeff)*signal[0]
+    rest = signal[1:] - coeff * signal[:-1]
+    return numpy.append(first, rest)
--- a/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py
+++ b/third_party/python_kaldi_features/build/lib/python_speech_features/sigproc_orig.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    """Round ``number`` to an int using ``decimal.ROUND_HALF_UP`` (ties go away from zero)."""
+    one = decimal.Decimal('1')
+    quantized = decimal.Decimal(number).quantize(one, rounding=decimal.ROUND_HALF_UP)
+    return int(quantized)
+
+
+def rolling_window(a, window, step=1):
+    """Return every ``step``-th length-``window`` sliding window over the last axis of ``a``.
+
+    The windows are built with ``as_strided`` from ``a``'s own strides.
+    """
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    leading_dims = a.shape[:-1]
+    window_count = a.shape[-1] - window + 1
+    last_stride = a.strides[-1]
+    all_windows = numpy.lib.stride_tricks.as_strided(
+        a, shape=leading_dims + (window_count, window), strides=a.strides + (last_stride,))
+    return all_windows[::step]
+
+
+def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
+    """Frame a signal into overlapping frames.
+
+    The signal is zero-padded at the end so that the last frame is complete, and every
+    frame is multiplied by the analysis window.
+
+    :param sig: the audio signal to frame.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
+    :returns: an array of frames. Size is NUMFRAMES by frame_len.
+    """
+    slen = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+    # a signal no longer than one frame yields exactly one (padded) frame
+    numframes = 1 if slen <= frame_len else 1 + int(math.ceil((1.0 * slen - frame_len) / frame_step))
+
+    padlen = int((numframes - 1) * frame_step + frame_len)
+    padding = numpy.zeros((padlen - slen,))
+    padsignal = numpy.concatenate((sig, padding))
+
+    if stride_trick:
+        win = winfunc(frame_len)
+        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
+        return frames * win
+
+    # explicit gather: indices[i, k] is the sample position of element k of frame i
+    within_frame = numpy.arange(0, frame_len)
+    frame_starts = numpy.arange(0, numframes * frame_step, frame_step)
+    indices = numpy.tile(within_frame, (numframes, 1)) + numpy.tile(frame_starts, (frame_len, 1)).T
+    indices = numpy.array(indices, dtype=numpy.int32)
+    frames = padsignal[indices]
+    win = numpy.tile(winfunc(frame_len), (numframes, 1))
+    return frames * win
+
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Does overlap-add procedure to undo the action of framesig.
+
+    Frames are summed at their original sample positions and the sum is divided by the
+    accumulated window values at each position.
+
+    :param frames: the array of frames.
+    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :returns: a 1-D signal.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    numframes = numpy.shape(frames)[0]
+    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    # indices[i, k] is the position in the output signal of sample k of frame i
+    sample_offsets = numpy.arange(0, frame_len)
+    start_offsets = numpy.arange(0, numframes * frame_step, frame_step)
+    indices = numpy.tile(sample_offsets, (numframes, 1)) + numpy.tile(start_offsets, (frame_len, 1)).T
+    indices = numpy.array(indices, dtype=numpy.int32)
+    padlen = (numframes - 1) * frame_step + frame_len
+
+    if siglen <= 0:
+        siglen = padlen
+
+    rec_signal = numpy.zeros((padlen,))
+    window_correction = numpy.zeros((padlen,))
+    win = winfunc(frame_len)
+
+    for i in range(0, numframes):
+        where = indices[i, :]
+        # add a little bit so it is never zero
+        window_correction[where] = window_correction[where] + win + 1e-15
+        rec_signal[where] = rec_signal[where] + frames[i, :]
+
+    rec_signal = rec_signal / window_correction
+    return rec_signal[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    A warning is logged when the frames are longer than NFFT, because ``rfft`` then
+    truncates every frame to its first NFFT samples.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
+    """
+    frame_len = numpy.shape(frames)[1]
+    if frame_len > NFFT:
+        # logging.warn is a deprecated alias that newer Python versions no longer provide
+        logging.warning(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            frame_len, NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    The result is the squared magnitude spectrum scaled by 1/NFFT.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
+    """
+    scale = 1.0 / NFFT
+    squared_magnitude = numpy.square(magspec(frames, NFFT))
+    return scale * squared_magnitude
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum (in dB, ``10*log10``) of each frame in frames.
+
+    Power values at or below 1e-30 are raised to 1e-30 before the logarithm is taken.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
+    """
+    power = powspec(frames, NFFT)
+    power[power <= 1e-30] = 1e-30   # floor, so log10 never sees zero
+    log_power = 10 * numpy.log10(power)
+    if not norm:
+        return log_power
+    return log_power - numpy.max(log_power)
+
+
+def preemphasis(signal, coeff=0.95):
+    """perform preemphasis on the input signal.
+
+    Every sample has ``coeff`` times its predecessor subtracted; the first sample, which
+    has no predecessor, is kept unchanged.
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
+    :returns: the filtered signal.
+    """
+    first = signal[0]
+    differenced = signal[1:] - coeff * signal[:-1]
+    return numpy.append(first, differenced)
--- a/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg
+++ b/third_party/python_kaldi_features/dist/python_speech_features-0.6-py3.7.egg
--- a/third_party/python_kaldi_features/docs/Makefile
+++ b/third_party/python_kaldi_features/docs/Makefile
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+# SPHINXOPTS:  extra options appended to every sphinx-build call (empty by default).
+# SPHINXBUILD: the sphinx-build executable to run.
+# PAPER:       selects one of the PAPEROPT_<name> settings below (a4 or letter).
+# BUILDDIR:    directory under which every builder writes its output.
+SPHINXOPTS    =
+SPHINXBUILD   = sphinx-build
+PAPER         =
+BUILDDIR      = build
+
+# Internal variables.
+# ALLSPHINXOPTS is the option string shared by all targets: the doctree cache
+# location, the paper-size option picked through $(PAPER), the user options and
+# the "source" directory that holds the documentation sources.
+PAPEROPT_a4     = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS   = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) source
+
+# None of the targets below names a real file.
+.PHONY: help clean html dirhtml pickle json htmlhelp qthelp latex changes linkcheck doctest
+
+# Print the list of available targets (first target, so also the default goal).
+help:
+	@echo "Please use \`make <target>' where <target> is one of"
+	@echo "  html      to make standalone HTML files"
+	@echo "  dirhtml   to make HTML files named index.html in directories"
+	@echo "  pickle    to make pickle files"
+	@echo "  json      to make JSON files"
+	@echo "  htmlhelp  to make HTML files and a HTML help project"
+	@echo "  qthelp    to make HTML files and a qthelp project"
+	@echo "  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+	@echo "  changes   to make an overview of all changed/added/deprecated items"
+	@echo "  linkcheck to check all external links for integrity"
+	@echo "  doctest   to run all doctests embedded in the documentation (if enabled)"
+
+# Remove everything under $(BUILDDIR); the leading "-" ignores rm failures.
+clean:
+	-rm -rf $(BUILDDIR)/*
+
+# Each target below runs sphinx-build with the builder of the same name and
+# writes into $(BUILDDIR)/<builder>.
+html:
+	$(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+	$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+	@echo
+	@echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+pickle:
+	$(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+	@echo
+	@echo "Build finished; now you can process the pickle files."
+
+json:
+	$(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+	@echo
+	@echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+	$(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+	@echo
+	@echo "Build finished; now you can run HTML Help Workshop with the" \
+	      ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+	$(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+	@echo
+	@echo "Build finished; now you can run "qcollectiongenerator" with the" \
+	      ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+	@echo "# qcollectiongenerator $(BUILDDIR)/qthelp/python_speech_features.qhcp"
+	@echo "To view the help file:"
+	@echo "# assistant -collectionFile $(BUILDDIR)/qthelp/python_speech_features.qhc"
+
+latex:
+	$(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+	@echo
+	@echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+	@echo "Run \`make all-pdf' or \`make all-ps' in that directory to" \
+	      "run these through (pdf)latex."
+
+changes:
+	$(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+	@echo
+	@echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+	$(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+	@echo
+	@echo "Link check complete; look for any errors in the above output " \
+	      "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+	$(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+	@echo "Testing of doctests in the sources finished, look at the " \
+	      "results in $(BUILDDIR)/doctest/output.txt."
--- a/third_party/python_kaldi_features/docs/make.bat
+++ b/third_party/python_kaldi_features/docs/make.bat
+@ECHO OFF
+
+REM Command file for Sphinx documentation
+
+set SPHINXBUILD=sphinx-build
+set BUILDDIR=build
+set ALLSPHINXOPTS=-d %BUILDDIR%/doctrees %SPHINXOPTS% source
+if NOT "%PAPER%" == "" (
+	set ALLSPHINXOPTS=-D latex_paper_size=%PAPER% %ALLSPHINXOPTS%
+)
+
+if "%1" == "" goto help
+
+if "%1" == "help" (
+	:help
+	echo.Please use `make ^<target^>` where ^<target^> is one of
+	echo.  html      to make standalone HTML files
+	echo.  dirhtml   to make HTML files named index.html in directories
+	echo.  pickle    to make pickle files
+	echo.  json      to make JSON files
+	echo.  htmlhelp  to make HTML files and a HTML help project
+	echo.  qthelp    to make HTML files and a qthelp project
+	echo.  latex     to make LaTeX files, you can set PAPER=a4 or PAPER=letter
+	echo.  changes   to make an overview over all changed/added/deprecated items
+	echo.  linkcheck to check all external links for integrity
+	echo.  doctest   to run all doctests embedded in the documentation if enabled
+	goto end
+)
+
+if "%1" == "clean" (
+	for /d %%i in (%BUILDDIR%\*) do rmdir /q /s %%i
+	del /q /s %BUILDDIR%\*
+	goto end
+)
+
+if "%1" == "html" (
+	%SPHINXBUILD% -b html %ALLSPHINXOPTS% %BUILDDIR%/html
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/html.
+	goto end
+)
+
+if "%1" == "dirhtml" (
+	%SPHINXBUILD% -b dirhtml %ALLSPHINXOPTS% %BUILDDIR%/dirhtml
+	echo.
+	echo.Build finished. The HTML pages are in %BUILDDIR%/dirhtml.
+	goto end
+)
+
+if "%1" == "pickle" (
+	%SPHINXBUILD% -b pickle %ALLSPHINXOPTS% %BUILDDIR%/pickle
+	echo.
+	echo.Build finished; now you can process the pickle files.
+	goto end
+)
+
+if "%1" == "json" (
+	%SPHINXBUILD% -b json %ALLSPHINXOPTS% %BUILDDIR%/json
+	echo.
+	echo.Build finished; now you can process the JSON files.
+	goto end
+)
+
+if "%1" == "htmlhelp" (
+	%SPHINXBUILD% -b htmlhelp %ALLSPHINXOPTS% %BUILDDIR%/htmlhelp
+	echo.
+	echo.Build finished; now you can run HTML Help Workshop with the ^
+.hhp project file in %BUILDDIR%/htmlhelp.
+	goto end
+)
+
+REM qthelp: build the Qt help project and print the follow-up commands.
+REM The compiled collection printed for "assistant" is the .qhc file
+REM (the same name the Makefile target prints).
+if "%1" == "qthelp" (
+	%SPHINXBUILD% -b qthelp %ALLSPHINXOPTS% %BUILDDIR%/qthelp
+	echo.
+	echo.Build finished; now you can run "qcollectiongenerator" with the ^
+.qhcp project file in %BUILDDIR%/qthelp, like this:
+	echo.^> qcollectiongenerator %BUILDDIR%\qthelp\python_speech_features.qhcp
+	echo.To view the help file:
+	echo.^> assistant -collectionFile %BUILDDIR%\qthelp\python_speech_features.qhc
+	goto end
+)
+
+if "%1" == "latex" (
+	%SPHINXBUILD% -b latex %ALLSPHINXOPTS% %BUILDDIR%/latex
+	echo.
+	echo.Build finished; the LaTeX files are in %BUILDDIR%/latex.
+	goto end
+)
+
+if "%1" == "changes" (
+	%SPHINXBUILD% -b changes %ALLSPHINXOPTS% %BUILDDIR%/changes
+	echo.
+	echo.The overview file is in %BUILDDIR%/changes.
+	goto end
+)
+
+if "%1" == "linkcheck" (
+	%SPHINXBUILD% -b linkcheck %ALLSPHINXOPTS% %BUILDDIR%/linkcheck
+	echo.
+	echo.Link check complete; look for any errors in the above output ^
+or in %BUILDDIR%/linkcheck/output.txt.
+	goto end
+)
+
+if "%1" == "doctest" (
+	%SPHINXBUILD% -b doctest %ALLSPHINXOPTS% %BUILDDIR%/doctest
+	echo.
+	echo.Testing of doctests in the sources finished, look at the ^
+results in %BUILDDIR%/doctest/output.txt.
+	goto end
+)
+
+:end
--- a/third_party/python_kaldi_features/docs/source/conf.py
+++ b/third_party/python_kaldi_features/docs/source/conf.py
+# -*- coding: utf-8 -*-
+#
+# python_speech_features documentation build configuration file, created by
+# sphinx-quickstart on Thu Oct 31 16:49:58 2013.
+#
+# This file is execfile()d with the current directory set to its containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import os
+import sys
+
+try:
+    # Python 3.3+ ships mock in the standard library.
+    from unittest import mock
+except ImportError:
+    # Python 2 fallback: the separately installed "mock" package.
+    import mock
+
+# Replace the numeric dependencies with Mock objects in sys.modules so that
+# importing the package does not require them to be installed
+# (presumably for autodoc on a docs build host -- confirm).
+MOCK_MODULES = ['numpy', 'scipy', 'scipy.fftpack']
+for mod_name in MOCK_MODULES:
+    sys.modules[mod_name] = mock.Mock()
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0,os.path.abspath('../..'))
+
+# -- General configuration -----------------------------------------------------
+
+# Add any Sphinx extension module names here, as strings. They can be extensions
+# coming with Sphinx (named 'sphinx.ext.*') or your custom ones.
+extensions = ['sphinx.ext.autodoc']
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix of source filenames.
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'python_speech_features'
+copyright = u'2013, James Lyons'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.1.0'
+# The full version, including alpha/beta/rc tags.
+release = '0.1.0'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of documents that shouldn't be included in the build.
+#unused_docs = []
+
+# List of directories, relative to source directory, that shouldn't be searched
+# for source files.
+exclude_trees = []
+
+# The reST default role (used for this markup: `text`) to use for all documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+
+# -- Options for HTML output ---------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages.  Major themes that come with
+# Sphinx are currently 'default' and 'sphinxdoc'.
+html_theme = 'default'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further.  For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents.  If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar.  Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs.  This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+#html_use_smartypants = True
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_use_modindex = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it.  The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# If nonempty, this is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = ''
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'python_speech_featuresdoc'
+
+
+# -- Options for LaTeX output --------------------------------------------------
+
+# The paper size ('letter' or 'a4').
+#latex_paper_size = 'letter'
+
+# The font size ('10pt', '11pt' or '12pt').
+#latex_font_size = '10pt'
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title, author, documentclass [howto/manual]).
+latex_documents = [
+  ('index', 'python_speech_features.tex', u'python\\_speech\\_features Documentation',
+   u'James Lyons', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# Additional stuff for the LaTeX preamble.
+#latex_preamble = ''
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_use_modindex = True
+
+autodoc_member_order = 'bysource'
--- a/third_party/python_kaldi_features/docs/source/index.rst
+++ b/third_party/python_kaldi_features/docs/source/index.rst
+.. python_speech_features documentation master file, created by
+   sphinx-quickstart on Thu Oct 31 16:49:58 2013.
+   You can adapt this file completely to your liking, but it should at least
+   contain the root `toctree` directive.
+
+Welcome to python_speech_features's documentation!
+==================================================
+
+This library provides common speech features for ASR including MFCCs and filterbank energies.
+If you are not sure what MFCCs are and would like to know more, have a look at this MFCC tutorial:
+http://www.practicalcryptography.com/miscellaneous/machine-learning/guide-mel-frequency-cepstral-coefficients-mfccs/.
+
+You will need numpy and scipy to run these files. The code for this project is available at https://github.com/jameslyons/python_speech_features .
+
+Supported features:
+
+- :py:meth:`python_speech_features.mfcc` - Mel Frequency Cepstral Coefficients
+- :py:meth:`python_speech_features.fbank` - Filterbank Energies
+- :py:meth:`python_speech_features.logfbank` - Log Filterbank Energies
+- :py:meth:`python_speech_features.ssc` - Spectral Subband Centroids
+
+To use MFCC features::
+
+    from python_speech_features import mfcc
+    from python_speech_features import logfbank
+    import scipy.io.wavfile as wav
+    
+    (rate,sig) = wav.read("file.wav")
+    mfcc_feat = mfcc(sig,rate)
+    fbank_feat = logfbank(sig,rate)
+    
+    print(fbank_feat[1:3,:])
+
+From here you can write the features to a file etc.
+
+Functions provided in python_speech_features module
+---------------------------------------------------
+   
+.. automodule:: python_speech_features.base
+    :members:
+    
+
+Functions provided in sigproc module
+------------------------------------
+.. automodule:: python_speech_features.sigproc
+    :members:
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
+
--- a/third_party/python_kaldi_features/english.wav
+++ b/third_party/python_kaldi_features/english.wav
--- a/third_party/python_kaldi_features/example.py
+++ b/third_party/python_kaldi_features/example.py
+#!/usr/bin/env python
+"""Example: compute Kaldi-style log-fbank and MFCC features for english.wav."""
+
+from python_speech_features import mfcc
+from python_speech_features import delta
+from python_speech_features import logfbank
+import scipy.io.wavfile as wav
+
+(rate,sig) = wav.read("english.wav")
+
+# Pass the sample rate read from the file; otherwise the feature functions
+# silently fall back to their 16000 Hz default.
+# note that generally nfilt=40 is used for speech recognition
+fbank_feat = logfbank(sig,samplerate=rate,nfilt=23,lowfreq=20,dither=0,wintype='povey')
+
+# the computed fbank coefficients of english.wav with dimension [110,23]
+# [ 12.2865	12.6906	13.1765	15.714	16.064	15.7553	16.5746	16.9205	16.6472	16.1302	16.4576	16.7326	16.8864	17.7215	18.88	19.1377	19.1495	18.6683	18.3886	20.3506	20.2772	18.8248	18.1899
+# 11.9198	13.146	14.7215	15.8642	17.4288	16.394	16.8238	16.1095	16.4297	16.6331	16.3163	16.5093	17.4981	18.3429	19.6555	19.6263	19.8435	19.0534	19.001	20.0287	19.7707	19.5852	19.1112
+# ...
+# ...
+# the same as the output of the kaldi command: compute-fbank-feats --dither=0.0
+
+
+mfcc_feat = mfcc(sig,samplerate=rate,dither=0,useEnergy=True,wintype='povey')
+
+# the computed mfcc coefficients of english.wav with dimension [110,13]
+# [ 17.1337	-23.3651	-7.41751	-7.73686	-21.3682	-8.93884	-3.70843	4.68346	-16.0676	12.782	-7.24054	8.25089	10.7292
+# 17.1692	-23.3028	-5.61872	-4.0075	-23.287	-20.6101	-5.51584	-6.15273	-14.4333	8.13052	-0.0345329	2.06274	-0.564298
+# ...
+# ...
+# the same as the output of the kaldi command: compute-mfcc-feats --dither=0.0
+
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/PKG-INFO
+Metadata-Version: 1.0
+Name: python-speech-features
+Version: 0.6
+Summary: Python Speech Feature extraction
+Home-page: https://github.com/jameslyons/python_speech_features
+Author: James Lyons
+Author-email: james.lyons0@gmail.com
+License: MIT
+Description: UNKNOWN
+Platform: UNKNOWN
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/SOURCES.txt
+README.rst
+setup.py
+python_speech_features/__init__.py
+python_speech_features/base.py
+python_speech_features/base_orig.py
+python_speech_features/sigproc.py
+python_speech_features/sigproc_orig.py
+python_speech_features.egg-info/PKG-INFO
+python_speech_features.egg-info/SOURCES.txt
+python_speech_features.egg-info/dependency_links.txt
+python_speech_features.egg-info/top_level.txt
+test/test_sigproc.py
\ No newline at end of file
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/dependency_links.txt
+
--- a/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt
+++ b/third_party/python_kaldi_features/python_speech_features.egg-info/top_level.txt
+python_speech_features
--- a/third_party/python_kaldi_features/python_speech_features/__init__.py
+++ b/third_party/python_kaldi_features/python_speech_features/__init__.py
+from .base import *
--- a/third_party/python_kaldi_features/python_speech_features/base.py
+++ b/third_party/python_kaldi_features/python_speech_features/base.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=23,nfft=512,lowfreq=20,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,
+         ceplifter=22,useEnergy=True,wintype='povey'):
+    """Compute MFCC features from an audio signal.
+
+    The log of the Mel-filterbank energies returned by :func:`fbank` is passed through an
+    orthonormal type-2 DCT, truncated to ``numcep`` coefficients and liftered.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstral coefficients to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 23.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 20.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: forwarded unchanged to :func:`fbank`. Default is 1.0.
+    :param remove_dc_offset: forwarded unchanged to :func:`fbank`. Default is True.
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param useEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the
+        frame energy returned by :func:`fbank`.
+    :param wintype: name of the analysis window, forwarded unchanged to :func:`fbank`. Default is 'povey'.
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    band_energies, frame_energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq,
+                                        highfreq, dither, remove_dc_offset, preemph, wintype)
+    cepstra = dct(numpy.log(band_energies), type=2, axis=1, norm='ortho')
+    cepstra = lifter(cepstra[:, :numcep], ceplifter)
+    if useEnergy:
+        # replace first cepstral coefficient with log of frame energy
+        cepstra[:, 0] = numpy.log(frame_energy)
+    return cepstra
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=0,highfreq=None,dither=1.0,remove_dc_offset=True, preemph=0.97, 
+          wintype='hamming'):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: forwarded to ``sigproc.framesig``. Default is 1.0.
+        (presumably the dithering strength -- confirm in sigproc)
+    :param remove_dc_offset: forwarded to ``sigproc.framesig``. Default is True.
+    :param preemph: preemphasis coefficient, forwarded to ``sigproc.framesig``. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded to ``sigproc.framesig``. Default is 'hamming'.
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds
+        1 feature vector. The second is the per-frame energy, computed as the sum of squares of the second
+        array returned by ``sigproc.framesig`` (presumably the frames before windowing -- confirm in sigproc).
+        Exact zeros in either result are replaced by the float epsilon so that a later log is finite.
+    """
+    tiny = numpy.finfo(float).eps
+    if not highfreq:
+        highfreq = samplerate/2
+
+    windowed, unwindowed = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate,
+                                            dither, preemph, remove_dc_offset, wintype)
+    power_spectrum = sigproc.powspec(windowed, nfft)
+
+    # raw energy of each frame; guard against log(0) downstream
+    frame_energy = numpy.sum(unwindowed**2, 1)
+    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)
+
+    # filterbank energies; guard against log(0) downstream
+    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
+    band_energies = numpy.dot(power_spectrum, filters.T)
+    band_energies = numpy.where(band_energies == 0, tiny, band_energies)
+
+    return band_energies, frame_energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=40,nfft=512,lowfreq=64,highfreq=None,dither=1.0,remove_dc_offset=True,preemph=0.97,wintype='hamming'):
+    """Compute log Mel-filterbank energy features from an audio signal.
+
+    Thin wrapper: calls :func:`fbank` with the same arguments and returns the natural log of
+    its filterbank energies (the per-frame energy is discarded).
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 40.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 64.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param dither: forwarded unchanged to :func:`fbank`. Default is 1.0.
+    :param remove_dc_offset: forwarded unchanged to :func:`fbank`. Default is True.
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param wintype: name of the analysis window, forwarded unchanged to :func:`fbank`. Default is 'hamming'.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    band_energies, _ = fbank(signal, samplerate, winlen, winstep, nfilt, nfft, lowfreq, highfreq,
+                             dither, remove_dc_offset, preemph, wintype)
+    return numpy.log(band_energies)
+
+def hz2mel(hz):
+    """Convert a value in Hertz to Mels using ``1127 * ln(1 + hz/700)``.
+
+    :param hz: a value in Hz. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Mels. If an array was passed in, an identical sized array is returned.
+    """
+    ratio = 1+hz/700.0
+    return 1127 * numpy.log(ratio)
+
+
+def mel2hz(mel):
+    """Convert a value in Mels to Hertz using ``700 * (exp(mel/1127) - 1)``.
+
+    :param mel: a value in Mels. This can also be a numpy array, conversion proceeds element-wise.
+    :returns: a value in Hertz. If an array was passed in, an identical sized array is returned.
+    """
+    growth = numpy.exp(mel/1127.0)
+    return 700 * (growth-1)
+
+def get_filterbanks(nfilt=26,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Compute a Mel-filterbank. The filters are stored in the rows, the columns correspond
+    to fft bins. The filters are returned as an array of size nfilt * (nfft/2 + 1)
+
+    Filter edges are spaced evenly on the mel scale between lowfreq and highfreq, and each
+    filter is triangular in the mel domain. Only bins 0 .. nfft/2 - 1 are assigned, so the
+    last column (bin nfft/2) is always zero.
+    NOTE(review): this appears to follow kaldi/src/feat/mel-computations -- confirm.
+
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    :raises AssertionError: if highfreq is greater than samplerate/2.
+    """
+    highfreq= highfreq or samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # compute points evenly spaced in mels
+    lowmel = hz2mel(lowfreq)
+    highmel = hz2mel(highfreq)
+    mel_freq_delta = (highmel-lowmel)/(nfilt+1)
+
+    # The mel value of each FFT bin does not depend on the filter, so it is
+    # computed once here instead of once per (filter, bin) pair.
+    bin_mels = [hz2mel(i*samplerate/nfft) for i in range(nfft//2)]
+
+    # check kaldi/src/feat/Mel-computations.h
+    filters = numpy.zeros([nfilt,nfft//2+1])
+    for j in range(nfilt):
+        leftmel = lowmel+j*mel_freq_delta
+        centermel = lowmel+(j+1)*mel_freq_delta
+        rightmel = lowmel+(j+2)*mel_freq_delta
+        for i, mel in enumerate(bin_mels):
+            if leftmel < mel < rightmel:
+                if mel < centermel:
+                    # rising slope
+                    filters[j,i] = (mel-leftmel)/(centermel-leftmel)
+                else:
+                    # falling slope (includes the peak at mel == centermel)
+                    filters[j,i] = (rightmel-mel)/(rightmel-centermel)
+    return filters
+
+def lifter(cepstra, L=22):
+    """Apply a cepstral lifter to the matrix of cepstra. This has the effect of increasing the
+    magnitude of the high frequency DCT coeffs.
+
+    Column ``n`` is scaled by ``1 + (L/2) * sin(pi * n / L)``.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    :returns: the liftered cepstra as a new array, or ``cepstra`` itself when the lifter is disabled.
+    """
+    if not L > 0:
+        # values of L <= 0, do nothing
+        return cepstra
+    _, ncoeff = numpy.shape(cepstra)
+    weights = 1 + (L/2.)*numpy.sin(numpy.pi*numpy.arange(ncoeff)/L)
+    return weights*cepstra
+
+def delta(feat, N):
+    """Compute delta features from a feature vector sequence.
+
+    Each output row is the regression slope over the surrounding 2*N+1 frames: the frames are
+    weighted by -N..N, summed, and divided by ``2 * sum(i**2 for i in 1..N)``. The sequence is
+    extended at both ends by repeating the first and last frame N times.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    :raises ValueError: if N is smaller than 1.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    num_frames = len(feat)
+    norm = 2 * sum(i**2 for i in range(1, N+1))
+    out = numpy.empty_like(feat)
+    extended = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')   # edge-padded copy of feat
+    weights = numpy.arange(-N, N+1)
+    for t in range(num_frames):
+        # rows t .. t+2N of the padded array are frames t-N .. t+N of the input
+        window = extended[t : t+2*N+1]
+        out[t] = numpy.dot(weights, window) / norm
+    return out
--- a/third_party/python_kaldi_features/python_speech_features/base_orig.py
+++ b/third_party/python_kaldi_features/python_speech_features/base_orig.py
+# calculate filterbank features. Provides e.g. fbank and mfcc features for use in ASR applications
+# Author: James Lyons 2012
+from __future__ import division
+import numpy
+from python_speech_features import sigproc
+from scipy.fftpack import dct
+
+def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
+         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True,
+         winfunc=lambda x:numpy.ones((x,))):
+    """Compute MFCC features from an audio signal.
+
+    The log of the Mel-filterbank energies returned by :func:`fbank` is passed through an
+    orthonormal type-2 DCT, truncated to ``numcep`` coefficients and liftered.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param numcep: the number of cepstral coefficients to return, default 13
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param ceplifter: apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.
+    :param appendEnergy: if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.
+    :param winfunc: the analysis window to apply to each frame. The default is an all-ones window. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.
+    """
+    band_energies, frame_energy = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
+                                        lowfreq, highfreq, preemph, winfunc)
+    cepstra = dct(numpy.log(band_energies), type=2, axis=1, norm='ortho')
+    cepstra = lifter(cepstra[:, :numcep], ceplifter)
+    if appendEnergy:
+        # replace first cepstral coefficient with log of frame energy
+        cepstra[:, 0] = numpy.log(frame_energy)
+    return cepstra
+
+def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+          winfunc=lambda x:numpy.ones((x,))):
+    """Compute Mel-filterbank energy features from an audio signal.
+
+    NOTE(review): ``sigproc.framesig`` is called here with four arguments and a window function,
+    while base.py in the same package calls it with seven arguments and a window name. Confirm
+    that this module is paired with the matching sigproc implementation.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. The default is an all-ones window. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds
+        1 feature vector. The second is the energy of each frame, computed as the sum of that frame's power
+        spectrum. Exact zeros in either result are replaced by the float epsilon so that a later log is finite.
+    """
+    tiny = numpy.finfo(float).eps
+    if not highfreq:
+        highfreq = samplerate/2
+
+    emphasized = sigproc.preemphasis(signal, preemph)
+    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
+    power_spectrum = sigproc.powspec(frames, nfft)
+
+    # total energy in each frame; guard against log(0) downstream
+    frame_energy = numpy.sum(power_spectrum, 1)
+    frame_energy = numpy.where(frame_energy == 0, tiny, frame_energy)
+
+    # filterbank energies; guard against log(0) downstream
+    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
+    band_energies = numpy.dot(power_spectrum, filters.T)
+    band_energies = numpy.where(band_energies == 0, tiny, band_energies)
+
+    return band_energies, frame_energy
+
+def logfbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
+    """Compute log Mel-filterbank energy features from an audio signal.
+
+    Thin wrapper: calls :func:`fbank` (with its default window function) and returns the natural
+    log of its filterbank energies; the per-frame energy is discarded.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    band_energies, _ = fbank(signal, samplerate, winlen, winstep, nfilt, nfft,
+                             lowfreq, highfreq, preemph)
+    return numpy.log(band_energies)
+
+def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
+        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
+        winfunc=lambda x:numpy.ones((x,))):
+    """Compute Spectral Subband Centroid features from an audio signal.
+
+    For every filter the result is the filter-weighted sum of (power * frequency) divided by the
+    filter-weighted sum of power, where the frequency axis runs linearly from 1 to samplerate/2.
+
+    :param signal: the audio signal from which to compute features. Should be an N*1 array
+    :param samplerate: the samplerate of the signal we are working with. Default is 16000.
+    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
+    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
+    :param nfilt: the number of filters in the filterbank, default 26.
+    :param nfft: the FFT size. Default is 512.
+    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
+    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
+    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
+    :param winfunc: the analysis window to apply to each frame. The default is an all-ones window. You can use numpy window functions here e.g. winfunc=numpy.hamming
+    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
+    """
+    if not highfreq:
+        highfreq = samplerate/2
+
+    emphasized = sigproc.preemphasis(signal, preemph)
+    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
+    power = sigproc.powspec(frames, nfft)
+    # if things are all zeros we get problems
+    power = numpy.where(power == 0, numpy.finfo(float).eps, power)
+
+    filters = get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
+    band_energies = numpy.dot(power, filters.T)   # the filterbank energies (denominator)
+
+    # one row of bin frequencies per frame, same shape as the power spectrum
+    num_frames = numpy.size(power, 0)
+    num_bins = numpy.size(power, 1)
+    freqs = numpy.tile(numpy.linspace(1, samplerate/2, num_bins), (num_frames, 1))
+
+    return numpy.dot(power*freqs, filters.T) / band_energies
+
+def hz2mel(hz):
+    """Map a frequency in Hertz onto the mel scale.
+
+    :param hz: a value in Hz. A numpy array is also accepted and converted element-wise.
+    :returns: the corresponding value in Mels; an array input yields an array of the same size.
+    """
+    scaled = 1+hz/700.
+    return 2595 * numpy.log10(scaled)
+
+def mel2hz(mel):
+    """Map a value on the mel scale back to a frequency in Hertz.
+
+    :param mel: a value in Mels. A numpy array is also accepted and converted element-wise.
+    :returns: the corresponding value in Hertz; an array input yields an array of the same size.
+    """
+    exponent = mel/2595.0
+    return 700*(10**exponent-1)
+
+def get_filterbanks(nfilt=20,nfft=512,samplerate=16000,lowfreq=0,highfreq=None):
+    """Build a bank of triangular mel filters. Each row is one filter, each column one fft bin,
+    so the returned array has size nfilt * (nfft/2 + 1).
+
+    :param nfilt: the number of filters in the filterbank, default 20.
+    :param nfft: the FFT size. Default is 512.
+    :param samplerate: the samplerate of the signal we are working with. Affects mel spacing.
+    :param lowfreq: lowest band edge of mel filters, default 0 Hz
+    :param highfreq: highest band edge of mel filters, default samplerate/2
+    :returns: A numpy array of size nfilt * (nfft/2 + 1) containing filterbank. Each row holds 1 filter.
+    """
+    if not highfreq:
+        highfreq = samplerate/2
+    assert highfreq <= samplerate/2, "highfreq is greater than samplerate/2"
+
+    # nfilt triangles need nfilt+2 edge points, evenly spaced on the mel scale
+    mel_edges = numpy.linspace(hz2mel(lowfreq), hz2mel(highfreq), nfilt+2)
+    # the edges are converted back to Hz and then to (floored) fft bin numbers
+    edges = numpy.floor((nfft+1)*mel2hz(mel_edges)/samplerate)
+
+    bank = numpy.zeros([nfilt,nfft//2+1])
+    for j in range(nfilt):
+        left, centre, right = edges[j], edges[j+1], edges[j+2]
+        # rising slope: 0 at the left edge, approaching 1 at the centre
+        for i in range(int(left), int(centre)):
+            bank[j,i] = (i - left) / (centre-left)
+        # falling slope: 1 at the centre, approaching 0 at the right edge
+        for i in range(int(centre), int(right)):
+            bank[j,i] = (right-i) / (right-centre)
+    return bank
+
+def lifter(cepstra, L=22):
+    """Apply a sinusoidal cepstral lifter to a matrix of cepstra, which boosts the magnitude of
+    the higher-order DCT coefficients.
+
+    :param cepstra: the matrix of mel-cepstra, will be numframes * numcep in size.
+    :param L: the liftering coefficient to use. Default is 22. L <= 0 disables lifter.
+    :returns: the liftered cepstra, or the input itself when liftering is disabled.
+    """
+    if not L > 0:
+        # values of L <= 0, do nothing
+        return cepstra
+    _, ncoeff = numpy.shape(cepstra)
+    coeff_index = numpy.arange(ncoeff)
+    # one weight per cepstral coefficient, broadcast over all frames
+    weights = 1 + (L/2.)*numpy.sin(numpy.pi*coeff_index/L)
+    return weights*cepstra
+
+def delta(feat, N):
+    """Compute delta (time-derivative) features from a sequence of feature vectors.
+
+    Each delta is a regression over the N preceding and N following frames; the sequence is
+    extended at both ends by repeating the first/last frame.
+
+    :param feat: A numpy array of size (NUMFRAMES by number of features) containing features. Each row holds 1 feature vector.
+    :param N: For each frame, calculate delta features based on preceding and following N frames
+    :returns: A numpy array of size (NUMFRAMES by number of features) containing delta features. Each row holds 1 delta feature vector.
+    :raises ValueError: if N is smaller than 1.
+    """
+    if N < 1:
+        raise ValueError('N must be an integer >= 1')
+    num_frames = len(feat)
+    weights = numpy.arange(-N, N+1)   # regression weights -N .. N
+    denominator = 2 * sum(i**2 for i in range(1, N+1))
+    delta_feat = numpy.empty_like(feat)
+    padded = numpy.pad(feat, ((N, N), (0, 0)), mode='edge')   # N edge-replicated frames on each side
+    for t in range(num_frames):
+        # rows t .. t+2N of the padded array are frames t-N .. t+N of the original
+        context = padded[t : t+2*N+1]
+        delta_feat[t] = numpy.dot(weights, context) / denominator
+    return delta_feat
--- a/third_party/python_kaldi_features/python_speech_features/sigproc.py
+++ b/third_party/python_kaldi_features/python_speech_features/sigproc.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    """Round number to the nearest integer, with exact halves going away from zero.
+
+    :param number: the value to round.
+    :returns: the rounded value as an int.
+    """
+    unit = decimal.Decimal('1')
+    rounded = decimal.Decimal(number).quantize(unit, rounding=decimal.ROUND_HALF_UP)
+    return int(rounded)
+
+
+def rolling_window(a, window, step=1):
+    """Return overlapping windows over the last axis of a, built with the stride trick.
+
+    :param a: the input numpy array.
+    :param window: number of samples in each window.
+    :param step: keep every step-th window. Default is 1 (all windows).
+    :returns: an array with one extra trailing axis of length window, sharing memory with a.
+    """
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    num_windows = a.shape[-1] - window + 1
+    view_shape = a.shape[:-1] + (num_windows, window)
+    # moving to the next window and to the next sample both advance one element
+    view_strides = a.strides + (a.strides[-1],)
+    windows = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
+    return windows[::step]
+
+
+def framesig(sig, frame_len, frame_step, dither=1.0, preemph=0.97, remove_dc_offset=True, wintype='hamming', stride_trick=True):
+    """Frame a signal into overlapping frames and apply per-frame processing.
+
+    Only whole frames are taken from the signal: trailing samples that do not fill a complete
+    frame are dropped and nothing is zero-padded. A signal shorter than one frame is therefore
+    not supported (framing it is expected to fail). Every frame is dithered, optionally has its
+    mean removed, is pre-emphasised and is finally multiplied by the analysis window.
+
+    :param sig: the audio signal to frame, a 1-D numpy array.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param dither: scale of the random Gaussian noise added to each frame. 0 adds no noise. Default is 1.0.
+    :param preemph: preemphasis coefficient applied to each frame. 0 is no filter. Default is 0.97.
+    :param remove_dc_offset: if True (default) subtract each frame's mean from the frame.
+    :param wintype: 'povey' selects a Hann window raised to the power 0.85; any other value selects the Hamming window.
+    :param stride_trick: use stride trick to compute the rolling window faster
+    :returns: a tuple (frames, raw_frames), both of size NUMFRAMES by frame_len. frames is fully
+        processed and windowed; raw_frames holds each frame after dithering and mean removal but
+        before preemphasis and windowing.
+    """
+    slen = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+    if slen <= frame_len:
+        numframes = 1
+    else:
+        numframes = 1 + ((slen - frame_len) // frame_step)
+
+    # check kaldi/src/feat/feature-window.h
+    # keep exactly the samples covered by whole frames
+    padsignal = sig[:(numframes-1)*frame_step+frame_len]
+    # compare by value: identity ("is") on a string literal is unreliable
+    if wintype == 'povey':
+        sample_index = numpy.arange(frame_len)
+        win = (0.5-0.5*numpy.cos(2*numpy.pi/(frame_len-1)*sample_index))**0.85
+    else: # the hamming window
+        win = numpy.hamming(frame_len)
+
+    if stride_trick:
+        frames = rolling_window(padsignal, window=frame_len, step=frame_step)
+    else:
+        indices = numpy.tile(numpy.arange(0, frame_len), (numframes, 1)) + numpy.tile(
+            numpy.arange(0, numframes * frame_step, frame_step), (frame_len, 1)).T
+        indices = numpy.array(indices, dtype=numpy.int32)
+        frames = padsignal[indices]
+
+    # astype copies, so the in-place helpers below never touch the caller's signal
+    frames = frames.astype(numpy.float32)
+    raw_frames = numpy.zeros(frames.shape)
+    for frm in range(frames.shape[0]):
+        frames[frm,:] = do_dither(frames[frm,:], dither)        # dither
+        if remove_dc_offset:
+            frames[frm,:] = do_remove_dc_offset(frames[frm,:])      # remove dc offset
+        raw_frames[frm,:] = frames[frm,:]
+        frames[frm,:] = do_preemphasis(frames[frm,:], preemph)    # preemphasize
+
+    # the 1-D window broadcasts over every frame
+    return frames * win, raw_frames
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Reassemble a 1-D signal from overlapping frames by overlap-add, undoing framesig.
+
+    :param frames: the array of frames.
+    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :returns: a 1-D signal.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    numframes = numpy.shape(frames)[0]
+    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    # indices[i, k] is the position in the signal of sample k of frame i
+    offsets = numpy.arange(0, frame_len)
+    starts = numpy.arange(0, numframes * frame_step, frame_step)
+    indices = numpy.array(starts[:, None] + offsets[None, :], dtype=numpy.int32)
+    padlen = (numframes - 1) * frame_step + frame_len
+
+    if siglen <= 0:
+        siglen = padlen
+
+    win = winfunc(frame_len)
+    summed = numpy.zeros((padlen,))
+    window_sum = numpy.zeros((padlen,))
+
+    for i in range(numframes):
+        span = indices[i, :]
+        # add a little bit so it is never zero
+        window_sum[span] = window_sum[span] + win + 1e-15
+        summed[span] = summed[span] + frames[i, :]
+
+    # normalise by the accumulated window weight at every sample
+    summed = summed / window_sum
+    return summed[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    A warning is logged when the frames are longer than NFFT, because the FFT then only sees the
+    first NFFT samples of every frame.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
+    """
+    frame_len = numpy.shape(frames)[1]
+    if frame_len > NFFT:
+        # logging.warn is a deprecated alias that newer Python versions no longer provide
+        logging.warning(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            frame_len, NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum (squared magnitude spectrum, no 1/NFFT scaling) of each frame.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
+    """
+    magnitude = magspec(frames, NFFT)
+    return numpy.square(magnitude)
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum, in dB, of each frame in frames.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
+    """
+    power = powspec(frames, NFFT)
+    # floor tiny values so the logarithm stays finite
+    too_small = power <= 1e-30
+    power[too_small] = 1e-30
+    log_power = 10 * numpy.log10(power)
+    if not norm:
+        return log_power
+    return log_power - numpy.max(log_power)
+
+def do_dither(signal, dither_value=1.0):
+    """Add Gaussian noise scaled by dither_value to signal, in place.
+
+    :param signal: the numpy array to dither; it is modified in place.
+    :param dither_value: multiplier for the standard-normal noise. 0 adds no noise. Default is 1.0.
+    :returns: the same (modified) signal array.
+    """
+    noise = numpy.random.normal(size=signal.shape) * dither_value
+    signal += noise
+    return signal
+    
+def do_remove_dc_offset(signal):
+    """Subtract the mean of signal from every sample, in place.
+
+    :param signal: the numpy array to process; it is modified in place.
+    :returns: the same (modified) signal array.
+    """
+    offset = numpy.mean(signal)
+    signal -= offset
+    return signal
+
+def do_preemphasis(signal, coeff=0.97):
+    """Apply a first-order preemphasis filter to the input signal.
+
+    The first output sample is (1-coeff)*signal[0]; every later sample is
+    signal[n] - coeff*signal[n-1].
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.97.
+    :returns: the filtered signal.
+    """
+    first = (1-coeff)*signal[0]
+    rest = signal[1:] - coeff * signal[:-1]
+    return numpy.append(first, rest)
--- a/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py
+++ b/third_party/python_kaldi_features/python_speech_features/sigproc_orig.py
+# This file includes routines for basic signal processing including framing and computing power spectra.
+# Author: James Lyons 2012
+import decimal
+
+import numpy
+import math
+import logging
+
+
+def round_half_up(number):
+    """Round number to the nearest integer, with exact halves going away from zero.
+
+    :param number: the value to round.
+    :returns: the rounded value as an int.
+    """
+    unit = decimal.Decimal('1')
+    rounded = decimal.Decimal(number).quantize(unit, rounding=decimal.ROUND_HALF_UP)
+    return int(rounded)
+
+
+def rolling_window(a, window, step=1):
+    """Return overlapping windows over the last axis of a, built with the stride trick.
+
+    :param a: the input numpy array.
+    :param window: number of samples in each window.
+    :param step: keep every step-th window. Default is 1 (all windows).
+    :returns: an array with one extra trailing axis of length window, sharing memory with a.
+    """
+    # http://ellisvalentiner.com/post/2017-03-21-np-strides-trick
+    num_windows = a.shape[-1] - window + 1
+    view_shape = a.shape[:-1] + (num_windows, window)
+    # moving to the next window and to the next sample both advance one element
+    view_strides = a.strides + (a.strides[-1],)
+    windows = numpy.lib.stride_tricks.as_strided(a, shape=view_shape, strides=view_strides)
+    return windows[::step]
+
+
+def framesig(sig, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,)), stride_trick=True):
+    """Split a signal into overlapping, windowed frames, zero-padding the end of the signal so
+    that the last frame is complete.
+
+    :param sig: the audio signal to frame.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :param stride_trick: use stride trick to compute the rolling window and window multiplication faster
+    :returns: an array of frames. Size is NUMFRAMES by frame_len.
+    """
+    slen = len(sig)
+    frame_len = int(round_half_up(frame_len))
+    frame_step = int(round_half_up(frame_step))
+    numframes = 1
+    if slen > frame_len:
+        # enough frames to cover every sample, rounding the last partial frame up
+        numframes += int(math.ceil((1.0 * slen - frame_len) / frame_step))
+
+    # extend the signal with zeros up to the end of the last frame
+    padlen = int((numframes - 1) * frame_step + frame_len)
+    padsignal = numpy.concatenate((sig, numpy.zeros((padlen - slen,))))
+
+    if stride_trick:
+        win = winfunc(frame_len)
+        return rolling_window(padsignal, window=frame_len, step=frame_step) * win
+
+    # explicit gather: indices[i, k] is the position of sample k of frame i
+    offsets = numpy.arange(0, frame_len)
+    starts = numpy.arange(0, numframes * frame_step, frame_step)
+    indices = numpy.array(starts[:, None] + offsets[None, :], dtype=numpy.int32)
+    gathered = padsignal[indices]
+    win = numpy.tile(winfunc(frame_len), (numframes, 1))
+    return gathered * win
+
+
+def deframesig(frames, siglen, frame_len, frame_step, winfunc=lambda x: numpy.ones((x,))):
+    """Reassemble a 1-D signal from overlapping frames by overlap-add, undoing framesig.
+
+    :param frames: the array of frames.
+    :param siglen: the length of the desired signal, use 0 if unknown. Output will be truncated to siglen samples.
+    :param frame_len: length of each frame measured in samples.
+    :param frame_step: number of samples after the start of the previous frame that the next frame should begin.
+    :param winfunc: the analysis window to apply to each frame. By default no window is applied.
+    :returns: a 1-D signal.
+    """
+    frame_len = round_half_up(frame_len)
+    frame_step = round_half_up(frame_step)
+    numframes = numpy.shape(frames)[0]
+    assert numpy.shape(frames)[1] == frame_len, '"frames" matrix is wrong size, 2nd dim is not equal to frame_len'
+
+    # indices[i, k] is the position in the signal of sample k of frame i
+    offsets = numpy.arange(0, frame_len)
+    starts = numpy.arange(0, numframes * frame_step, frame_step)
+    indices = numpy.array(starts[:, None] + offsets[None, :], dtype=numpy.int32)
+    padlen = (numframes - 1) * frame_step + frame_len
+
+    if siglen <= 0:
+        siglen = padlen
+
+    win = winfunc(frame_len)
+    summed = numpy.zeros((padlen,))
+    window_sum = numpy.zeros((padlen,))
+
+    for i in range(numframes):
+        span = indices[i, :]
+        # add a little bit so it is never zero
+        window_sum[span] = window_sum[span] + win + 1e-15
+        summed[span] = summed[span] + frames[i, :]
+
+    # normalise by the accumulated window weight at every sample
+    summed = summed / window_sum
+    return summed[0:siglen]
+
+
+def magspec(frames, NFFT):
+    """Compute the magnitude spectrum of each frame in frames. If frames is an NxD matrix, output will be Nx(NFFT/2+1).
+
+    A warning is logged when the frames are longer than NFFT, because the FFT then only sees the
+    first NFFT samples of every frame.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the magnitude spectrum of the corresponding frame.
+    """
+    frame_len = numpy.shape(frames)[1]
+    if frame_len > NFFT:
+        # logging.warn is a deprecated alias that newer Python versions no longer provide
+        logging.warning(
+            'frame length (%d) is greater than FFT size (%d), frame will be truncated. Increase NFFT to avoid.',
+            frame_len, NFFT)
+    complex_spec = numpy.fft.rfft(frames, NFFT)
+    return numpy.absolute(complex_spec)
+
+
+def powspec(frames, NFFT):
+    """Compute the power spectrum of each frame: the squared magnitude spectrum scaled by 1/NFFT.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the power spectrum of the corresponding frame.
+    """
+    scale = 1.0 / NFFT
+    squared_magnitude = numpy.square(magspec(frames, NFFT))
+    return scale * squared_magnitude
+
+
+def logpowspec(frames, NFFT, norm=1):
+    """Compute the log power spectrum, in dB, of each frame in frames.
+
+    :param frames: the array of frames. Each row is a frame.
+    :param NFFT: the FFT length to use. If NFFT > frame_len, the frames are zero-padded.
+    :param norm: If norm=1, the log power spectrum is normalised so that the max value (across all frames) is 0.
+    :returns: If frames is an NxD matrix, output will be Nx(NFFT/2+1). Each row will be the log power spectrum of the corresponding frame.
+    """
+    power = powspec(frames, NFFT)
+    # floor tiny values so the logarithm stays finite
+    too_small = power <= 1e-30
+    power[too_small] = 1e-30
+    log_power = 10 * numpy.log10(power)
+    if not norm:
+        return log_power
+    return log_power - numpy.max(log_power)
+
+
+def preemphasis(signal, coeff=0.95):
+    """Apply a first-order preemphasis filter to the input signal.
+
+    The first sample is passed through unchanged; every later sample becomes
+    signal[n] - coeff*signal[n-1].
+
+    :param signal: The signal to filter.
+    :param coeff: The preemphasis coefficient. 0 is no filter, default is 0.95.
+    :returns: the filtered signal.
+    """
+    first = signal[0]
+    rest = signal[1:] - coeff * signal[:-1]
+    return numpy.append(first, rest)
--- a/third_party/python_kaldi_features/requirements.txt
+++ b/third_party/python_kaldi_features/requirements.txt
+mock
+scipy
+numpy
--- a/third_party/python_kaldi_features/setup.py
+++ b/third_party/python_kaldi_features/setup.py
+# Packaging script for the python_speech_features package.
+# setuptools is preferred; distutils is the fallback when setuptools is not installed.
+try:
+    from setuptools import setup #enables develop
+except ImportError:
+    from distutils.core import setup
+
+# Package metadata; only the python_speech_features package directory is installed.
+setup(name='python_speech_features',
+      version='0.6',
+      description='Python Speech Feature extraction',
+      author='James Lyons',
+      author_email='james.lyons0@gmail.com',
+      license='MIT',
+      url='https://github.com/jameslyons/python_speech_features',
+      packages=['python_speech_features'],
+    )
--- a/third_party/python_kaldi_features/test/test_sigproc.py
+++ b/third_party/python_kaldi_features/test/test_sigproc.py
+from python_speech_features import sigproc
+import unittest
+import numpy as np
+import time
+
+
+class test_case(unittest.TestCase):
+    """Unit tests for the framing helpers in sigproc."""
+
+    def test_frame_sig(self):
+        """Both framing code paths (index gather and stride trick) must give identical frames."""
+        # kept moderate: framesig processes frames one by one in Python
+        n = 100124
+        frame_len = 37
+        frame_step = 13
+        x = np.random.rand(n)
+        # dither=0.0 disables the random noise, which would otherwise make the two runs differ
+        t0 = time.time()
+        y_old, raw_old = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step,
+                                          dither=0.0, stride_trick=False)
+        t1 = time.time()
+        y_new, raw_new = sigproc.framesig(x, frame_len=frame_len, frame_step=frame_step,
+                                          dither=0.0, stride_trick=True)
+        t_new = time.time() - t1
+        t_old = t1 - t0
+        # framesig returns a (frames, raw_frames) tuple; compare both parts
+        self.assertTupleEqual(y_old.shape, y_new.shape)
+        np.testing.assert_array_equal(y_old, y_new)
+        self.assertTupleEqual(raw_old.shape, raw_new.shape)
+        np.testing.assert_array_equal(raw_old, raw_new)
+        # timing is reported only: asserting on wall-clock time makes the test flaky
+        print('run time: stride trick %3.2f sec, index gather %3.2f sec' % (t_new, t_old))
+
+    def test_rolling(self):
+        """rolling_window yields every step-th window of the requested length."""
+        x = np.arange(10)
+        y = sigproc.rolling_window(x, window=4, step=3)
+        y_expected = np.array([[0, 1, 2, 3],
+                               [3, 4, 5, 6],
+                               [6, 7, 8, 9]]
+                              )
+        np.testing.assert_array_equal(y, y_expected)
--- a/utils/compute_mean_std.py
+++ b/utils/compute_mean_std.py
@@ -24,7 +24,7 @@ from deepspeech.utils.utility import print_arguments
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('num_samples',      int,    2000,    "# of samples to for statistics.")
+add_arg('num_samples',      int,    -1,    "# of samples to for statistics.")
 add_arg('specgram_type',    str,
        'linear',
        "Audio feature type. Options: linear, mfcc, fbank.",