Add 3 augmentor classes and change the resample module

27799eb9 · chrisxu2014 · 09b7bc35 · 27799eb9 · 27799eb9 · 27799eb9
4 changed files
--- a/deep_speech_2/data_utils/audio.py
+++ b/deep_speech_2/data_utils/audio.py
@@ -6,7 +6,7 @@ from __future__ import print_function
 import numpy as np
 import io
 import soundfile
-import scikits.samplerate
+import resampy
 from scipy import signal
 import random
 import copy
@@ -321,21 +321,19 @@ class AudioSegment(object):
        gain_db = target_db - rms_estimate_db
        self.apply_gain(gain_db)

-    def resample(self, target_sample_rate, quality='sinc_medium'):
+    def resample(self, target_sample_rate, filter='kaiser_best'):
        """Resample the audio to a target sample rate.

        Note that this is an in-place transformation.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
-        :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
-                        Sets resampling speed/quality tradeoff.
-                        See http://www.mega-nerd.com/SRC/api_misc.html#Converters
-        :type quality: str
+        :param filter: The resampling filter to use; one of {'kaiser_best',
+                       'kaiser_fast'}.
+        :type filter: str
        """
-        resample_ratio = target_sample_rate / self._sample_rate
-        self._samples = scikits.samplerate.resample(
-            self._samples, r=resample_ratio, type=quality)
+        self._samples = resampy.resample(
+            self.samples, self.sample_rate, target_sample_rate, filter=filter)
        self._sample_rate = target_sample_rate

    def pad_silence(self, duration, sides='both'):

--- a/deep_speech_2/data_utils/augmentor/resample.py
+++ b/deep_speech_2/data_utils/augmentor/resample.py
@@ -8,6 +8,9 @@ from data_utils.augmentor.base import AugmentorBase

 class ResampleAugmentor(AugmentorBase):
    """Augmentation model for resampling.
+
+    See more info here:
+    https://ccrma.stanford.edu/~jos/resample/index.html
    
    :param rng: Random generator object.
    :type rng: random.Random
@@ -27,4 +30,4 @@ class ResampleAugmentor(AugmentorBase):
        :param audio: Audio segment to add effects to.
        :type audio: AudioSegment|SpeechSegment
        """
-        audio_segment.resample(self._new_sample_rate)
+        audio_segment.resample(self._new_sample_rate)
\ No newline at end of file
--- a/deep_speech_2/requirements.txt
+++ b/deep_speech_2/requirements.txt
 SoundFile==0.9.0.post1
 wget==3.2
 scipy==0.13.1
+resampy==0.1.5
\ No newline at end of file
--- a/deep_speech_2/tests/test_augmentor.py
+++ b/deep_speech_2/tests/test_augmentor.py
-"""Test augmentor class."""
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import unittest
-from data_utils import audio
-from data_utils.augmentor.augmentation import AugmentationPipeline
-import random
-import numpy as np
-
-random_seed = 0
-audio_data = [3.0517571e-05, -8.54492188e-04, -1.09863281e-03, -9.4604492e-04,\
-            -1.31225586e-03, -1.09863281e-03, -1.73950195e-03, -2.1057189e-03,\
-            -2.04467773e-03, -1.46484375e-03, -1.43432617e-03, -9.4604492e-04,\
-            -1.95312500e-03, -1.86157227e-03, -2.10571289e-03, -2.3193354e-03,\
-            -2.01416016e-03, -2.62451172e-03, -2.07519531e-03, -2.3803719e-03]
-audio_data = np.array(audio_data)
-samplerate = 10
-
-
-class TestAugmentor(unittest.TestCase):
-    def test_volume(self):
-        config_json = '[{"type": "volume","params": {"min_gain_dBFS": -15, '\
-        '"max_gain_dBFS": 15},"prob": 1.0}]'
-        aug_pipeline = AugmentationPipeline(
-            augmentation_config=config_json, random_seed=random_seed)
-        audio_seg = audio.AudioSegment(audio_data, samplerate)
-        aug_pipeline.transform_audio(audio_seg)
-        orig_audio = audio.AudioSegment(audio_data, samplerate)
-        self.assertFalse(np.any(audio_seg.samples == orig_audio.samples))
-
-    def test_speed(self):
-        config_json = '[{"type":"speed","params": {"min_speed_rate": 1.2,' \
-        '"max_speed_rate": 1.4},"prob": 1.0}]'
-        aug_pipeline = AugmentationPipeline(
-            augmentation_config=config_json, random_seed=random_seed)
-        audio_seg = audio.AudioSegment(audio_data, samplerate)
-        aug_pipeline.transform_audio(audio_seg)
-        orig_audio = audio.AudioSegment(audio_data, samplerate)
-        self.assertFalse(np.any(audio_seg.samples == orig_audio.samples))
-
-    def test_resample(self):
-        config_json = '[{"type":"resample","params": {"new_sample_rate":5},'\
-        '"prob": 1.0}]'
-        aug_pipeline = AugmentationPipeline(
-            augmentation_config=config_json, random_seed=random_seed)
-        audio_seg = audio.AudioSegment(audio_data, samplerate)
-        aug_pipeline.transform_audio(audio_seg)
-        self.assertTrue(audio_seg.sample_rate == 5)
-
-    def test_bayesial(self):
-        config_json = '[{"type":"bayesian_normal","params":{"target_db":-20,' \
-        '"prior_db":-4, "prior_samples": -8, "startup_delay": 0.0},"prob":1.0}]'
-        aug_pipeline = AugmentationPipeline(
-            augmentation_config=config_json, random_seed=random_seed)
-        audio_seg = audio.AudioSegment(audio_data, samplerate)
-        aug_pipeline.transform_audio(audio_seg)
-        orig_audio = audio.AudioSegment(audio_data, samplerate)
-        self.assertFalse(np.any(audio_seg.samples == orig_audio.samples))
-
-
-if __name__ == '__main__':
-    unittest.main()