add audio file

e64bd000 · chrisxu2014 · 1b7c7c61 · e64bd000 · e64bd000
隐藏空白更改
内联并排

Showing with 49 addition and 25 deletion

deep_speech_2/data_utils/audio.py deep_speech_2/data_utils/audio.py +44 -20

deep_speech_2/data_utils/speech.py deep_speech_2/data_utils/speech.py +5 -5

未找到文件。
--- a/deep_speech_2/data_utils/audio.py
+++ b/deep_speech_2/data_utils/audio.py
@@ -9,6 +9,7 @@ import soundfile
 import scikits.samplerate
 from scipy import signal
 import random
+import copy
 class AudioSegment(object):
@@ -87,9 +88,8 @@ class AudioSegment(object):
        :return: Audio segment instance as concatenating results.
        :rtype: AudioSegment
        :raises ValueError: If the number of segments is zero, or if the 
-                            sample_rate of any two segment does not match.
+                            sample_rate of any segments does not match.
-        :raises TypeError: If every item in segments is not AudioSegment
+        :raises TypeError: If any segment is not AudioSegment instance.
-                           instance.
        """
        # Perform basic sanity-checks.
        if len(segments) == 0:
@@ -101,7 +101,7 @@ class AudioSegment(object):
                                 "different sample rates")
            if type(seg) is not cls:
                raise TypeError("Only audio segments of the same type "
-                                "instance can be concatenated.")
+                                "can be concatenated.")
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate)
@@ -180,8 +180,7 @@ class AudioSegment(object):
    @classmethod
    def make_silence(cls, duration, sample_rate):
-        """Creates a silent audio segment of the given duration and
+        """Creates a silent audio segment of the given duration and sample rate.
-        sample rate.
        :param duration: Length of silence in seconds.
        :type duration: float
@@ -193,15 +192,17 @@ class AudioSegment(object):
        samples = np.zeros(int(duration * sample_rate))
        return cls(samples, sample_rate)
-    def superimposed(self, other):
+    def superimpose(self, other):
        """Add samples from another segment to those of this segment
        (sample-wise addition, not segment concatenation).
+        Note that this is an in-place transformation.
        :param other: Segment containing samples to be added in.
        :type other: AudioSegments
        :raise TypeError: If type of two segments don't match.
-        :raise ValueError: If the sample_rate of two segments not equal, or if
+        :raise ValueError: If the sample rates of the two segments are not
-                           the length of segments don't match.
+                           equal, or if the lengths of segments don't match.
        """
        if type(self) != type(other):
            raise TypeError("Cannot add segments of different types: %s "
@@ -215,7 +216,7 @@ class AudioSegment(object):
    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.
-        :param dtype: Data type for export samples. Options: 'int16','int32',
+        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
@@ -362,16 +363,20 @@ class AudioSegment(object):
        elif sides == "both":
            padded = cls.concatenate(silence, self, silence)
        else:
-            raise ValueError("Unknown value for the kwarg %s" % sides)
+            raise ValueError("Unknown value for the sides %s" % sides)
        self._samples = padded._samples
    def subsegment(self, start_sec=None, end_sec=None):
-        """Return new AudioSegment containing audio between given boundaries.
+        """Cut the AudioSegment between given boundaries.
+        Note that this is an in-place transformation.
        :param start_sec: Beginning of subsegment in seconds.
        :type start_sec: float
        :param end_sec: End of subsegment in seconds.
        :type end_sec: float
+        :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
+                           of bounds in time.
        """
        start_sec = 0.0 if start_sec is None else start_sec
        end_sec = self.duration if end_sec is None else end_sec
@@ -379,19 +384,33 @@ class AudioSegment(object):
            start_sec = self.duration + start_sec
        if end_sec < 0.0:
            end_sec = self.duration + end_sec
+        if start_sec < 0.0:
+            raise ValueError("The slice start position (%f s) is out of "
+                             "bounds." % start_sec)
+        if end_sec < 0.0:
+            raise ValueError("The slice end position (%f s) is out of bounds." %
+                             end_sec)
+        if start_sec > end_sec:
+            raise ValueError("The slice start position (%f s) is later than "
+                             "the end position (%f s)." % (start_sec, end_sec))
+        if end_sec > self.duration:
+            raise ValueError("The slice end position (%f s) is out of bounds "
+                             "(> %f s)" % (end_sec, self.duration))
        start_sample = int(round(start_sec * self._sample_rate))
        end_sample = int(round(end_sec * self._sample_rate))
        self._samples = self._samples[start_sample:end_sample]
    def random_subsegment(self, subsegment_length, rng=None):
-        """Return a random subsegment of a specified length in seconds.
+        """Cut the specified length of the audiosegment randomly.
+        Note that this is an in-place transformation.
        :param subsegment_length: Subsegment length in seconds.
        :type subsegment_length: float
        :param rng: Random number generator state.
        :type rng: random.Random
-        :raises ValueError: If the length of subsegment greater than
+        :raises ValueError: If the length of subsegment is greater than
-                            origineal segemnt.
+                            the origineal segemnt.
        """
        rng = random.Random() if rng is None else rng
        if subsegment_length > self.duration:
@@ -401,7 +420,7 @@ class AudioSegment(object):
        self.subsegment(start_time, start_time + subsegment_length)
    def convolve(self, impulse_segment, allow_resample=False):
-        """Convolve this audio segment with the given impulse_segment.
+        """Convolve this audio segment with the given impulse segment.
        Note that this is an in-place transformation.
@@ -428,6 +447,8 @@ class AudioSegment(object):
        """Convolve and normalize the resulting audio segment so that it
        has the same average power as the input signal.
+        Note that this is an in-place transformation.
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
@@ -445,10 +466,12 @@ class AudioSegment(object):
                  allow_downsampling=False,
                  max_gain_db=300.0,
                  rng=None):
-        """Adds the given noise segment at a specific signal-to-noise ratio.
+        """Add the given noise segment at a specific signal-to-noise ratio.
        If the noise segment is longer than this segment, a random subsegment
        of matching length is sampled from it and used instead.
+        Note that this is an in-place transformation.
        :param noise: Noise signal to add.
        :type noise: AudioSegment
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
@@ -480,9 +503,10 @@ class AudioSegment(object):
                             " base signal (%f sec)." %
                             (noise.duration, self.duration))
        noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
-        noise.random_subsegment(self.duration, rng=rng)
+        noise_new = copy.deepcopy(noise)
-        noise.apply_gain(noise_gain_db)
+        noise_new.random_subsegment(self.duration, rng=rng)
-        self.superimposed(noise)
+        noise_new.apply_gain(noise_gain_db)
+        self.superimpose(noise_new)
    @property
    def samples(self):

--- a/deep_speech_2/data_utils/speech.py
+++ b/deep_speech_2/data_utils/speech.py
@@ -67,7 +67,8 @@ class SpeechSegment(AudioSegment):
    @classmethod
    def concatenate(cls, *segments):
-        """Concatenate an arbitrary number of speech segments together.
+        """Concatenate an arbitrary number of speech segments together, both
+        audio and transcript will be concatenated.
        :param *segments: Input speech segments to be concatenated.
        :type *segments: tuple of SpeechSegment
@@ -75,8 +76,7 @@ class SpeechSegment(AudioSegment):
        :rtype: SpeechSegment
        :raises ValueError: If the number of segments is zero, or if the 
                            sample_rate of any two segments does not match.
-        :raises TypeError: If every item in segments is not SpeechSegment
+        :raises TypeError: If any segment is not SpeechSegment instance.
-                           instance.
        """
        if len(segments) == 0:
            raise ValueError("No speech segments are given to concatenate.")
@@ -94,7 +94,7 @@ class SpeechSegment(AudioSegment):
        return cls(samples, sample_rate, transcripts)
    @classmethod
-    def slice_from_file(cls, filepath, start=None, end=None, transcript=""):
+    def slice_from_file(cls, filepath, start=None, end=None, transcript):
        """Loads a small section of an speech without having to load
        the entire file into the memory which can be incredibly wasteful.
@@ -121,7 +121,7 @@ class SpeechSegment(AudioSegment):
    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Creates a silent speech segment of the given duration and
-        sample rate.
+        sample rate, transcript will be an empty string.
        :param duration: Length of silence in seconds.
        :type duration: float