audio.py 23.6 KB
Newer Older
1 2 3 4 5
"""Contains the audio segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

6 7 8
import numpy as np
import io
import soundfile
chrisxu2014's avatar
chrisxu2014 已提交
9 10
import scikits.samplerate
from scipy import signal
chrisxu2014's avatar
chrisxu2014 已提交
11
import random
12 13 14 15


class AudioSegment(object):
    """Monaural audio segment abstraction.
16 17 18 19 20 21

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
22 23 24
    """

    def __init__(self, samples, sample_rate):
25 26 27 28 29
        """Create audio segment from samples.

        Samples are convert float32 internally, with int scaled to [-1, 1].
        """
        self._samples = self._convert_samples_to_float32(samples)
30 31 32 33
        self._sample_rate = sample_rate
        if self._samples.ndim >= 2:
            self._samples = np.mean(self._samples, 1)

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55
    def __eq__(self, other):
        """Return whether two objects are equal."""
        if type(other) is not type(self):
            return False
        if self._sample_rate != other._sample_rate:
            return False
        if self._samples.shape != other._samples.shape:
            return False
        if np.any(self.samples != other._samples):
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    def __str__(self):
        """Return human-readable representation of segment."""
        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
                                self.duration, self.rms_db))

56
    @classmethod
    def from_file(cls, file):
        """Create audio segment from an audio file.

        The file is decoded with `soundfile.read` as float32.

        :param file: Filepath or file object to audio file.
        :type file: basestring|file
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        audio, rate = soundfile.read(file, dtype='float32')
        return cls(audio, rate)

    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string holding an encoded
        audio file.

        The bytes are wrapped in an in-memory buffer and decoded with
        `soundfile.read` as float32.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        buffer = io.BytesIO(bytes)
        audio, rate = soundfile.read(buffer, dtype='float32')
        return cls(audio, rate)

chrisxu2014's avatar
chrisxu2014 已提交
81 82
    @classmethod
    def concatenate(cls, *segments):
chrisxu2014's avatar
chrisxu2014 已提交
83 84
        """Concatenate an arbitrary number of audio segments together.

chrisxu2014's avatar
chrisxu2014 已提交
85
        :param *segments: Input audio segments.
chrisxu2014's avatar
chrisxu2014 已提交
86
        :type *segments: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
87
        :return: Audio segment instance as concatenating results.
chrisxu2014's avatar
chrisxu2014 已提交
88
        :rtype: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
89
        :raises ValueError: If the number of segments is zero, or if the 
chrisxu2014's avatar
chrisxu2014 已提交
90
                            sample_rate of any two segment does not match.
chrisxu2014's avatar
chrisxu2014 已提交
91
        :raises TypeError: If every item in segments is not AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
92
                           instance.
chrisxu2014's avatar
chrisxu2014 已提交
93 94
        """
        # Perform basic sanity-checks.
chrisxu2014's avatar
chrisxu2014 已提交
95
        if len(segments) == 0:
chrisxu2014's avatar
chrisxu2014 已提交
96 97
            raise ValueError("No audio segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
chrisxu2014's avatar
chrisxu2014 已提交
98 99
        for seg in segments:
            if sample_rate != seg._sample_rate:
chrisxu2014's avatar
chrisxu2014 已提交
100 101
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
chrisxu2014's avatar
chrisxu2014 已提交
102
            if type(seg) is not cls:
chrisxu2014's avatar
chrisxu2014 已提交
103 104 105
                raise TypeError("Only audio segments of the same type "
                                "instance can be concatenated.")
        samples = np.concatenate([seg.samples for seg in segments])
chrisxu2014's avatar
chrisxu2014 已提交
106
        return cls(samples, sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
107

108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132
    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.
        
        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: basestring|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported.
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        subtype_map = {
            'int16': 'PCM_16',
            'int32': 'PCM_32',
            'float32': 'FLOAT',
            'float64': 'DOUBLE'
        }
        soundfile.write(
            filepath,
            samples,
            self._sample_rate,
            format='WAV',
            subtype=subtype_map[dtype])

chrisxu2014's avatar
chrisxu2014 已提交
133 134
    @classmethod
    def slice_from_file(cls, file, start=None, end=None):
        """Load a section of an audio file without reading the entire
        file into memory.

        :param file: Input audio filepath or file object.
        :type file: basestring|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :return: AudioSegment instance of the specified slice of the input
                 audio file.
        :rtype: AudioSegment
        :raise ValueError: If start or end is incorrectly set, e.g. out of
                           bounds in time.
        """
        # The context manager closes the sound file handle on every exit
        # path, including the validation errors below.
        with soundfile.SoundFile(file) as sndfile:
            sample_rate = sndfile.samplerate
            duration = float(len(sndfile)) / sample_rate
            start = 0. if start is None else start
            # A missing end means "read to the end of the file".
            end = duration if end is None else end
            if start < 0.0:
                start += duration
            if end < 0.0:
                end += duration
            if start < 0.0:
                raise ValueError("The slice start position (%f s) is out of "
                                 "bounds." % start)
            if end < 0.0:
                raise ValueError("The slice end position (%f s) is out of "
                                 "bounds." % end)
            if start > end:
                raise ValueError("The slice start position (%f s) is later "
                                 "than the slice end position (%f s)." %
                                 (start, end))
            if end > duration:
                raise ValueError("The slice end position (%f s) is out of "
                                 "bounds (> %f s)" % (end, duration))
            start_frame = int(start * sample_rate)
            end_frame = int(end * sample_rate)
            sndfile.seek(start_frame)
            data = sndfile.read(
                frames=end_frame - start_frame, dtype='float32')
        return cls(data, sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
180

chrisxu2014's avatar
chrisxu2014 已提交
181 182
    @classmethod
    def make_silence(cls, duration, sample_rate):
chrisxu2014's avatar
chrisxu2014 已提交
183 184 185
        """Creates a silent audio segment of the given duration and
        sample rate.

chrisxu2014's avatar
chrisxu2014 已提交
186
        :param duration: Length of silence in seconds.
chrisxu2014's avatar
chrisxu2014 已提交
187
        :type duration: float
chrisxu2014's avatar
chrisxu2014 已提交
188
        :param sample_rate: Sample rate.
chrisxu2014's avatar
chrisxu2014 已提交
189
        :type sample_rate: float
chrisxu2014's avatar
chrisxu2014 已提交
190
        :return: Silent AudioSegment instance of the given duration.
chrisxu2014's avatar
chrisxu2014 已提交
191 192 193
        :rtype: AudioSegment
        """
        samples = np.zeros(int(duration * sample_rate))
chrisxu2014's avatar
chrisxu2014 已提交
194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213
        return cls(samples, sample_rate)

    def superimposed(self, other):
        """Add samples from another segment to those of this segment
        (sample-wise addition, not segment concatenation).

        :param other: Segment containing samples to be added in.
        :type other: AudioSegments
        :raise TypeError: If type of two segments don't match.
        :raise ValueError: If the sample_rate of two segments not equal, or if
                           the length of segments don't match.
        """
        if type(self) != type(other):
            raise TypeError("Cannot add segments of different types: %s "
                            "and %s." % (type(self), type(other)))
        if self._sample_rate != other._sample_rate:
            raise ValueError("Sample rates must match to add segments.")
        if len(self._samples) != len(other._samples):
            raise ValueError("Segment lengths must match to add segments.")
        self._samples += other._samples
chrisxu2014's avatar
chrisxu2014 已提交
214

215 216 217
    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.
        
chrisxu2014's avatar
chrisxu2014 已提交
218
        :param dtype: Data type for export samples. Options: 'int16','int32',
219 220 221 222 223 224 225 226
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
        :rtype: str
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples.tostring()

227
    def apply_gain(self, gain):
228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
        """Apply gain in decibels to samples.

        Note that this is an in-place transformation.
        
        :param gain: Gain in decibels to apply to samples. 
        :type gain: float
        """
        self._samples *= 10.**(gain / 20.)

    def change_speed(self, speed_rate):
        """Change the audio speed by linear interpolation.

        Note that this is an in-place transformation.
        
        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
                           speed_rate < 1.0, slow down the audio;
                           speed_rate <= 0.0, not allowed, raise ValueError.
        :type speed_rate: float
        :raises ValueError: If speed_rate <= 0.0.
        """
        if speed_rate <= 0:
            raise ValueError("speed_rate should be greater than zero.")
        old_length = self._samples.shape[0]
        new_length = int(old_length / speed_rate)
        old_indices = np.arange(old_length)
        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
        self._samples = np.interp(new_indices, old_indices, self._samples)

chrisxu2014's avatar
chrisxu2014 已提交
258
    def normalize(self, target_db=-20, max_gain_db=300.0):
chrisxu2014's avatar
chrisxu2014 已提交
259
        """Normalize audio to be of the desired RMS value in decibels.
chrisxu2014's avatar
chrisxu2014 已提交
260 261 262

        Note that this is an in-place transformation.

chrisxu2014's avatar
chrisxu2014 已提交
263 264
        :param target_db: Target RMS value in decibels. This value should be
                          less than 0.0 as 0.0 is full-scale audio.
chrisxu2014's avatar
chrisxu2014 已提交
265 266
        :type target_db: float
        :param max_gain_db: Max amount of gain in dB that can be applied for
chrisxu2014's avatar
chrisxu2014 已提交
267 268 269
                            normalization. This is to prevent nans when
                            attempting to normalize a signal consisting of
                            all zeros.
chrisxu2014's avatar
chrisxu2014 已提交
270 271 272
        :type max_gain_db: float
        :raises ValueError: If the required gain to normalize the segment to
                            the target_db value exceeds max_gain_db.
chrisxu2014's avatar
chrisxu2014 已提交
273 274 275 276
        """
        gain = target_db - self.rms_db
        if gain > max_gain_db:
            raise ValueError(
chrisxu2014's avatar
chrisxu2014 已提交
277 278 279
                "Unable to normalize segment to %f dB because the "
                "the probable gain have exceeds max_gain_db (%f dB)" %
                (target_db, max_gain_db))
chrisxu2014's avatar
chrisxu2014 已提交
280
        self.apply_gain(min(max_gain_db, target_db - self.rms_db))
chrisxu2014's avatar
chrisxu2014 已提交
281 282 283 284 285 286

    def normalize_online_bayesian(self,
                                  target_db,
                                  prior_db,
                                  prior_samples,
                                  startup_delay=0.0):
chrisxu2014's avatar
chrisxu2014 已提交
287 288 289
        """Normalize audio using a production-compatible online/causal
        algorithm. This uses an exponential likelihood and gamma prior to
        make online estimates of the RMS even when there are very few samples.
chrisxu2014's avatar
chrisxu2014 已提交
290 291 292

        Note that this is an in-place transformation.

chrisxu2014's avatar
chrisxu2014 已提交
293
        :param target_db: Target RMS value in decibels.
chrisxu2014's avatar
chrisxu2014 已提交
294
        :type target_bd: float
chrisxu2014's avatar
chrisxu2014 已提交
295
        :param prior_db: Prior RMS estimate in decibels.
chrisxu2014's avatar
chrisxu2014 已提交
296
        :type prior_db: float
chrisxu2014's avatar
chrisxu2014 已提交
297
        :param prior_samples: Prior strength in number of samples.
chrisxu2014's avatar
chrisxu2014 已提交
298
        :type prior_samples: float
chrisxu2014's avatar
chrisxu2014 已提交
299
        :param startup_delay: Default 0.0s. If provided, this function will
chrisxu2014's avatar
chrisxu2014 已提交
300 301
                              accrue statistics for the first startup_delay 
                              seconds before applying online normalization.
chrisxu2014's avatar
chrisxu2014 已提交
302
        :type startup_delay: float
chrisxu2014's avatar
chrisxu2014 已提交
303
        """
chrisxu2014's avatar
chrisxu2014 已提交
304
        # Estimate total RMS online.
chrisxu2014's avatar
chrisxu2014 已提交
305 306 307 308 309
        startup_sample_idx = min(self.num_samples - 1,
                                 int(self.sample_rate * startup_delay))
        prior_mean_squared = 10.**(prior_db / 10.)
        prior_sum_of_squares = prior_mean_squared * prior_samples
        cumsum_of_squares = np.cumsum(self.samples**2)
chrisxu2014's avatar
chrisxu2014 已提交
310
        sample_count = np.arange(len(self.num_samples)) + 1
chrisxu2014's avatar
chrisxu2014 已提交
311 312 313 314 315 316 317 318
        if startup_sample_idx > 0:
            cumsum_of_squares[:startup_sample_idx] = \
                cumsum_of_squares[startup_sample_idx]
            sample_count[:startup_sample_idx] = \
                sample_count[startup_sample_idx]
        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
                                 (sample_count + prior_samples))
        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
chrisxu2014's avatar
chrisxu2014 已提交
319
        # Compute required time-varying gain.
chrisxu2014's avatar
chrisxu2014 已提交
320 321 322 323
        gain_db = target_db - rms_estimate_db
        self.apply_gain(gain_db)

    def resample(self, target_sample_rate, quality='sinc_medium'):
        """Resample the audio to a target sample rate.

        Note that this is an in-place transformation; nothing is returned.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
        :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
                        Sets resampling speed/quality tradeoff.
                        See http://www.mega-nerd.com/SRC/api_misc.html#Converters
        :type quality: str
        """
        ratio = target_sample_rate / self._sample_rate
        converted = scikits.samplerate.resample(
            self._samples, r=ratio, type=quality)
        self._samples = converted
        self._sample_rate = target_sample_rate
339

340
    def pad_silence(self, duration, sides='both'):
chrisxu2014's avatar
chrisxu2014 已提交
341
        """Pad this audio sample with a period of silence.
chrisxu2014's avatar
chrisxu2014 已提交
342 343 344

        Note that this is an in-place transformation.

chrisxu2014's avatar
chrisxu2014 已提交
345
        :param duration: Length of silence in seconds to pad.
chrisxu2014's avatar
chrisxu2014 已提交
346
        :type duration: float
chrisxu2014's avatar
chrisxu2014 已提交
347 348 349
        :param sides: Position for padding:
                     'beginning' - adds silence in the beginning;
                     'end' - adds silence in the end;
chrisxu2014's avatar
chrisxu2014 已提交
350 351
                     'both' - adds silence in both the beginning and the end.
        :type sides: str
chrisxu2014's avatar
chrisxu2014 已提交
352
        :raises ValueError: If sides is not supported.
chrisxu2014's avatar
chrisxu2014 已提交
353 354 355
        """
        if duration == 0.0:
            return self
chrisxu2014's avatar
chrisxu2014 已提交
356
        cls = type(self)
chrisxu2014's avatar
chrisxu2014 已提交
357
        silence = self.make_silence(duration, self._sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
358
        if sides == "beginning":
chrisxu2014's avatar
chrisxu2014 已提交
359
            padded = cls.concatenate(silence, self)
chrisxu2014's avatar
chrisxu2014 已提交
360
        elif sides == "end":
chrisxu2014's avatar
chrisxu2014 已提交
361
            padded = cls.concatenate(self, silence)
chrisxu2014's avatar
chrisxu2014 已提交
362
        elif sides == "both":
chrisxu2014's avatar
chrisxu2014 已提交
363
            padded = cls.concatenate(silence, self, silence)
chrisxu2014's avatar
chrisxu2014 已提交
364
        else:
chrisxu2014's avatar
chrisxu2014 已提交
365
            raise ValueError("Unknown value for the kwarg %s" % sides)
chrisxu2014's avatar
chrisxu2014 已提交
366
        self._samples = padded._samples
367 368

    def subsegment(self, start_sec=None, end_sec=None):
chrisxu2014's avatar
chrisxu2014 已提交
369 370
        """Return new AudioSegment containing audio between given boundaries.

chrisxu2014's avatar
chrisxu2014 已提交
371
        :param start_sec: Beginning of subsegment in seconds.
chrisxu2014's avatar
chrisxu2014 已提交
372
        :type start_sec: float
chrisxu2014's avatar
chrisxu2014 已提交
373
        :param end_sec: End of subsegment in seconds.
chrisxu2014's avatar
chrisxu2014 已提交
374
        :type end_sec: float
chrisxu2014's avatar
chrisxu2014 已提交
375
        """
chrisxu2014's avatar
chrisxu2014 已提交
376 377
        start_sec = 0.0 if start_sec is None else start_sec
        end_sec = self.duration if end_sec is None else end_sec
chrisxu2014's avatar
chrisxu2014 已提交
378 379 380 381 382 383
        if start_sec < 0.0:
            start_sec = self.duration + start_sec
        if end_sec < 0.0:
            end_sec = self.duration + end_sec
        start_sample = int(round(start_sec * self._sample_rate))
        end_sample = int(round(end_sec * self._sample_rate))
chrisxu2014's avatar
chrisxu2014 已提交
384
        self._samples = self._samples[start_sample:end_sample]
chrisxu2014's avatar
chrisxu2014 已提交
385 386

    def random_subsegment(self, subsegment_length, rng=None):
chrisxu2014's avatar
chrisxu2014 已提交
387
        """Return a random subsegment of a specified length in seconds.
chrisxu2014's avatar
chrisxu2014 已提交
388 389

        :param subsegment_length: Subsegment length in seconds.
chrisxu2014's avatar
chrisxu2014 已提交
390
        :type subsegment_length: float
chrisxu2014's avatar
chrisxu2014 已提交
391
        :param rng: Random number generator state.
chrisxu2014's avatar
chrisxu2014 已提交
392
        :type rng: random.Random
chrisxu2014's avatar
chrisxu2014 已提交
393 394
        :raises ValueError: If the length of subsegment greater than
                            origineal segemnt.
chrisxu2014's avatar
chrisxu2014 已提交
395
        """
chrisxu2014's avatar
chrisxu2014 已提交
396
        rng = random.Random() if rng is None else rng
chrisxu2014's avatar
chrisxu2014 已提交
397 398 399 400
        if subsegment_length > self.duration:
            raise ValueError("Length of subsegment must not be greater "
                             "than original segment.")
        start_time = rng.uniform(0.0, self.duration - subsegment_length)
chrisxu2014's avatar
chrisxu2014 已提交
401
        self.subsegment(start_time, start_time + subsegment_length)
chrisxu2014's avatar
chrisxu2014 已提交
402

chrisxu2014's avatar
chrisxu2014 已提交
403
    def convolve(self, impulse_segment, allow_resample=False):
chrisxu2014's avatar
chrisxu2014 已提交
404
        """Convolve this audio segment with the given impulse_segment.
chrisxu2014's avatar
chrisxu2014 已提交
405

chrisxu2014's avatar
chrisxu2014 已提交
406
        Note that this is an in-place transformation.
chrisxu2014's avatar
chrisxu2014 已提交
407

chrisxu2014's avatar
chrisxu2014 已提交
408 409
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
410 411 412 413
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample 
                               rate from this signal.
        :type allow_resample: bool
chrisxu2014's avatar
chrisxu2014 已提交
414
        :raises ValueError: If the sample rate is not match between two
chrisxu2014's avatar
chrisxu2014 已提交
415
                            audio segments when resample is not allowed.
chrisxu2014's avatar
chrisxu2014 已提交
416 417 418 419 420 421 422 423 424
        """
        if allow_resample and self.sample_rate != impulse_segment.sample_rate:
            impulse_segment = impulse_segment.resample(self.sample_rate)
        if self.sample_rate != impulse_segment.sample_rate:
            raise ValueError("Impulse segment's sample rate (%d Hz) is not"
                             "equal to base signal sample rate (%d Hz)." %
                             (impulse_segment.sample_rate, self.sample_rate))
        samples = signal.fftconvolve(self.samples, impulse_segment.samples,
                                     "full")
chrisxu2014's avatar
chrisxu2014 已提交
425 426
        self._samples = samples

chrisxu2014's avatar
chrisxu2014 已提交
427
    def convolve_and_normalize(self, impulse_segment, allow_resample=False):
chrisxu2014's avatar
chrisxu2014 已提交
428 429 430
        """Convolve and normalize the resulting audio segment so that it
        has the same average power as the input signal.

chrisxu2014's avatar
chrisxu2014 已提交
431 432
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
433 434 435 436
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
chrisxu2014's avatar
chrisxu2014 已提交
437
        """
chrisxu2014's avatar
chrisxu2014 已提交
438 439 440
        target_db = self.rms_db
        self.convolve(impulse_segment, allow_resample=allow_resample)
        self.normalize(target_db)
chrisxu2014's avatar
chrisxu2014 已提交
441 442 443 444 445 446 447 448 449 450 451 452

    def add_noise(self,
                  noise,
                  snr_dB,
                  allow_downsampling=False,
                  max_gain_db=300.0,
                  rng=None):
        """Adds the given noise segment at a specific signal-to-noise ratio.
        If the noise segment is longer than this segment, a random subsegment
        of matching length is sampled from it and used instead.

        :param noise: Noise signal to add.
chrisxu2014's avatar
chrisxu2014 已提交
453
        :type noise: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
454
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
chrisxu2014's avatar
chrisxu2014 已提交
455
        :type snr_dB: float
chrisxu2014's avatar
chrisxu2014 已提交
456 457 458 459 460 461 462
        :param allow_downsampling: Whether to allow the noise signal to be
                                   downsampled to match the base signal sample
                                   rate.
        :type allow_downsampling: bool
        :param max_gain_db: Maximum amount of gain to apply to noise signal
                            before adding it in. This is to prevent attempting
                            to apply infinite gain to a zero signal.
chrisxu2014's avatar
chrisxu2014 已提交
463
        :type max_gain_db: float
chrisxu2014's avatar
chrisxu2014 已提交
464
        :param rng: Random number generator state.
chrisxu2014's avatar
chrisxu2014 已提交
465 466
        :type rng: None|random.Random
        :raises ValueError: If the sample rate does not match between the two
chrisxu2014's avatar
chrisxu2014 已提交
467 468
                            audio segments when downsampling is not allowed, or
                            if the duration of noise segments is shorter than
chrisxu2014's avatar
chrisxu2014 已提交
469
                            original audio segments.
chrisxu2014's avatar
chrisxu2014 已提交
470
        """
chrisxu2014's avatar
chrisxu2014 已提交
471
        rng = random.Random() if rng is None else rng
chrisxu2014's avatar
chrisxu2014 已提交
472 473 474
        if allow_downsampling and noise.sample_rate > self.sample_rate:
            noise = noise.resample(self.sample_rate)
        if noise.sample_rate != self.sample_rate:
chrisxu2014's avatar
chrisxu2014 已提交
475 476 477
            raise ValueError("Noise sample rate (%d Hz) is not equal to base "
                             "signal sample rate (%d Hz)." % (noise.sample_rate,
                                                              self.sample_rate))
chrisxu2014's avatar
chrisxu2014 已提交
478
        if noise.duration < self.duration:
chrisxu2014's avatar
chrisxu2014 已提交
479 480
            raise ValueError("Noise signal (%f sec) must be at least as long as"
                             " base signal (%f sec)." %
chrisxu2014's avatar
chrisxu2014 已提交
481
                             (noise.duration, self.duration))
chrisxu2014's avatar
chrisxu2014 已提交
482 483 484 485
        noise_gain_db = min(self.rms_db - noise.rms_db - snr_dB, max_gain_db)
        noise.random_subsegment(self.duration, rng=rng)
        noise.apply_gain(noise_gain_db)
        self.superimposed(noise)
chrisxu2014's avatar
chrisxu2014 已提交
486

487 488
    @property
    def samples(self):
489 490 491 492 493
        """Return audio samples.

        :return: Audio samples.
        :rtype: ndarray
        """
494 495 496 497
        return self._samples.copy()

    @property
    def sample_rate(self):
498 499 500 501 502
        """Return audio sample rate.

        :return: Audio sample rate.
        :rtype: int
        """
503 504 505
        return self._sample_rate

    @property
506 507
    def num_samples(self):
        """Return number of samples.
508

509 510 511
        :return: Number of samples.
        :rtype: int
        """
chrisxu2014's avatar
chrisxu2014 已提交
512
        return self._samples.shape[0]
513

514 515 516
    @property
    def duration(self):
        """Return audio duration.
517

518 519 520 521
        :return: Audio duration in seconds.
        :rtype: float
        """
        return self._samples.shape[0] / float(self._sample_rate)
522 523

    @property
524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555
    def rms_db(self):
        """Return root mean square energy of the audio in decibels.

        :return: Root mean square energy in decibels.
        :rtype: float
        """
        # square root => multiply by 10 instead of 20 for dBs
        mean_square = np.mean(self._samples**2)
        return 10 * np.log10(mean_square)

    def _convert_samples_to_float32(self, samples):
        """Convert sample type to float32.

        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
        float32_samples = samples.astype('float32')
        if samples.dtype in np.sctypes['int']:
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2**(bits - 1))
        elif samples.dtype in np.sctypes['float']:
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples

    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.
        
        Audio sample type is usually integer or float-point. For integer
        type, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.
chrisxu2014's avatar
chrisxu2014 已提交
556

557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575
        This is for writing a audio file.
        """
        dtype = np.dtype(dtype)
        output_samples = samples.copy()
        if dtype in np.sctypes['int']:
            bits = np.iinfo(dtype).bits
            output_samples *= (2**(bits - 1) / 1.)
            min_val = np.iinfo(dtype).min
            max_val = np.iinfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        elif samples.dtype in np.sctypes['float']:
            min_val = np.finfo(dtype).min
            max_val = np.finfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return output_samples.astype(dtype)