audio.py 24.4 KB
Newer Older
1 2 3 4 5
"""Contains the audio segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

6 7 8
import numpy as np
import io
import soundfile
chrisxu2014's avatar
chrisxu2014 已提交
9 10
import scikits.samplerate
from scipy import signal
chrisxu2014's avatar
chrisxu2014 已提交
11
import random
12 13 14 15


class AudioSegment(object):
    """Monaural audio segment abstraction.
16 17 18 19 20 21

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
22 23 24
    """

    def __init__(self, samples, sample_rate):
25 26 27 28 29
        """Create audio segment from samples.

        Samples are convert float32 internally, with int scaled to [-1, 1].
        """
        self._samples = self._convert_samples_to_float32(samples)
30 31 32 33
        self._sample_rate = sample_rate
        if self._samples.ndim >= 2:
            self._samples = np.mean(self._samples, 1)

34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49
    def __eq__(self, other):
        """Return whether two objects are equal."""
        if type(other) is not type(self):
            return False
        if self._sample_rate != other._sample_rate:
            return False
        if self._samples.shape != other._samples.shape:
            return False
        if np.any(self.samples != other._samples):
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

chrisxu2014's avatar
chrisxu2014 已提交
50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
    def __len__(self):
        """Returns length of segment in samples."""
        return self.num_samples

    def __add__(self, other):
        """Add samples from another segment to those of this segment and return
        a new segment (sample-wise addition, not segment concatenation).

        :param other: Segment containing samples to be
                      added in.
        :type other: AudioSegment
        :return: New segment containing resulting samples.
        :rtype: AudioSegment
        :raise TypeError: If sample rates of segments don't match,
                          or if length of segments don't match.
        """
        if type(self) != type(other):
            raise TypeError("Cannot add segment of different type: {}"
                            .format(type(other)))
        if self._sample_rate != other._sample_rate:
            raise TypeError("Sample rates must match to add segments.")
        if len(self._samples) != len(other._samples):
            raise TypeError("Segment lengths must match to add segments.")
        samples = self.samples + other.samples
        return type(self)(samples, sample_rate=self._sample_rate)

76 77 78 79 80 81
    def __str__(self):
        """Return human-readable representation of segment."""
        return ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
                "rms=%.2fdB" % (type(self), self.num_samples, self.sample_rate,
                                self.duration, self.rms_db))

82
    @classmethod
    def from_file(cls, file):
        """Create an audio segment by reading an audio file.

        The file is read with ``soundfile.read`` as float32 samples.

        :param file: Filepath or file object of the audio file.
        :type file: basestring|file
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        audio, rate = soundfile.read(file, dtype='float32')
        return cls(audio, rate)

    @classmethod
    def from_bytes(cls, bytes):
        """Create an audio segment from the in-memory bytes of an audio file.

        The byte string is wrapped in a ``BytesIO`` buffer and decoded with
        ``soundfile.read`` as float32 samples.

        :param bytes: Byte string containing the audio content.
        :type bytes: str
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        buffer = io.BytesIO(bytes)
        audio, rate = soundfile.read(buffer, dtype='float32')
        return cls(audio, rate)

chrisxu2014's avatar
chrisxu2014 已提交
107
    def concatenate(self, *segments):
chrisxu2014's avatar
chrisxu2014 已提交
108 109
        """Concatenate an arbitrary number of audio segments together.

chrisxu2014's avatar
chrisxu2014 已提交
110 111 112 113 114 115 116
        :param *segments: Input audio segments
        :type *segments: AudioSegment
        :return: Audio segment instance.
        :rtype: AudioSegment
        :raises ValueError: If number of segments is zero, or if sample_rate
                            not match between two audio segments
        :raises TypeError: If item of segments is not Audiosegment instance
chrisxu2014's avatar
chrisxu2014 已提交
117 118
        """
        # Perform basic sanity-checks.
chrisxu2014's avatar
chrisxu2014 已提交
119
        if len(segments) == 0:
chrisxu2014's avatar
chrisxu2014 已提交
120 121
            raise ValueError("No audio segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
chrisxu2014's avatar
chrisxu2014 已提交
122 123
        for seg in segments:
            if sample_rate != seg._sample_rate:
chrisxu2014's avatar
chrisxu2014 已提交
124 125
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
chrisxu2014's avatar
chrisxu2014 已提交
126
            if type(seg) is not type(self):
chrisxu2014's avatar
chrisxu2014 已提交
127 128 129
                raise TypeError("Only audio segments of the same type "
                                "instance can be concatenated.")
        samples = np.concatenate([seg.samples for seg in segments])
chrisxu2014's avatar
chrisxu2014 已提交
130
        return type(self)(samples, sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
131

132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156
    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.
        
        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: basestring|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported.
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        subtype_map = {
            'int16': 'PCM_16',
            'int32': 'PCM_32',
            'float32': 'FLOAT',
            'float64': 'DOUBLE'
        }
        soundfile.write(
            filepath,
            samples,
            self._sample_rate,
            format='WAV',
            subtype=subtype_map[dtype])

chrisxu2014's avatar
chrisxu2014 已提交
157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
    def slice_from_file(self, file, start=None, end=None):
        """Load a section of an audio file without reading the whole file
        into memory.

        :param file: Input audio filepath.
        :type file: basestring
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :return: The specified slice of the input audio, as a new segment of
                 the same type as ``self``.
        :rtype: AudioSegment
        :raises ValueError: If a position is out of bounds, or if start is
                            later than end.
        """
        # The context manager closes the file handle even when a bounds
        # check below raises.
        with soundfile.SoundFile(file) as sndfile:
            sample_rate = sndfile.samplerate
            duration = float(len(sndfile)) / sample_rate
            start = 0. if start is None else start
            # A missing end means "read to the end of the file".
            end = duration if end is None else end
            # Negative positions are relative to the end of the file.
            if start < 0.0:
                start += duration
            if end < 0.0:
                end += duration
            if start < 0.0:
                raise ValueError("The slice start position (%f s) is out of "
                                 "bounds. Filename: %s" % (start, file))
            if end < 0.0:
                raise ValueError("The slice end position (%f s) is out of "
                                 "bounds Filename: %s" % (end, file))
            if start > end:
                raise ValueError("The slice start position (%f s) is later "
                                 "than the slice end position (%f s)." %
                                 (start, end))
            if end > duration:
                raise ValueError("The slice end time (%f s) is out of bounds "
                                 "(> %f s) Filename: %s" %
                                 (end, duration, file))
            start_frame = int(start * sample_rate)
            end_frame = int(end * sample_rate)
            sndfile.seek(start_frame)
            data = sndfile.read(
                frames=end_frame - start_frame, dtype='float32')
        return type(self)(data, sample_rate)

    def make_silence(self, duration, sample_rate):
        """Creates a silent audio segment of the given duration and
        sample rate.

        :param duration: Length of silence in seconds
        :type duration: float
        :param sample_rate: Sample rate
        :type sample_rate: float
        :return: Silence of the given duration
        :rtype: AudioSegment
        """
        samples = np.zeros(int(duration * sample_rate))
        return type(self)(samples, sample_rate)

216 217 218 219 220 221 222 223 224 225 226 227
    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.
        
        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
        :rtype: str
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        return samples.tostring()

228
    def apply_gain(self, gain):
229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258
        """Apply gain in decibels to samples.

        Note that this is an in-place transformation.
        
        :param gain: Gain in decibels to apply to samples. 
        :type gain: float
        """
        self._samples *= 10.**(gain / 20.)

    def change_speed(self, speed_rate):
        """Change the audio speed by linear interpolation.

        Note that this is an in-place transformation.
        
        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
                           speed_rate < 1.0, slow down the audio;
                           speed_rate <= 0.0, not allowed, raise ValueError.
        :type speed_rate: float
        :raises ValueError: If speed_rate <= 0.0.
        """
        if speed_rate <= 0:
            raise ValueError("speed_rate should be greater than zero.")
        old_length = self._samples.shape[0]
        new_length = int(old_length / speed_rate)
        old_indices = np.arange(old_length)
        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
        self._samples = np.interp(new_indices, old_indices, self._samples)

chrisxu2014's avatar
chrisxu2014 已提交
259
    def normalize(self, target_db=-20, max_gain_db=300.0):
chrisxu2014's avatar
chrisxu2014 已提交
260
        """Normalize audio to be desired RMS value in decibels.
chrisxu2014's avatar
chrisxu2014 已提交
261 262 263

        Note that this is an in-place transformation.

chrisxu2014's avatar
chrisxu2014 已提交
264 265 266 267 268 269 270 271 272
        :param target_db: Target RMS value in decibels. This value should
                          be less than 0.0 as 0.0 is full-scale audio.
        :type target_db: float
        :param max_gain_db: Max amount of gain in dB that can be applied for
                            normalization. This is to prevent nans when attempting
                            to normalize a signal consisting of all zeros.
        :type max_gain_db: float
        :raises ValueError: If the required gain to normalize the segment to
                            the target_db value exceeds max_gain_db.
chrisxu2014's avatar
chrisxu2014 已提交
273 274 275 276
        """
        gain = target_db - self.rms_db
        if gain > max_gain_db:
            raise ValueError(
chrisxu2014's avatar
chrisxu2014 已提交
277 278 279 280
                "Unable to normalize segment to %f dB because it has an RMS "
                "value of %f dB and the difference exceeds max_gain_db (%f dB)"
                % (target_db, self.rms_db, max_gain_db))
        self.apply_gain(min(max_gain_db, target_db - self.rms_db))
chrisxu2014's avatar
chrisxu2014 已提交
281 282 283 284 285 286

    def normalize_online_bayesian(self,
                                  target_db,
                                  prior_db,
                                  prior_samples,
                                  startup_delay=0.0):
chrisxu2014's avatar
chrisxu2014 已提交
287 288 289
        """Normalize audio using a production-compatible online/causal algorithm.
        This uses an exponential likelihood and gamma prior to make online estimates
        of the RMS even when there are very few samples.
chrisxu2014's avatar
chrisxu2014 已提交
290 291 292 293

        Note that this is an in-place transformation.

        :param target_db: Target RMS value in decibels
chrisxu2014's avatar
chrisxu2014 已提交
294
        :type target_bd: float
chrisxu2014's avatar
chrisxu2014 已提交
295
        :param prior_db: Prior RMS estimate in decibels
chrisxu2014's avatar
chrisxu2014 已提交
296
        :type prior_db: float
chrisxu2014's avatar
chrisxu2014 已提交
297
        :param prior_samples: Prior strength in number of samples
chrisxu2014's avatar
chrisxu2014 已提交
298 299 300 301 302
        :type prior_samples: float
        :param startup_delay: Default 0.0 s. If provided, this function will accrue
                              statistics for the first startup_delay seconds before
                              applying online normalization.
        :type startup_delay: float
chrisxu2014's avatar
chrisxu2014 已提交
303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323
        """
        # Estimate total RMS online
        startup_sample_idx = min(self.num_samples - 1,
                                 int(self.sample_rate * startup_delay))
        prior_mean_squared = 10.**(prior_db / 10.)
        prior_sum_of_squares = prior_mean_squared * prior_samples
        cumsum_of_squares = np.cumsum(self.samples**2)
        sample_count = np.arange(len(self)) + 1
        if startup_sample_idx > 0:
            cumsum_of_squares[:startup_sample_idx] = \
                cumsum_of_squares[startup_sample_idx]
            sample_count[:startup_sample_idx] = \
                sample_count[startup_sample_idx]
        mean_squared_estimate = ((cumsum_of_squares + prior_sum_of_squares) /
                                 (sample_count + prior_samples))
        rms_estimate_db = 10 * np.log10(mean_squared_estimate)
        # Compute required time-varying gain
        gain_db = target_db - rms_estimate_db
        self.apply_gain(gain_db)

    def resample(self, target_sample_rate, quality='sinc_medium'):
        """Resample the audio to a new sample rate.

        Note that this is an in-place transformation; nothing is returned.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
        :param quality: One of {'sinc_fastest', 'sinc_medium', 'sinc_best'}.
                        Sets resampling speed/quality tradeoff.
                        See http://www.mega-nerd.com/SRC/api_misc.html#Converters
        :type quality: basestring
        """
        ratio = target_sample_rate / self._sample_rate
        self._samples = scikits.samplerate.resample(
            self._samples, r=ratio, type=quality)
        self._sample_rate = target_sample_rate
341

342
    def pad_silence(self, duration, sides='both'):
chrisxu2014's avatar
chrisxu2014 已提交
343 344 345 346
        """Pads this audio sample with a period of silence.

        Note that this is an in-place transformation.

chrisxu2014's avatar
chrisxu2014 已提交
347
        :param duration: Length of silence in seconds to pad
chrisxu2014's avatar
chrisxu2014 已提交
348
        :type duration: float
chrisxu2014's avatar
chrisxu2014 已提交
349 350 351 352 353 354
        :param sides: Position for padding
                     'beginning' - adds silence in the beginning
                     'end' - adds silence in the end
                     'both' - adds silence in both the beginning and the end.
        :type sides: str
        :raises ValueError: If the sides not surport
chrisxu2014's avatar
chrisxu2014 已提交
355 356 357
        """
        if duration == 0.0:
            return self
chrisxu2014's avatar
chrisxu2014 已提交
358
        silence = self.make_silence(duration, self._sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
359
        if sides == "beginning":
chrisxu2014's avatar
chrisxu2014 已提交
360
            padded = self.concatenate(silence, self)
chrisxu2014's avatar
chrisxu2014 已提交
361
        elif sides == "end":
chrisxu2014's avatar
chrisxu2014 已提交
362
            padded = self.concatenate(self, silence)
chrisxu2014's avatar
chrisxu2014 已提交
363
        elif sides == "both":
chrisxu2014's avatar
chrisxu2014 已提交
364
            padded = self.concatenate(silence, self, silence)
chrisxu2014's avatar
chrisxu2014 已提交
365
        else:
chrisxu2014's avatar
chrisxu2014 已提交
366
            raise ValueError("Unknown value for the kwarg %s" % sides)
chrisxu2014's avatar
chrisxu2014 已提交
367 368
        self._samples = padded._samples
        self._sample_rate = padded._sample_rate
369 370

    def subsegment(self, start_sec=None, end_sec=None):
chrisxu2014's avatar
chrisxu2014 已提交
371 372 373
        """Return new AudioSegment containing audio between given boundaries.

        :param start_sec: Beginning of subsegment in seconds,
chrisxu2014's avatar
chrisxu2014 已提交
374 375
                          (beginning of segment if None).
        :type start_sec: float
chrisxu2014's avatar
chrisxu2014 已提交
376
        :param end_sec: End of subsegment in seconds,
chrisxu2014's avatar
chrisxu2014 已提交
377 378 379 380
                        (end of segment if None).
        :type end_sec: float
        :return: New AudioSegment containing specified subsegment.
        :rtype: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
381
        """
chrisxu2014's avatar
chrisxu2014 已提交
382 383
        start_sec = 0.0 if start_sec is None else start_sec
        end_sec = self.duration if end_sec is None else end_sec
chrisxu2014's avatar
chrisxu2014 已提交
384 385 386 387 388 389 390 391 392 393 394
        # negative boundaries are relative to end of segment
        if start_sec < 0.0:
            start_sec = self.duration + start_sec
        if end_sec < 0.0:
            end_sec = self.duration + end_sec
        start_sample = int(round(start_sec * self._sample_rate))
        end_sample = int(round(end_sec * self._sample_rate))
        samples = self._samples[start_sample:end_sample]
        return type(self)(samples, sample_rate=self._sample_rate)

    def random_subsegment(self, subsegment_length, rng=None):
chrisxu2014's avatar
chrisxu2014 已提交
395
        """Return a random subsegment of a specified length in seconds.
chrisxu2014's avatar
chrisxu2014 已提交
396 397

        :param subsegment_length: Subsegment length in seconds.
chrisxu2014's avatar
chrisxu2014 已提交
398
        :type subsegment_length: float
chrisxu2014's avatar
chrisxu2014 已提交
399
        :param rng: Random number generator state
chrisxu2014's avatar
chrisxu2014 已提交
400 401 402 403 404 405
        :type rng: random.Random
        :return: New AudioSegment containing random subsegment
                 of original segment
        :rtype: AudioSegment
        :raises ValueError: If the length of subsegment greater than origineal
                            segemnt.
chrisxu2014's avatar
chrisxu2014 已提交
406
        """
chrisxu2014's avatar
chrisxu2014 已提交
407
        rng = random.Random() if rng is None else rng
chrisxu2014's avatar
chrisxu2014 已提交
408 409 410 411 412 413
        if subsegment_length > self.duration:
            raise ValueError("Length of subsegment must not be greater "
                             "than original segment.")
        start_time = rng.uniform(0.0, self.duration - subsegment_length)
        return self.subsegment(start_time, start_time + subsegment_length)

chrisxu2014's avatar
chrisxu2014 已提交
414
    def convolve(self, impulse_segment, allow_resample=False):
chrisxu2014's avatar
chrisxu2014 已提交
415 416
        """Convolve this audio segment with the given filter.

chrisxu2014's avatar
chrisxu2014 已提交
417
        Note that this is an in-place transformation.
chrisxu2014's avatar
chrisxu2014 已提交
418

chrisxu2014's avatar
chrisxu2014 已提交
419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: indicates whether resampling is allowed when
                                 the impulse_segment has a different sample 
                                 rate from this signal.
        :type allow_resample: boolean
        :raises ValueError: If the sample rate is not match between two
                            audio segments and resample is not allowed.
        """
        if allow_resample and self.sample_rate != impulse_segment.sample_rate:
            impulse_segment = impulse_segment.resample(self.sample_rate)
        if self.sample_rate != impulse_segment.sample_rate:
            raise ValueError("Impulse segment's sample rate (%d Hz) is not"
                             "equal to base signal sample rate (%d Hz)." %
                             (impulse_segment.sample_rate, self.sample_rate))
        samples = signal.fftconvolve(self.samples, impulse_segment.samples,
                                     "full")
chrisxu2014's avatar
chrisxu2014 已提交
436 437
        self._samples = samples

chrisxu2014's avatar
chrisxu2014 已提交
438
    def convolve_and_normalize(self, impulse_segment, allow_resample=False):
chrisxu2014's avatar
chrisxu2014 已提交
439 440 441
        """Convolve and normalize the resulting audio segment so that it
        has the same average power as the input signal.

chrisxu2014's avatar
chrisxu2014 已提交
442 443 444 445 446
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: indicates whether resampling is allowed when
                               the impulse_segment has a different sample rate from this signal.
        :type allow_resample: boolean
chrisxu2014's avatar
chrisxu2014 已提交
447
        """
chrisxu2014's avatar
chrisxu2014 已提交
448 449 450
        target_db = self.rms_db
        self.convolve(impulse_segment, allow_resample=allow_resample)
        self.normalize(target_db)
chrisxu2014's avatar
chrisxu2014 已提交
451 452 453 454 455 456 457 458 459 460 461 462

    def add_noise(self,
                  noise,
                  snr_dB,
                  allow_downsampling=False,
                  max_gain_db=300.0,
                  rng=None):
        """Adds the given noise segment at a specific signal-to-noise ratio.
        If the noise segment is longer than this segment, a random subsegment
        of matching length is sampled from it and used instead.

        :param noise: Noise signal to add.
chrisxu2014's avatar
chrisxu2014 已提交
463
        :type noise: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
464
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
chrisxu2014's avatar
chrisxu2014 已提交
465 466 467
        :type snr_dB: float
        :param allow_downsampling: whether to allow the noise signal to be downsampled
                                   to match the base signal sample rate.
chrisxu2014's avatar
chrisxu2014 已提交
468
        :type allow_downsampling: boolean
chrisxu2014's avatar
chrisxu2014 已提交
469 470 471 472
        :param max_gain_db: Maximum amount of gain to apply to noise signal before
                            adding it in. This is to prevent attempting to apply infinite
                            gain to a zero signal.
        :type max_gain_db: float
chrisxu2014's avatar
chrisxu2014 已提交
473 474
        :param rng: Random number generator state.
        :type rng: random.Random
chrisxu2014's avatar
chrisxu2014 已提交
475 476 477
        :raises ValueError: If the sample rate does not match between the two audio segments
                            and resample is not allowed, or if the duration of noise segments
                            is shorter than original audio segments.
chrisxu2014's avatar
chrisxu2014 已提交
478
        """
chrisxu2014's avatar
chrisxu2014 已提交
479
        rng = random.Random() if rng is None else rng
chrisxu2014's avatar
chrisxu2014 已提交
480 481 482
        if allow_downsampling and noise.sample_rate > self.sample_rate:
            noise = noise.resample(self.sample_rate)
        if noise.sample_rate != self.sample_rate:
chrisxu2014's avatar
chrisxu2014 已提交
483 484 485
            raise ValueError("Noise sample rate (%d Hz) is not equal to "
                             "base signal sample rate (%d Hz)." %
                             (noise.sample_rate, self.sample_rate))
chrisxu2014's avatar
chrisxu2014 已提交
486
        if noise.duration < self.duration:
chrisxu2014's avatar
chrisxu2014 已提交
487 488 489
            raise ValueError("Noise signal (%f sec) must be at "
                             "least as long as base signal (%f sec)." %
                             (noise.duration, self.duration))
chrisxu2014's avatar
chrisxu2014 已提交
490 491 492 493 494 495
        noise_gain_db = self.rms_db - noise.rms_db - snr_dB
        noise_gain_db = min(max_gain_db, noise_gain_db)
        noise_subsegment = noise.random_subsegment(self.duration, rng=rng)
        output = self + self.tranform_noise(noise_subsegment, noise_gain_db)
        self._samples = output._samples
        self._sample_rate = output._sample_rate
496

chrisxu2014's avatar
chrisxu2014 已提交
497 498 499 500 501 502
    def tranform_noise(self, noise_subsegment, noise_gain_db):
        """ tranform noise file
        """
        return type(self)(noise_subsegment._samples * (10.**(
            noise_gain_db / 20.)), noise_subsegment._sample_rate)

503 504
    @property
    def samples(self):
505 506 507 508 509
        """Return audio samples.

        :return: Audio samples.
        :rtype: ndarray
        """
510 511 512 513
        return self._samples.copy()

    @property
    def sample_rate(self):
514 515 516 517 518
        """Return audio sample rate.

        :return: Audio sample rate.
        :rtype: int
        """
519 520 521
        return self._sample_rate

    @property
522 523
    def num_samples(self):
        """Return number of samples.
524

525 526 527
        :return: Number of samples.
        :rtype: int
        """
chrisxu2014's avatar
chrisxu2014 已提交
528
        return self._samples.shape[0]
529

530 531 532
    @property
    def duration(self):
        """Return audio duration.
533

534 535 536 537
        :return: Audio duration in seconds.
        :rtype: float
        """
        return self._samples.shape[0] / float(self._sample_rate)
538 539

    @property
540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591
    def rms_db(self):
        """Return root mean square energy of the audio in decibels.

        :return: Root mean square energy in decibels.
        :rtype: float
        """
        # square root => multiply by 10 instead of 20 for dBs
        mean_square = np.mean(self._samples**2)
        return 10 * np.log10(mean_square)

    def _convert_samples_to_float32(self, samples):
        """Convert sample type to float32.

        Audio sample type is usually integer or float-point.
        Integers will be scaled to [-1, 1] in float32.
        """
        float32_samples = samples.astype('float32')
        if samples.dtype in np.sctypes['int']:
            bits = np.iinfo(samples.dtype).bits
            float32_samples *= (1. / 2**(bits - 1))
        elif samples.dtype in np.sctypes['float']:
            pass
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return float32_samples

    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.
        
        Audio sample type is usually integer or float-point. For integer
        type, float32 will be rescaled from [-1, 1] to the maximum range
        supported by the integer type.
        
        This is for writing a audio file.
        """
        dtype = np.dtype(dtype)
        output_samples = samples.copy()
        if dtype in np.sctypes['int']:
            bits = np.iinfo(dtype).bits
            output_samples *= (2**(bits - 1) / 1.)
            min_val = np.iinfo(dtype).min
            max_val = np.iinfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        elif samples.dtype in np.sctypes['float']:
            min_val = np.finfo(dtype).min
            max_val = np.finfo(dtype).max
            output_samples[output_samples > max_val] = max_val
            output_samples[output_samples < min_val] = min_val
        else:
            raise TypeError("Unsupported sample type: %s." % samples.dtype)
        return output_samples.astype(dtype)