# audio.py
"""Contains the audio segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

6 7
import numpy as np
import io
8 9
import struct
import re
10
import soundfile
11
import resampy
chrisxu2014's avatar
chrisxu2014 已提交
12
from scipy import signal
chrisxu2014's avatar
chrisxu2014 已提交
13
import random
chrisxu2014's avatar
chrisxu2014 已提交
14
import copy
15 16 17 18


class AudioSegment(object):
    """Monaural audio segment abstraction.
19 20 21 22 23 24

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :raises TypeError: If the sample data type is not float or int.
25 26 27
    """

    def __init__(self, samples, sample_rate):
        """Create audio segment from samples.

        Samples are converted to float32 internally, with integer input
        scaled to [-1, 1]. Input with two or more dimensions is averaged
        over axis 1 so that a one-dimensional signal is stored.

        :param samples: Audio samples [num_samples x num_channels].
        :type samples: ndarray
        :param sample_rate: Audio sample rate.
        :type sample_rate: int
        """
        converted = self._convert_samples_to_float32(samples)
        # Down-mix multi-channel audio by averaging across the channel axis.
        if converted.ndim >= 2:
            converted = np.mean(converted, 1)
        self._samples = converted
        self._sample_rate = sample_rate

37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58
    def __eq__(self, other):
        """Return whether two objects are equal."""
        if type(other) is not type(self):
            return False
        if self._sample_rate != other._sample_rate:
            return False
        if self._samples.shape != other._samples.shape:
            return False
        if np.any(self.samples != other._samples):
            return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    def __str__(self):
        """Return human-readable representation of segment.

        The text lists the segment type, number of samples, sample rate,
        duration in seconds and RMS level in decibels.
        """
        template = ("%s: num_samples=%d, sample_rate=%d, duration=%.2fsec, "
                    "rms=%.2fdB")
        fields = (type(self), self.num_samples, self.sample_rate,
                  self.duration, self.rms_db)
        return template % fields

59
    @classmethod
60 61 62 63 64 65 66 67
    def from_file(cls, file):
        """Create audio segment from audio file.

        A path ending in ".seqbin_<index>" is treated as one utterance
        inside a sequence file and handed to from_sequence_file(); anything
        else is decoded with soundfile.

        :param file: Filepath or file object to audio file.
        :type file: basestring|file
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        # The dot is escaped so that only a literal ".seqbin_<n>" suffix
        # selects the sequence-file reader (an unescaped "." would match any
        # character).
        if isinstance(file, basestring) and re.search(r"\.seqbin_\d+$", file):
            return cls.from_sequence_file(file)
        samples, sample_rate = soundfile.read(file, dtype='float32')
        return cls(samples, sample_rate)
73

74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121
    @classmethod
    def slice_from_file(cls, file, start=None, end=None):
        """Load a small section of an audio file without reading the entire
        file into memory.

        :param file: Input audio filepath or file object.
        :type file: basestring|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :return: AudioSegment instance of the specified slice of the input
                 audio file.
        :rtype: AudioSegment
        :raise ValueError: If start or end is incorrectly set, e.g. out of
                           bounds in time.
        """
        # The context manager closes the sound file even when validation
        # below raises.
        with soundfile.SoundFile(file) as sndfile:
            sample_rate = sndfile.samplerate
            duration = float(len(sndfile)) / sample_rate
            start = 0. if start is None else start
            # A missing end means "read to the end of the file".
            end = duration if end is None else end
            # Negative positions are measured back from the end of the file.
            if start < 0.0:
                start += duration
            if end < 0.0:
                end += duration
            if start < 0.0:
                raise ValueError("The slice start position (%f s) is out of "
                                 "bounds." % start)
            if end < 0.0:
                raise ValueError("The slice end position (%f s) is out of "
                                 "bounds." % end)
            if start > end:
                raise ValueError("The slice start position (%f s) is later "
                                 "than the slice end position (%f s)." %
                                 (start, end))
            if end > duration:
                raise ValueError("The slice end position (%f s) is out of "
                                 "bounds (> %f s)" % (end, duration))
            start_frame = int(start * sample_rate)
            end_frame = int(end * sample_rate)
            sndfile.seek(start_frame)
            data = sndfile.read(
                frames=end_frame - start_frame, dtype='float32')
        return cls(data, sample_rate)

122 123
    @classmethod
    def from_sequence_file(cls, filepath):
        """Create audio segment from sequence file. Sequence file is a binary
        file containing a collection of multiple audio files, with several
        header bytes in the head indicating the offsets of each audio byte data
        chunk.

        The format is:

            4 bytes (int, version),
            4 bytes (int, num of utterance),
            4 bytes (int, bytes per header),
            [bytes_per_header*(num_utterance+1)] bytes (offsets for each audio),
            audio_bytes_data_of_1st_utterance,
            audio_bytes_data_of_2nd_utterance,
            ......

        Sequence file name must end with ".seqbin". And the filename of the 5th
        utterance's audio file in sequence file "xxx.seqbin" must be
        "xxx.seqbin_5", with "5" indicating the utterance index within this
        sequence file (starting from 1).

        :param filepath: Filepath of sequence file.
        :type filepath: basestring
        :return: Audio segment instance.
        :rtype: AudioSegment
        :raises IOError: If the filepath does not look like
                         "xxx.seqbin_<index>", or if the index is outside
                         the range of utterances stored in the file.
        """
        # Split "xxx.seqbin_<index>" into container path and utterance index.
        matches = re.match(r"(.+\.seqbin)_(\d+)", filepath)
        if matches is None:
            raise IOError("File type of %s is not supported" % filepath)
        filename = matches.group(1)
        fileno = int(matches.group(2))

        # The with-block guarantees the file is closed on every exit path.
        with open(filename, 'rb') as f:
            f.read(4)  # skip the version field; it is not used here
            num_utterances = struct.unpack("i", f.read(4))[0]
            bytes_per_header = struct.unpack("i", f.read(4))[0]
            # Utterance indices start from 1; index 0 would otherwise wrap
            # around to the last offset and read unrelated bytes.
            if not 1 <= fileno <= num_utterances:
                raise IOError("Utterance index %d is out of range for %s "
                              "(contains %d utterances)" %
                              (fileno, filename, num_utterances))
            header_bytes = f.read(bytes_per_header * (num_utterances + 1))
            header = [
                struct.unpack("i", header_bytes[bytes_per_header * i:
                                                bytes_per_header * (i + 1)])[0]
                for i in range(num_utterances + 1)
            ]

            # Consecutive offsets delimit the bytes of one utterance.
            f.seek(header[fileno - 1])
            audio_bytes = f.read(header[fileno] - header[fileno - 1])

        # Create audio segment.
        try:
            return cls.from_bytes(audio_bytes)
        except Exception:
            # Deliberate best-effort fallback: if the payload cannot be
            # decoded by from_bytes(), interpret it as raw int16 samples
            # at 8000 Hz.
            samples = np.frombuffer(audio_bytes, dtype='int16')
            return cls(samples=samples, sample_rate=8000)

180 181
    @classmethod
    def from_bytes(cls, bytes):
        """Create audio segment from a byte string containing audio samples.

        The bytes are wrapped in an in-memory stream and decoded with
        soundfile as float32.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :return: Audio segment instance.
        :rtype: AudioSegment
        """
        stream = io.BytesIO(bytes)
        decoded = soundfile.read(stream, dtype='float32')
        samples, sample_rate = decoded
        return cls(samples, sample_rate)

chrisxu2014's avatar
chrisxu2014 已提交
193 194
    @classmethod
    def concatenate(cls, *segments):
chrisxu2014's avatar
chrisxu2014 已提交
195 196
        """Concatenate an arbitrary number of audio segments together.

chrisxu2014's avatar
chrisxu2014 已提交
197 198
        :param *segments: Input audio segments to be concatenated.
        :type *segments: tuple of AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
199
        :return: Audio segment instance as concatenating results.
chrisxu2014's avatar
chrisxu2014 已提交
200
        :rtype: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
201
        :raises ValueError: If the number of segments is zero, or if the 
chrisxu2014's avatar
chrisxu2014 已提交
202 203
                            sample_rate of any segments does not match.
        :raises TypeError: If any segment is not AudioSegment instance.
chrisxu2014's avatar
chrisxu2014 已提交
204 205
        """
        # Perform basic sanity-checks.
chrisxu2014's avatar
chrisxu2014 已提交
206
        if len(segments) == 0:
chrisxu2014's avatar
chrisxu2014 已提交
207 208
            raise ValueError("No audio segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
chrisxu2014's avatar
chrisxu2014 已提交
209 210
        for seg in segments:
            if sample_rate != seg._sample_rate:
chrisxu2014's avatar
chrisxu2014 已提交
211 212
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
chrisxu2014's avatar
chrisxu2014 已提交
213
            if type(seg) is not cls:
chrisxu2014's avatar
chrisxu2014 已提交
214
                raise TypeError("Only audio segments of the same type "
chrisxu2014's avatar
chrisxu2014 已提交
215
                                "can be concatenated.")
chrisxu2014's avatar
chrisxu2014 已提交
216
        samples = np.concatenate([seg.samples for seg in segments])
chrisxu2014's avatar
chrisxu2014 已提交
217
        return cls(samples, sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
218

219 220 221 222 223 224 225 226 227 228 229 230 231 232
    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Create a silent audio segment of the given duration and sample
        rate.

        :param duration: Length of silence in seconds.
        :type duration: float
        :param sample_rate: Sample rate.
        :type sample_rate: float
        :return: Silent AudioSegment instance of the given duration.
        :rtype: AudioSegment
        """
        # The number of zero samples is the truncated product of duration
        # and sample rate.
        num_zero_samples = int(duration * sample_rate)
        return cls(np.zeros(num_zero_samples), sample_rate)

233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257
    def to_wav_file(self, filepath, dtype='float32'):
        """Save audio segment to disk as wav file.

        :param filepath: WAV filepath or file object to save the
                         audio segment.
        :type filepath: basestring|file
        :param dtype: Subtype for audio file. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :raises TypeError: If dtype is not supported (presumably raised by
                           the sample conversion helper; verify there).
        """
        converted = self._convert_samples_from_float32(self._samples, dtype)
        # soundfile subtype name for each supported sample dtype.
        wav_subtypes = dict(
            int16='PCM_16', int32='PCM_32', float32='FLOAT', float64='DOUBLE')
        soundfile.write(
            filepath,
            converted,
            self._sample_rate,
            format='WAV',
            subtype=wav_subtypes[dtype])

chrisxu2014's avatar
chrisxu2014 已提交
258
    def superimpose(self, other):
chrisxu2014's avatar
chrisxu2014 已提交
259 260 261
        """Add samples from another segment to those of this segment
        (sample-wise addition, not segment concatenation).

chrisxu2014's avatar
chrisxu2014 已提交
262 263
        Note that this is an in-place transformation.

chrisxu2014's avatar
chrisxu2014 已提交
264 265 266
        :param other: Segment containing samples to be added in.
        :type other: AudioSegments
        :raise TypeError: If type of two segments don't match.
chrisxu2014's avatar
chrisxu2014 已提交
267 268
        :raise ValueError: If the sample rates of the two segments are not
                           equal, or if the lengths of segments don't match.
chrisxu2014's avatar
chrisxu2014 已提交
269
        """
270
        if isinstance(other, type(self)):
chrisxu2014's avatar
chrisxu2014 已提交
271 272 273 274 275 276 277
            raise TypeError("Cannot add segments of different types: %s "
                            "and %s." % (type(self), type(other)))
        if self._sample_rate != other._sample_rate:
            raise ValueError("Sample rates must match to add segments.")
        if len(self._samples) != len(other._samples):
            raise ValueError("Segment lengths must match to add segments.")
        self._samples += other._samples
chrisxu2014's avatar
chrisxu2014 已提交
278

279 280 281
    def to_bytes(self, dtype='float32'):
        """Create a byte string containing the audio content.

        :param dtype: Data type for export samples. Options: 'int16', 'int32',
                      'float32', 'float64'. Default is 'float32'.
        :type dtype: str
        :return: Byte string containing audio content.
        :rtype: str
        """
        samples = self._convert_samples_from_float32(self._samples, dtype)
        # tobytes() is the supported spelling; the tostring() alias is
        # deprecated and no longer exists in recent NumPy releases.
        return samples.tobytes()

291
    def gain_db(self, gain):
292 293 294 295 296
        """Apply gain in decibels to samples.

        Note that this is an in-place transformation.
        
        :param gain: Gain in decibels to apply to samples. 
297
        :type gain: float|1darray
298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321
        """
        self._samples *= 10.**(gain / 20.)

    def change_speed(self, speed_rate):
        """Change the audio speed by linear interpolation.

        Note that this is an in-place transformation.
        
        :param speed_rate: Rate of speed change:
                           speed_rate > 1.0, speed up the audio;
                           speed_rate = 1.0, unchanged;
                           speed_rate < 1.0, slow down the audio;
                           speed_rate <= 0.0, not allowed, raise ValueError.
        :type speed_rate: float
        :raises ValueError: If speed_rate <= 0.0.
        """
        if speed_rate <= 0:
            raise ValueError("speed_rate should be greater than zero.")
        old_length = self._samples.shape[0]
        new_length = int(old_length / speed_rate)
        old_indices = np.arange(old_length)
        new_indices = np.linspace(start=0, stop=old_length, num=new_length)
        self._samples = np.interp(new_indices, old_indices, self._samples)

chrisxu2014's avatar
chrisxu2014 已提交
322
    def normalize(self, target_db=-20, max_gain_db=300.0):
        """Normalize audio to be of the desired RMS value in decibels.

        Note that this is an in-place transformation.

        :param target_db: Target RMS value in decibels. This value should be
                          less than 0.0 as 0.0 is full-scale audio.
        :type target_db: float
        :param max_gain_db: Max amount of gain in dB that can be applied for
                            normalization. This is to prevent nans when
                            attempting to normalize a signal consisting of
                            all zeros.
        :type max_gain_db: float
        :raises ValueError: If the required gain to normalize the segment to
                            the target_db value exceeds max_gain_db.
        """
        # The RMS level is evaluated once and reused for both the limit
        # check and the gain that is applied.
        gain = target_db - self.rms_db
        if gain > max_gain_db:
            raise ValueError(
                "Unable to normalize segment to %f dB because the "
                "required gain exceeds max_gain_db (%f dB)" %
                (target_db, max_gain_db))
        self.gain_db(min(max_gain_db, gain))
chrisxu2014's avatar
chrisxu2014 已提交
345 346 347 348 349 350

    def normalize_online_bayesian(self,
                                  target_db,
                                  prior_db,
                                  prior_samples,
                                  startup_delay=0.0):
        """Normalize audio using a production-compatible online/causal
        algorithm. This uses an exponential likelihood and gamma prior to
        make online estimates of the RMS even when there are very few samples.

        Note that this is an in-place transformation.

        :param target_db: Target RMS value in decibels.
        :type target_db: float
        :param prior_db: Prior RMS estimate in decibels.
        :type prior_db: float
        :param prior_samples: Prior strength in number of samples.
        :type prior_samples: float
        :param startup_delay: Default 0.0s. If provided, this function will
                              accrue statistics for the first startup_delay
                              seconds before applying online normalization.
        :type startup_delay: float
        """
        # Index of the last sample of the start-up window, clamped to the
        # final sample of the segment.
        delay_idx = min(self.num_samples - 1,
                        int(self.sample_rate * startup_delay))
        # Express the prior as a sum of squares worth `prior_samples` samples.
        prior_mean_sq = 10.**(prior_db / 10.)
        prior_energy = prior_mean_sq * prior_samples
        running_energy = np.cumsum(self.samples**2)
        running_count = np.arange(self.num_samples) + 1
        if delay_idx > 0:
            # Every sample inside the start-up window shares the statistics
            # accumulated up to the end of that window.
            running_energy[:delay_idx] = running_energy[delay_idx]
            running_count[:delay_idx] = running_count[delay_idx]
        mean_sq_estimate = ((running_energy + prior_energy) /
                            (running_count + prior_samples))
        estimated_rms_db = 10 * np.log10(mean_sq_estimate)
        # Apply the per-sample gain that moves the estimate to the target.
        self.gain_db(target_db - estimated_rms_db)
chrisxu2014's avatar
chrisxu2014 已提交
386

387
    def resample(self, target_sample_rate, filter='kaiser_best'):
        """Resample the audio to a target sample rate.

        Note that this is an in-place transformation; nothing is returned.

        :param target_sample_rate: Target sample rate.
        :type target_sample_rate: int
        :param filter: The resampling filter to use, one of {'kaiser_best',
                       'kaiser_fast'}.
        :type filter: str
        """
        resampled = resampy.resample(
            self.samples, self.sample_rate, target_sample_rate, filter=filter)
        # Samples and rate are updated together so they stay consistent.
        self._samples = resampled
        self._sample_rate = target_sample_rate
401

402
    def pad_silence(self, duration, sides='both'):
chrisxu2014's avatar
chrisxu2014 已提交
403
        """Pad this audio sample with a period of silence.
chrisxu2014's avatar
chrisxu2014 已提交
404 405 406

        Note that this is an in-place transformation.

chrisxu2014's avatar
chrisxu2014 已提交
407
        :param duration: Length of silence in seconds to pad.
chrisxu2014's avatar
chrisxu2014 已提交
408
        :type duration: float
chrisxu2014's avatar
chrisxu2014 已提交
409 410 411
        :param sides: Position for padding:
                     'beginning' - adds silence in the beginning;
                     'end' - adds silence in the end;
chrisxu2014's avatar
chrisxu2014 已提交
412 413
                     'both' - adds silence in both the beginning and the end.
        :type sides: str
chrisxu2014's avatar
chrisxu2014 已提交
414
        :raises ValueError: If sides is not supported.
chrisxu2014's avatar
chrisxu2014 已提交
415 416 417
        """
        if duration == 0.0:
            return self
chrisxu2014's avatar
chrisxu2014 已提交
418
        cls = type(self)
chrisxu2014's avatar
chrisxu2014 已提交
419
        silence = self.make_silence(duration, self._sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
420
        if sides == "beginning":
chrisxu2014's avatar
chrisxu2014 已提交
421
            padded = cls.concatenate(silence, self)
chrisxu2014's avatar
chrisxu2014 已提交
422
        elif sides == "end":
chrisxu2014's avatar
chrisxu2014 已提交
423
            padded = cls.concatenate(self, silence)
chrisxu2014's avatar
chrisxu2014 已提交
424
        elif sides == "both":
chrisxu2014's avatar
chrisxu2014 已提交
425
            padded = cls.concatenate(silence, self, silence)
chrisxu2014's avatar
chrisxu2014 已提交
426
        else:
chrisxu2014's avatar
chrisxu2014 已提交
427
            raise ValueError("Unknown value for the sides %s" % sides)
chrisxu2014's avatar
chrisxu2014 已提交
428
        self._samples = padded._samples
429

430 431 432 433 434 435 436 437 438 439 440 441
    def shift(self, shift_ms):
        """Shift the audio in time. If `shift_ms` is positive, shift with time
        advance; if negative, shift with time delay. Silence is padded to
        keep the duration unchanged.

        Note that this is an in-place transformation.

        :param shift_ms: Shift time in milliseconds. If positive, shift with
                         time advance; if negative, shift with time delay.
        :type shift_ms: float
        :raises ValueError: If shift_ms is longer than audio duration.
        """
        if abs(shift_ms) / 1000.0 > self.duration:
            raise ValueError("Absolute value of shift_ms should be smaller "
                             "than audio duration.")
        offset = int(shift_ms * self._sample_rate / 1000)
        if offset == 0:
            return
        if offset > 0:
            # Time advance: move the tail to the front, silence the end.
            self._samples[:-offset] = self._samples[offset:]
            self._samples[-offset:] = 0
        else:
            # Time delay: move the head towards the back, silence the start.
            delay = -offset
            self._samples[delay:] = self._samples[:-delay]
            self._samples[:delay] = 0

455
    def subsegment(self, start_sec=None, end_sec=None):
        """Cut the AudioSegment between given boundaries.

        Note that this is an in-place transformation.

        :param start_sec: Beginning of subsegment in seconds. Negative
                          values count back from the end; None means the
                          very beginning.
        :type start_sec: float
        :param end_sec: End of subsegment in seconds. Negative values count
                        back from the end; None means the end of the segment.
        :type end_sec: float
        :raise ValueError: If start_sec or end_sec is incorrectly set, e.g. out
                           of bounds in time.
        """
        begin = 0.0 if start_sec is None else start_sec
        finish = self.duration if end_sec is None else end_sec
        # Negative positions are measured back from the end of the segment.
        if begin < 0.0:
            begin = self.duration + begin
        if finish < 0.0:
            finish = self.duration + finish
        if begin < 0.0:
            raise ValueError(
                "The slice start position (%f s) is out of bounds." % begin)
        if finish < 0.0:
            raise ValueError(
                "The slice end position (%f s) is out of bounds." % finish)
        if begin > finish:
            raise ValueError(
                "The slice start position (%f s) is later than "
                "the end position (%f s)." % (begin, finish))
        if finish > self.duration:
            raise ValueError(
                "The slice end position (%f s) is out of bounds "
                "(> %f s)" % (finish, self.duration))
        # Convert the validated times to sample indices (rounded to nearest).
        first = int(round(begin * self._sample_rate))
        last = int(round(finish * self._sample_rate))
        self._samples = self._samples[first:last]
chrisxu2014's avatar
chrisxu2014 已提交
488 489

    def random_subsegment(self, subsegment_length, rng=None):
        """Cut a randomly positioned piece of the given length out of the
        audio segment.

        Note that this is an in-place transformation.

        :param subsegment_length: Subsegment length in seconds.
        :type subsegment_length: float
        :param rng: Random number generator state. A fresh random.Random()
                    is created when omitted.
        :type rng: random.Random
        :raises ValueError: If the length of subsegment is greater than
                            the original segment.
        """
        generator = random.Random() if rng is None else rng
        if subsegment_length > self.duration:
            raise ValueError("Length of subsegment must not be greater "
                             "than original segment.")
        # Draw the start so that the whole window fits inside the segment.
        latest_start = self.duration - subsegment_length
        begin = generator.uniform(0.0, latest_start)
        self.subsegment(begin, begin + subsegment_length)
chrisxu2014's avatar
chrisxu2014 已提交
507

chrisxu2014's avatar
chrisxu2014 已提交
508
    def convolve(self, impulse_segment, allow_resample=False):
chrisxu2014's avatar
chrisxu2014 已提交
509
        """Convolve this audio segment with the given impulse segment.
chrisxu2014's avatar
chrisxu2014 已提交
510

chrisxu2014's avatar
chrisxu2014 已提交
511
        Note that this is an in-place transformation.
chrisxu2014's avatar
chrisxu2014 已提交
512

chrisxu2014's avatar
chrisxu2014 已提交
513 514
        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
chrisxu2014's avatar
chrisxu2014 已提交
515 516 517 518
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample 
                               rate from this signal.
        :type allow_resample: bool
chrisxu2014's avatar
chrisxu2014 已提交
519
        :raises ValueError: If the sample rate is not match between two
chrisxu2014's avatar
chrisxu2014 已提交
520
                            audio segments when resample is not allowed.
chrisxu2014's avatar
chrisxu2014 已提交
521 522
        """
        if allow_resample and self.sample_rate != impulse_segment.sample_rate:
523
            impulse_segment.resample(self.sample_rate)
chrisxu2014's avatar
chrisxu2014 已提交
524
        if self.sample_rate != impulse_segment.sample_rate:
525
            raise ValueError("Impulse segment's sample rate (%d Hz) is not "
chrisxu2014's avatar
chrisxu2014 已提交
526 527 528 529
                             "equal to base signal sample rate (%d Hz)." %
                             (impulse_segment.sample_rate, self.sample_rate))
        samples = signal.fftconvolve(self.samples, impulse_segment.samples,
                                     "full")
chrisxu2014's avatar
chrisxu2014 已提交
530 531
        self._samples = samples

chrisxu2014's avatar
chrisxu2014 已提交
532
    def convolve_and_normalize(self, impulse_segment, allow_resample=False):
        """Convolve with the impulse segment, then normalize the result back
        to the RMS level this segment had before the convolution.

        Note that this is an in-place transformation.

        :param impulse_segment: Impulse response segments.
        :type impulse_segment: AudioSegment
        :param allow_resample: Indicates whether resampling is allowed when
                               the impulse_segment has a different sample
                               rate from this signal.
        :type allow_resample: bool
        """
        # Remember the level first; convolve() replaces the samples.
        level_before_db = self.rms_db
        self.convolve(impulse_segment, allow_resample=allow_resample)
        self.normalize(level_before_db)
chrisxu2014's avatar
chrisxu2014 已提交
548 549 550 551 552 553 554

    def add_noise(self,
                  noise,
                  snr_dB,
                  allow_downsampling=False,
                  max_gain_db=300.0,
                  rng=None):
        """Add the given noise segment at a specific signal-to-noise ratio.
        If the noise segment is longer than this segment, a random subsegment
        of matching length is sampled from it and used instead.

        Note that this is an in-place transformation of this segment; the
        noise segment passed in is not modified.

        :param noise: Noise signal to add.
        :type noise: AudioSegment
        :param snr_dB: Signal-to-Noise Ratio, in decibels.
        :type snr_dB: float
        :param allow_downsampling: Whether to allow the noise signal to be
                                   downsampled to match the base signal sample
                                   rate.
        :type allow_downsampling: bool
        :param max_gain_db: Maximum amount of gain to apply to noise signal
                            before adding it in. This is to prevent attempting
                            to apply infinite gain to a zero signal.
        :type max_gain_db: float
        :param rng: Random number generator state.
        :type rng: None|random.Random
        :raises ValueError: If the sample rate does not match between the two
                            audio segments when downsampling is not allowed, or
                            if the duration of noise segments is shorter than
                            original audio segments.
        """
        rng = random.Random() if rng is None else rng
        # Work on a private copy: resample(), random_subsegment() and
        # gain_db() all modify a segment in place, and the caller's noise
        # segment must stay untouched.
        noise_new = copy.deepcopy(noise)
        if allow_downsampling and noise_new.sample_rate > self.sample_rate:
            # resample() returns nothing, so its result must not be
            # assigned back to the variable.
            noise_new.resample(self.sample_rate)
        if noise_new.sample_rate != self.sample_rate:
            raise ValueError("Noise sample rate (%d Hz) is not equal to base "
                             "signal sample rate (%d Hz)." %
                             (noise_new.sample_rate, self.sample_rate))
        if noise_new.duration < self.duration:
            raise ValueError("Noise signal (%f sec) must be at least as long as"
                             " base signal (%f sec)." %
                             (noise_new.duration, self.duration))
        # Gain that places the noise snr_dB below the signal level, capped
        # at max_gain_db. It is derived from the whole noise segment, before
        # the random cut below.
        noise_gain_db = min(self.rms_db - noise_new.rms_db - snr_dB,
                            max_gain_db)
        noise_new.random_subsegment(self.duration, rng=rng)
        noise_new.gain_db(noise_gain_db)
        self.superimpose(noise_new)
chrisxu2014's avatar
chrisxu2014 已提交
596

597 598
    @property
    def samples(self):
        """Return a copy of the audio samples.

        A copy is handed out so that callers cannot modify the samples
        held by this segment through the returned array.

        :return: Audio samples.
        :rtype: ndarray
        """
        snapshot = self._samples.copy()
        return snapshot

    @property
    def sample_rate(self):
        """Return the sample rate stored for this segment.

        :return: Audio sample rate.
        :rtype: int
        """
        rate = self._sample_rate
        return rate

    @property
    def num_samples(self):
        """Return the length of the sample array along its first axis.

        :return: Number of samples.
        :rtype: int
        """
        shape = self._samples.shape
        return shape[0]
623

624 625 626
    @property
    def duration(self):
        """Return the audio duration: sample count divided by sample rate.

        :return: Audio duration in seconds.
        :rtype: float
        """
        sample_count = self._samples.shape[0]
        rate = float(self._sample_rate)
        return sample_count / rate
632 633

    @property
    def rms_db(self):
        """Return root mean square energy of the audio in decibels.

        An all-zero signal is reported as 0 dB rather than negative
        infinity.

        :return: Root mean square energy in decibels.
        :rtype: float
        """
        # Working on the mean square (power) directly: the square root of
        # RMS becomes a factor of 1/2 in log space, hence 10 rather than 20.
        power = np.mean(np.square(self._samples))
        return 10 * np.log10(power) if power != 0. else 0.

    def _convert_samples_to_float32(self, samples):
        """Convert sample type to float32.

        Audio sample type is usually integer or floating-point.
        Signed integers are scaled by 1 / 2**(bits - 1), which maps the
        full integer range into [-1, 1). Floating-point samples are only
        cast to float32, without any rescaling.

        :param samples: Audio samples to convert.
        :type samples: ndarray
        :return: Samples as a new float32 array.
        :rtype: ndarray
        :raises TypeError: If the sample data type is neither a signed
                           integer nor a floating-point type.
        """
        sample_dtype = samples.dtype
        # np.issubdtype is used instead of the np.sctypes lookup tables,
        # which no longer exist in NumPy 2.0.
        if np.issubdtype(sample_dtype, np.signedinteger):
            bits = np.iinfo(sample_dtype).bits
            float32_samples = samples.astype('float32')
            # In-place multiply keeps the result in float32.
            float32_samples *= (1. / 2**(bits - 1))
            return float32_samples
        if np.issubdtype(sample_dtype, np.floating):
            return samples.astype('float32')
        # The type is rejected before any conversion is attempted.
        raise TypeError("Unsupported sample type: %s." % samples.dtype)

    def _convert_samples_from_float32(self, samples, dtype):
        """Convert sample type from float32 to dtype.

        Audio sample type is usually integer or floating-point. For a
        signed integer type, samples are rescaled from [-1, 1] to the
        range supported by that type, clipped to its limits and then cast
        (the cast truncates toward zero). For a floating-point type,
        samples are clipped to the finite range of that type and cast.

        This is for writing an audio file.

        :param samples: Audio samples, expected to lie in [-1, 1].
        :type samples: ndarray
        :param dtype: Target sample type.
        :type dtype: str|numpy.dtype
        :return: Converted samples as a new array of the target type.
        :rtype: ndarray
        :raises TypeError: If the target type is neither a signed integer
                           nor a floating-point type.
        """
        dtype = np.dtype(dtype)
        # The decision is made on the *target* dtype. np.issubdtype is
        # used instead of the np.sctypes tables, which no longer exist in
        # NumPy 2.0.
        if np.issubdtype(dtype, np.signedinteger):
            info = np.iinfo(dtype)
            upper = float(info.max)
            if upper > info.max:
                # The maximum is not representable in float64 (int64):
                # step down to the nearest float that is still in range so
                # the final cast cannot overflow.
                upper = float(np.nextafter(upper, 0.))
            # Scale and clip in float64: float32 cannot hold 2**31 - 1
            # exactly, which would let full-scale int32 samples overflow.
            scaled = samples.astype('float64') * float(2**(info.bits - 1))
            return np.clip(scaled, float(info.min), upper).astype(dtype)
        if np.issubdtype(dtype, np.floating):
            info = np.finfo(dtype)
            return np.clip(samples, info.min, info.max).astype(dtype)
        raise TypeError("Unsupported sample type: %s." % dtype)