# speech.py
"""Contains the speech segment class."""
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

from data_utils.audio import AudioSegment


class SpeechSegment(AudioSegment):
    """Speech segment abstraction, a subclass of AudioSegment,
    with an additional transcript.

    :param samples: Audio samples [num_samples x num_channels].
    :type samples: ndarray.float32
    :param sample_rate: Audio sample rate.
    :type sample_rate: int
    :param transcript: Transcript text for the speech.
    :type transcript: basestring
    :raises TypeError: If the sample data type is not float or int.
    """

    def __init__(self, samples, sample_rate, transcript):
        # The audio-related state is initialised by the parent class; this
        # subclass only adds the transcript.
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript

    def __eq__(self, other):
        """Return whether two objects are equal.

        Two speech segments are equal when the parent class considers
        them equal and their transcripts match.
        """
        # The parent comparison runs first, so `other._transcript` is only
        # read once AudioSegment.__eq__ has accepted `other`.
        if not AudioSegment.__eq__(self, other):
            return False
        return self._transcript == other._transcript

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    @classmethod
    def from_file(cls, filepath, transcript):
        """Create speech segment from audio file and corresponding transcript.

        :param filepath: Filepath or file object to audio file.
        :type filepath: basestring|file
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.from_file(filepath)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def from_bytes(cls, bytes, transcript):
        """Create speech segment from a byte string and corresponding
        transcript.

        :param bytes: Byte string containing audio samples.
        :type bytes: str
        :param transcript: Transcript text for the speech.
        :type transcript: basestring
        :return: Speech segment instance.
        :rtype: SpeechSegment
        """
        # NOTE: the parameter name shadows the `bytes` builtin; it is kept
        # unchanged because callers may pass it by keyword.
        audio = AudioSegment.from_bytes(bytes)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of speech segments together.

        The samples are joined with ``np.concatenate`` and the transcripts
        are joined in order without any separator.

        :param *segments: Input speech segments.
        :type *segments: SpeechSegment
        :return: Speech segment instance.
        :rtype: SpeechSegment
        :raises ValueError: If the number of segments is zero, or if the
                            sample_rate of any two segments does not match.
        :raises TypeError: If any item in segments is not an instance of
                           exactly this class.
        """
        if not segments:
            raise ValueError("No speech segments are given to concatenate.")
        # Validate types before touching any private attribute, so that a
        # foreign object yields the documented TypeError rather than an
        # AttributeError on `_sample_rate`.
        for seg in segments:
            if type(seg) is not cls:
                raise TypeError("Only speech segments of the same type "
                                "instance can be concatenated.")
        sample_rate = segments[0]._sample_rate
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
        transcripts = "".join(seg._transcript for seg in segments)
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate, transcripts)

    @classmethod
    def slice_from_file(cls, filepath, start=None, end=None, transcript=""):
        """Load a small section of a speech file without having to load
        the entire file into memory, which can be incredibly wasteful.

        :param filepath: Filepath or file object to audio file.
        :type filepath: basestring|file
        :param start: Start time in seconds. If start is negative, it wraps
                      around from the end. If not provided, this function
                      reads from the very beginning.
        :type start: float
        :param end: End time in seconds. If end is negative, it wraps around
                    from the end. If not provided, the default behavior is
                    to read to the end of the file.
        :type end: float
        :param transcript: Transcript text for the speech. If not provided,
                           the default is an empty string.
        :type transcript: basestring
        :return: SpeechSegment instance of the specified slice of the input
                 speech file.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.slice_from_file(filepath, start, end)
        return cls(audio.samples, audio.sample_rate, transcript)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Create a silent speech segment of the given duration and
        sample rate. The transcript of the result is an empty string.

        :param duration: Length of silence in seconds.
        :type duration: float
        :param sample_rate: Sample rate.
        :type sample_rate: float
        :return: Silence of the given duration.
        :rtype: SpeechSegment
        """
        audio = AudioSegment.make_silence(duration, sample_rate)
        return cls(audio.samples, audio.sample_rate, "")

    @property
    def transcript(self):
        """Return the transcript text.

        :return: Transcript text for the speech.
        :rtype: basestring
        """
        return self._transcript