You need to sign in or sign up before continuing.
speech.py 8.0 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
14
"""Contains the speech segment class."""
H
Hui Zhang 已提交
15
import numpy as np
16

H
Hui Zhang 已提交
17
from deepspeech.frontend.audio import AudioSegment
18 19 20


class SpeechSegment(AudioSegment):
    """Speech segment: an ``AudioSegment`` that also carries a transcript.

    On top of the audio samples and sample rate handled by ``AudioSegment``,
    an instance stores the transcript text and, optionally, the transcript
    tokens together with their token ids.
    """

    def __init__(self,
                 samples,
                 sample_rate,
                 transcript,
                 tokens=None,
                 token_ids=None):
        """Speech segment abstraction, a subclass of AudioSegment,
            with an additional transcript.

        Args:
            samples (ndarray.float32): Audio samples [num_samples x num_channels].
            sample_rate (int): Audio sample rate.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): Transcript tokens for the speech.
            token_ids (List[int], optional): Transcript token ids for the speech.
        """
        AudioSegment.__init__(self, samples, sample_rate)
        self._transcript = transcript
        # must init `tokens` with `token_ids` at the same time
        self._tokens = tokens
        self._token_ids = token_ids

    def __eq__(self, other):
        """Return whether two objects are equal.

        The audio part is compared by ``AudioSegment.__eq__``, then the
        transcripts. Tokens and token ids are compared only when both
        segments carry them (see ``has_token``).

        Returns:
            bool: True, when equal to other
        """
        if not AudioSegment.__eq__(self, other):
            return False
        if self._transcript != other._transcript:
            return False
        if self.has_token and other.has_token:
            if self._tokens != other._tokens:
                return False
            if self._token_ids != other._token_ids:
                return False
        return True

    def __ne__(self, other):
        """Return whether two objects are unequal."""
        return not self.__eq__(other)

    @classmethod
    def from_file(cls,
                  filepath,
                  transcript,
                  tokens=None,
                  token_ids=None,
                  infos=None):
        """Create speech segment from audio file and corresponding transcript.

        Args:
            filepath (str|file): Filepath or file object to audio file.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.
            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.

        Returns:
            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_file(filepath, infos)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def from_bytes(cls, bytes, transcript, tokens=None, token_ids=None):
        """Create speech segment from a byte string and corresponding
        transcript.

        Args:
            bytes (str): Byte string holding the audio data; it is handed to
                ``AudioSegment.from_bytes`` unchanged.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.

        Returns:
            SpeechSegment: Speech segment instance.
        """
        audio = AudioSegment.from_bytes(bytes)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def concatenate(cls, *segments):
        """Concatenate an arbitrary number of speech segments together, both
        audio and transcript will be concatenated.

        Tokens and token ids are gathered, in order, from every segment that
        carries both of them (see ``has_token``).

        Args:
            *segments (tuple of SpeechSegment): Input speech segments to be
                concatenated.

        Returns:
            SpeechSegment: Speech segment instance.

        Raises:
            ValueError: If the number of segments is zero, or if the
                sample_rate of any two segments does not match.
            TypeError: If any segment is not SpeechSegment instance.
        """
        if not segments:
            raise ValueError("No speech segments are given to concatenate.")
        sample_rate = segments[0]._sample_rate
        transcripts = []
        tokens = []
        token_ids = []
        for seg in segments:
            if sample_rate != seg._sample_rate:
                raise ValueError("Can't concatenate segments with "
                                 "different sample rates")
            if type(seg) is not cls:
                raise TypeError("Only speech segments of the same type "
                                "instance can be concatenated.")
            transcripts.append(seg._transcript)
            # `has_token` is an instance property, so it has to be read from
            # each segment: a classmethod has no `self` to ask.
            if seg.has_token:
                tokens.extend(seg._tokens)
                token_ids.extend(seg._token_ids)
        samples = np.concatenate([seg.samples for seg in segments])
        return cls(samples, sample_rate, "".join(transcripts), tokens,
                   token_ids)

    @classmethod
    def slice_from_file(cls,
                        filepath,
                        transcript,
                        tokens=None,
                        token_ids=None,
                        start=None,
                        end=None):
        """Loads a small section of a speech file without having to load
        the entire file into the memory which can be incredibly wasteful.

        Args:
            filepath (str|file): Filepath or file object to audio file.
            transcript (str): Transcript text for the speech.
            tokens (List[str], optional): text tokens. Defaults to None.
            token_ids (List[int], optional): text token ids. Defaults to None.
            start (float, optional): Start time in seconds. If start is
                negative, it wraps around from the end. If not provided,
                this function reads from the very beginning.
            end (float, optional): End time in seconds. If end is negative,
                it wraps around from the end. If not provided, the default
                behavior is to read to the end of the file.

        Returns:
            SpeechSegment: SpeechSegment instance of the specified slice of
                the input speech file.
        """
        audio = AudioSegment.slice_from_file(filepath, start, end)
        return cls(audio.samples, audio.sample_rate, transcript, tokens,
                   token_ids)

    @classmethod
    def make_silence(cls, duration, sample_rate):
        """Creates a silent speech segment of the given duration and
        sample rate, transcript will be an empty string.

        Args:
            duration (float): Length of silence in seconds.
            sample_rate (float): Sample rate.

        Returns:
            SpeechSegment: Silence of the given duration.
        """
        audio = AudioSegment.make_silence(duration, sample_rate)
        return cls(audio.samples, audio.sample_rate, "")

    @property
    def has_token(self):
        """Return whether both tokens and token ids are available.

        Returns:
            bool: True when `tokens` and `token_ids` are both non-empty.
        """
        return bool(self._tokens and self._token_ids)

    @property
    def transcript(self):
        """Return the transcript text.

        Returns:
            str: Transcript text for the speech.
        """
        return self._transcript

    @property
    def tokens(self):
        """Return the transcript text tokens.

        Returns:
            List[str]: text tokens.
        """
        return self._tokens

    @property
    def token_ids(self):
        """Return the transcript text token ids.

        Returns:
            List[int]: text token ids.
        """
        return self._token_ids