Commit 17092cbb authored by Hui Zhang

D,T to T,D

Parent 0c9fbaf7
@@ -24,8 +24,10 @@ import soundfile
 import soxbindings as sox
 from scipy import signal
 
+from .utility import subfile_from_tar
+
-class AudioSegment(object):
+class AudioSegment():
     """Monaural audio segment abstraction.
 
     :param samples: Audio samples [num_samples x num_channels].
@@ -68,16 +70,20 @@ class AudioSegment(object):
                                 self.duration, self.rms_db))
 
     @classmethod
-    def from_file(cls, file):
+    def from_file(cls, file, infos=None):
         """Create audio segment from audio file.
 
-        :param filepath: Filepath or file object to audio file.
-        :type filepath: str|file
-        :return: Audio segment instance.
-        :rtype: AudioSegment
+        Args:
+            filepath (str|file): Filepath or file object to audio file.
+            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
+
+        Returns:
+            AudioSegment: Audio segment instance.
         """
         if isinstance(file, str) and re.findall(r".seqbin_\d+$", file):
             return cls.from_sequence_file(file)
+        elif isinstance(file, str) and file.startswith('tar:'):
+            return cls.from_file(subfile_from_tar(file, infos))
         else:
             samples, sample_rate = soundfile.read(file, dtype='float32')
             return cls(samples, sample_rate)
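A usage sketch of the new tar branch. The paths below are made up for illustration; the exact 'tar:' URI layout is whatever subfile_from_tar expects, and infos is the TarLocalData cache mentioned in the docstring:

    # Plain files go through soundfile as before:
    seg = AudioSegment.from_file('demo.wav')
    # Members of a tar archive are resolved via subfile_from_tar(file, infos):
    # seg = AudioSegment.from_file('tar:...', infos=local_data)  # hypothetical URI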
......
@@ -133,7 +133,7 @@ class SpecAugmentor(AugmentorBase):
         return self._time_mask
 
     def __repr__(self):
-        return f"specaug: F-{F}, T-{T}, F-n-{n_freq_masks}, T-n-{n_time_masks}"
+        return f"specaug: F-{self.F}, T-{self.T}, F-n-{self.n_freq_masks}, T-n-{self.n_time_masks}"
 
     def time_warp(self, x, mode='PIL'):
         """time warp for spec augment
......
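The old __repr__ raised NameError at call time: inside an f-string, bare names like F and T are looked up as ordinary variables, not as instance attributes. A minimal standalone illustration:

    class Demo:
        def __init__(self):
            self.F = 27

        def __repr__(self):
            # return f"F-{F}"   # NameError: no local or global name 'F'
            return f"F-{self.F}"  # attributes must be reached through self

    print(Demo())  # F-27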
@@ -11,3 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from .audio_featurizer import AudioFeaturizer  #noqa: F401
+from .speech_featurizer import SpeechFeaturizer
+from .text_featurizer import TextFeaturizer
@@ -18,7 +18,7 @@ from python_speech_features import logfbank
 from python_speech_features import mfcc
 
-class AudioFeaturizer(object):
+class AudioFeaturizer():
     """Audio featurizer, for extracting features from audio contents of
     AudioSegment or SpeechSegment.
@@ -167,32 +167,6 @@ class AudioFeaturizer(object):
             raise ValueError("Unknown spectrum_type %s. "
                              "Supported values: linear." % self._spectrum_type)
 
-    def _compute_linear_specgram(self,
-                                 samples,
-                                 sample_rate,
-                                 stride_ms=10.0,
-                                 window_ms=20.0,
-                                 max_freq=None,
-                                 eps=1e-14):
-        """Compute the linear spectrogram from FFT energy."""
-        if max_freq is None:
-            max_freq = sample_rate / 2
-        if max_freq > sample_rate / 2:
-            raise ValueError("max_freq must not be greater than half of "
-                             "sample rate.")
-        if stride_ms > window_ms:
-            raise ValueError("Stride size must not be greater than "
-                             "window size.")
-        stride_size = int(0.001 * sample_rate * stride_ms)
-        window_size = int(0.001 * sample_rate * window_ms)
-        specgram, freqs = self._specgram_real(
-            samples,
-            window_size=window_size,
-            stride_size=stride_size,
-            sample_rate=sample_rate)
-        ind = np.where(freqs <= max_freq)[0][-1] + 1
-        return np.log(specgram[:ind, :] + eps)
-
     def _specgram_real(self, samples, window_size, stride_size, sample_rate):
         """Compute the spectrogram for samples from a real signal."""
         # extract strided windows
@@ -217,26 +191,65 @@ class AudioFeaturizer(object):
         freqs = float(sample_rate) / window_size * np.arange(fft.shape[0])
         return fft, freqs
 
+    def _compute_linear_specgram(self,
+                                 samples,
+                                 sample_rate,
+                                 stride_ms=10.0,
+                                 window_ms=20.0,
+                                 max_freq=None,
+                                 eps=1e-14):
+        """Compute the linear spectrogram from FFT energy.
+
+        Args:
+            samples (np.ndarray): audio samples.
+            sample_rate (int): audio sample rate.
+            stride_ms (float, optional): stride length in milliseconds. Defaults to 10.0.
+            window_ms (float, optional): window length in milliseconds. Defaults to 20.0.
+            max_freq (float, optional): highest frequency to keep. Defaults to None, i.e. sample_rate / 2.
+            eps (float, optional): floor added before taking the log. Defaults to 1e-14.
+
+        Raises:
+            ValueError: when max_freq is greater than half of sample rate.
+            ValueError: when stride_ms is greater than window_ms.
+
+        Returns:
+            np.ndarray: log spectrogram, (time, freq)
+        """
+        if max_freq is None:
+            max_freq = sample_rate / 2
+        if max_freq > sample_rate / 2:
+            raise ValueError("max_freq must not be greater than half of "
+                             "sample rate.")
+        if stride_ms > window_ms:
+            raise ValueError("Stride size must not be greater than "
+                             "window size.")
+        stride_size = int(0.001 * sample_rate * stride_ms)
+        window_size = int(0.001 * sample_rate * window_ms)
+        specgram, freqs = self._specgram_real(
+            samples,
+            window_size=window_size,
+            stride_size=stride_size,
+            sample_rate=sample_rate)
+        ind = np.where(freqs <= max_freq)[0][-1] + 1
+        # (freq, time)
+        spec = np.log(specgram[:ind, :] + eps)
+        return np.transpose(spec)
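For reference, a standalone sketch of the same strided-window log spectrogram (a simplified stand-in for _specgram_real, with illustrative parameters), showing why a frame-major layout comes out as (time, freq):

    import numpy as np

    def linear_specgram(samples, sample_rate, stride_ms=10.0, window_ms=20.0, eps=1e-14):
        """Log linear spectrogram returned as (time, freq)."""
        stride = int(0.001 * sample_rate * stride_ms)
        window = int(0.001 * sample_rate * window_ms)
        num_frames = (len(samples) - window) // stride + 1
        # overlapping frames: (num_frames, window)
        frames = np.stack(
            [samples[i * stride:i * stride + window] for i in range(num_frames)])
        # FFT energy per frame: (num_frames, window // 2 + 1)
        energy = np.abs(np.fft.rfft(frames * np.hanning(window), axis=1))**2
        return np.log(energy + eps)  # frame-major, so already (time, freq)

    feat = linear_specgram(np.random.randn(16000), 16000)
    print(feat.shape)  # (99, 161) for 1 s of 16 kHz audio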
     def _concat_delta_delta(self, feat):
         """Append delta, delta-delta feature.
 
         Args:
-            feat (np.ndarray): (D, T)
+            feat (np.ndarray): (T, D)
 
         Returns:
-            np.ndarray: feat with delta-delta, (3*D, T)
+            np.ndarray: feat with delta-delta, (T, 3*D)
         """
-        feat = np.transpose(feat)
         # Deltas
         d_feat = delta(feat, 2)
         # Deltas-Deltas (of the deltas, not of the base features)
         dd_feat = delta(d_feat, 2)
-        # transpose
-        feat = np.transpose(feat)
-        d_feat = np.transpose(d_feat)
-        dd_feat = np.transpose(dd_feat)
         # concat above three features
-        concat_feat = np.concatenate((feat, d_feat, dd_feat))
+        concat_feat = np.concatenate((feat, d_feat, dd_feat), axis=1)
         return concat_feat
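A quick shape check for the new layout (assumes python_speech_features is installed; the sizes are illustrative):

    import numpy as np
    from python_speech_features import delta

    feat = np.random.randn(100, 13)  # (T, D): 100 frames, 13 coefficients
    d_feat = delta(feat, 2)          # deltas keep the (T, D) shape
    dd_feat = delta(d_feat, 2)       # delta-deltas of the deltas
    stacked = np.concatenate((feat, d_feat, dd_feat), axis=1)
    print(stacked.shape)             # (100, 39), i.e. (T, 3*D)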
     def _compute_mfcc(self,
@@ -292,7 +305,6 @@ class AudioFeaturizer(object):
             ceplifter=22,
             useEnergy=True,
             winfunc='povey')
-        mfcc_feat = np.transpose(mfcc_feat)
         if delta_delta:
             mfcc_feat = self._concat_delta_delta(mfcc_feat)
         return mfcc_feat
@@ -346,8 +358,6 @@ class AudioFeaturizer(object):
             remove_dc_offset=True,
             preemph=0.97,
             wintype='povey')
-        fbank_feat = np.transpose(fbank_feat)
         if delta_delta:
             fbank_feat = self._concat_delta_delta(fbank_feat)
         return fbank_feat
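The transposes above can be dropped because python_speech_features already returns frame-major arrays, so no layout fix-up is needed before _concat_delta_delta:

    import numpy as np
    from python_speech_features import logfbank, mfcc

    samples = np.random.randn(16000)  # 1 s of 16 kHz audio
    print(mfcc(samples, samplerate=16000).shape)      # (frames, numcep): (T, D)
    print(logfbank(samples, samplerate=16000).shape)  # (frames, nfilt): (T, D)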
@@ -16,38 +16,8 @@ from deepspeech.frontend.featurizer.audio_featurizer import AudioFeaturizer
 from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 
-class SpeechFeaturizer(object):
-    """Speech featurizer, for extracting features from both audio and transcript
-    contents of SpeechSegment.
-
-    Currently, for audio parts, it supports feature types of linear
-    spectrogram and mfcc; for transcript parts, it only supports char-level
-    tokenizing and conversion into a list of token indices. Note that the
-    token indexing order follows the given vocabulary file.
-
-    :param vocab_filepath: Filepath to load vocabulary for token indices
-                           conversion.
-    :type vocab_filepath: str
-    :param spectrum_type: Specgram feature type. Options: 'linear', 'mfcc'.
-    :type spectrum_type: str
-    :param stride_ms: Striding size (in milliseconds) for generating frames.
-    :type stride_ms: float
-    :param window_ms: Window size (in milliseconds) for generating frames.
-    :type window_ms: float
-    :param max_freq: When spectrum_type is 'linear', only FFT bins
-                     corresponding to frequencies between [0, max_freq] are
-                     returned; when spectrum_type is 'mfcc', max_freq is the
-                     highest band edge of mel filters.
-    :type max_freq: None|float
-    :param target_sample_rate: Speech are resampled (if upsampling or
-                               downsampling is allowed) to this before
-                               extracting spectrogram features.
-    :type target_sample_rate: float
-    :param use_dB_normalization: Whether to normalize the audio to a certain
-                                 decibels before extracting the features.
-    :type use_dB_normalization: bool
-    :param target_dB: Target audio decibels for normalization.
-    :type target_dB: float
+class SpeechFeaturizer():
+    """Speech and Text feature extraction.
     """
 
     def __init__(self,
@@ -64,8 +34,12 @@ class SpeechFeaturizer(object):
                  target_sample_rate=16000,
                  use_dB_normalization=True,
                  target_dB=-20,
-                 dither=1.0):
-        self._audio_featurizer = AudioFeaturizer(
+                 dither=1.0,
+                 maskctc=False):
+        self.stride_ms = stride_ms
+        self.window_ms = window_ms
+        self.audio_feature = AudioFeaturizer(
             spectrum_type=spectrum_type,
             feat_dim=feat_dim,
             delta_delta=delta_delta,
@@ -77,8 +51,14 @@ class SpeechFeaturizer(object):
             use_dB_normalization=use_dB_normalization,
             target_dB=target_dB,
             dither=dither)
-        self._text_featurizer = TextFeaturizer(unit_type, vocab_filepath,
-                                               spm_model_prefix)
+        self.feature_size = self.audio_feature.feature_size
+
+        self.text_feature = TextFeaturizer(
+            unit_type=unit_type,
+            vocab_filepath=vocab_filepath,
+            spm_model_prefix=spm_model_prefix,
+            maskctc=maskctc)
+        self.vocab_size = self.text_feature.vocab_size
 
     def featurize(self, speech_segment, keep_transcription_text):
         """Extract features for speech segment.
@@ -94,66 +74,33 @@
         Returns:
             tuple: 1) spectrogram audio feature in 2darray, 2) list of token indices.
         """
-        spec_feature = self._audio_featurizer.featurize(speech_segment)
+        spec_feature = self.audio_feature.featurize(speech_segment)
         if keep_transcription_text:
             return spec_feature, speech_segment.transcript
         if speech_segment.has_token:
             text_ids = speech_segment.token_ids
         else:
-            text_ids = self._text_featurizer.featurize(
-                speech_segment.transcript)
+            text_ids = self.text_feature.featurize(speech_segment.transcript)
         return spec_feature, text_ids
 
-    @property
-    def vocab_size(self):
-        """Return the vocabulary size.
-
-        Returns:
-            int: Vocabulary size.
-        """
-        return self._text_featurizer.vocab_size
-
-    @property
-    def vocab_list(self):
-        """Return the vocabulary in list.
-
-        Returns:
-            List[str]:
-        """
-        return self._text_featurizer.vocab_list
-
-    @property
-    def vocab_dict(self):
-        """Return the vocabulary in dict.
-
-        Returns:
-            Dict[str, int]:
-        """
-        return self._text_featurizer.vocab_dict
-
-    @property
-    def feature_size(self):
-        """Return the audio feature size.
-
-        Returns:
-            int: audio feature size.
-        """
-        return self._audio_featurizer.feature_size
-
-    @property
-    def stride_ms(self):
-        """time length in `ms` unit per frame
-
-        Returns:
-            float: time(ms)/frame
-        """
-        return self._audio_featurizer.stride_ms
-
-    @property
-    def text_feature(self):
-        """Return the text feature object.
-
-        Returns:
-            TextFeaturizer: object.
-        """
-        return self._text_featurizer
+    def text_featurize(self, text, keep_transcription_text):
+        """Extract features for text: keep the original transcript text, or
+        convert it to a list of token indices in char-level.
+
+        Args:
+            text (str): text.
+            keep_transcription_text (bool): True to keep the transcript text, False to return token ids.
+
+        Returns:
+            (str|List[int]): text, or list of token indices.
+        """
+        if keep_transcription_text:
+            return text
+        text_ids = self.text_feature.featurize(text)
+        return text_ids
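A minimal standalone sketch of the new text path (TextFeaturizer is mocked here; the vocabulary is made up):

    class MockTextFeaturizer:
        vocab = {'h': 0, 'e': 1, 'l': 2, 'o': 3}

        def featurize(self, text):
            return [self.vocab[c] for c in text]

    text_feature = MockTextFeaturizer()

    def text_featurize(text, keep_transcription_text):
        if keep_transcription_text:
            return text
        return text_feature.featurize(text)

    print(text_featurize('hello', True))   # 'hello'
    print(text_featurize('hello', False))  # [0, 1, 2, 2, 3]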
@@ -40,21 +40,21 @@ class CollateFunc(object):
         number = 0
         for item in batch:
             audioseg = AudioSegment.from_file(item['feat'])
-            feat = self.feature_func(audioseg)  #(D, T)
+            feat = self.feature_func(audioseg)  #(T, D)
 
-            sums = np.sum(feat, axis=1)
+            sums = np.sum(feat, axis=0)
             if mean_stat is None:
                 mean_stat = sums
             else:
                 mean_stat += sums
 
-            square_sums = np.sum(np.square(feat), axis=1)
+            square_sums = np.sum(np.square(feat), axis=0)
             if var_stat is None:
                 var_stat = square_sums
             else:
                 var_stat += square_sums
 
-            number += feat.shape[1]
+            number += feat.shape[0]
         return number, mean_stat, var_stat
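With (T, D) features, summing over axis 0 accumulates per-dimension statistics across frames; a standalone sketch with random features (all sizes illustrative):

    import numpy as np

    feats = [np.random.randn(t, 80) for t in (60, 75, 90)]  # three (T, 80) utterances

    number, mean_stat, var_stat = 0, np.zeros(80), np.zeros(80)
    for feat in feats:
        mean_stat += np.sum(feat, axis=0)            # (D,) running sum
        var_stat += np.sum(np.square(feat), axis=0)  # (D,) running sum of squares
        number += feat.shape[0]                      # total frame count

    mean = mean_stat / number
    std = np.sqrt(var_stat / number - np.square(mean))
    print(mean.shape, std.shape)  # (80,) (80,)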
@@ -120,7 +120,7 @@ class FeatureNormalizer(object):
         """Normalize features to be of zero mean and unit stddev.
 
         :param features: Input features to be normalized.
-        :type features: ndarray, shape (D, T)
+        :type features: ndarray, shape (T, D)
         :param eps: added to stddev to provide numerical stability.
         :type eps: float
         :return: Normalized features.
@@ -130,9 +130,10 @@ class FeatureNormalizer(object):
 
     def _read_mean_std_from_file(self, filepath, eps=1e-20):
         """Load mean and std from file."""
-        mean, istd = load_cmvn(filepath, filetype='json')
-        self._mean = np.expand_dims(mean, axis=-1)
-        self._istd = np.expand_dims(istd, axis=-1)
+        filetype = filepath.split(".")[-1]
+        mean, istd = load_cmvn(filepath, filetype=filetype)
+        self._mean = np.expand_dims(mean, axis=0)
+        self._istd = np.expand_dims(istd, axis=0)
 
     def write_to_file(self, filepath):
         """Write the mean and stddev to the file.
......
@@ -68,7 +68,12 @@ class SpeechSegment(AudioSegment):
         return not self.__eq__(other)
 
     @classmethod
-    def from_file(cls, filepath, transcript, tokens=None, token_ids=None):
+    def from_file(cls,
+                  filepath,
+                  transcript,
+                  tokens=None,
+                  token_ids=None,
+                  infos=None):
         """Create speech segment from audio file and corresponding transcript.
 
         Args:
@@ -76,12 +81,12 @@ class SpeechSegment(AudioSegment):
             transcript (str): Transcript text for the speech.
             tokens (List[str], optional): text tokens. Defaults to None.
             token_ids (List[int], optional): text token ids. Defaults to None.
+            infos (TarLocalData, optional): tar2obj and tar2infos. Defaults to None.
 
         Returns:
             SpeechSegment: Speech segment instance.
         """
-        audio = AudioSegment.from_file(filepath)
+        audio = AudioSegment.from_file(filepath, infos)
         return cls(audio.samples, audio.sample_rate, transcript, tokens,
                    token_ids)
......
@@ -56,8 +56,8 @@ class SpeechCollator():
         for utt, audio, text in batch:
             utts.append(utt)
             # audio
-            audios.append(audio.T)  # [T, D]
-            audio_lens.append(audio.shape[1])
+            audios.append(audio)  # [T, D]
+            audio_lens.append(audio.shape[0])
             # text
             # for training, text is token ids
             # else text is string, convert to unicode ord
......
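Since features now arrive as (T, D), the collator appends them unchanged and reads lengths from axis 0; downstream padding then stacks along the time axis, roughly as in this sketch (names and sizes are illustrative):

    import numpy as np

    batch = [np.random.randn(t, 80) for t in (37, 52, 44)]  # (T, D) utterances
    audio_lens = np.array([a.shape[0] for a in batch])      # lengths from axis 0

    t_max = audio_lens.max()
    padded = np.zeros((len(batch), t_max, 80), dtype=batch[0].dtype)
    for i, a in enumerate(batch):
        padded[i, :a.shape[0]] = a  # zero-pad along the time axis

    print(padded.shape, audio_lens)  # (3, 52, 80) [37 52 44]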
@@ -3,6 +3,7 @@ export MAIN_ROOT=${PWD}/../../../
 export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
 
 export LC_ALL=C
 export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
 export PYTHONIOENCODING=UTF-8
 export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
......