fix

aaeef54f · Haoxin Ma · 82ca0f65 · aaeef54f · aaeef54f · aaeef54f
6 changed files
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@@ -103,7 +103,7 @@ class AugmentationPipeline():
        """
        for augmentor, rate in zip(self._augmentors, self._rates):
            augmentor.randomize_parameters()
    def randomize_parameters_feature_transform(self, n_frames, n_bins):
        """Run the pre-processing pipeline for data augmentation.
@@ -142,7 +142,7 @@ class AugmentationPipeline():
    #     """Run the pre-processing pipeline for data augmentation.
    #     Note that this is an in-place transformation.
    #     :param audio_segment: Audio segment to process.
    #     :type audio_segment: AudioSegmenet|SpeechSegment
    #     """
@@ -152,7 +152,7 @@ class AugmentationPipeline():
    # def transform_feature(self, spec_segment, single=True):
    #     """spectrogram augmentation.
    #     Args:
    #         spec_segment (np.ndarray): audio feature, (D, T).
    #     """

--- a/deepspeech/frontend/augmentor/shift_perturb.py
+++ b/deepspeech/frontend/augmentor/shift_perturb.py
@@ -32,7 +32,8 @@ class ShiftPerturbAugmentor(AugmentorBase):
        self._rng = rng
    def randomize_parameters(self):
-        self.shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
+        self.shift_ms = self._rng.uniform(self._min_shift_ms,
+                                          self._max_shift_ms)
    def apply(self, audio_segment):
        audio_segment.shift(self.shift_ms)
@@ -49,7 +50,6 @@ class ShiftPerturbAugmentor(AugmentorBase):
    #         self.randomize_parameters()
    #     self.apply(audio_segment)
    # def transform_audio(self, audio_segment):
    #     """Shift audio.
@@ -60,5 +60,3 @@ class ShiftPerturbAugmentor(AugmentorBase):
    #     """
    #     shift_ms = self._rng.uniform(self._min_shift_ms, self._max_shift_ms)
    #     audio_segment.shift(shift_ms)
--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@@ -123,18 +123,18 @@ class SpecAugmentor(AugmentorBase):
    def time_warp(xs, W=40):
        raise NotImplementedError
    def randomize_parameters(self, n_frames, n_bins):
        # n_bins = xs.shape[0]
        # n_frames = xs.shape[1]
-        self.f=[]
+        self.f = []
-        self.f_0=[]
+        self.f_0 = []
-        self.t=[]
+        self.t = []
-        self.t_0=[]
+        self.t_0 = []
        for i in range(0, self.n_freq_masks):
-            f=int(self._rng.uniform(low=0, high=self.F))
+            f = int(self._rng.uniform(low=0, high=self.F))
            self.f.append(f)
            self.f_0.append(int(self._rng.uniform(low=0, high=n_bins - f)))
@@ -166,7 +166,7 @@ class SpecAugmentor(AugmentorBase):
            f_0 = self.f_0[i]
            xs[:, f_0:f_0 + f] = 0
            assert f_0 <= f_0 + f
        for i in range(self.n_masks):
            t = self.t[i]
            t_0 = self.t_0[i]
@@ -174,7 +174,6 @@ class SpecAugmentor(AugmentorBase):
            assert t_0 <= t_0 + t
        return xs
    # def mask_freq(self, xs, replace_with_zero=False):
    #     n_bins = xs.shape[0]
    #     for i in range(0, self.n_freq_masks):
@@ -208,7 +207,6 @@ class SpecAugmentor(AugmentorBase):
    #         self._time_mask = (t_0, t_0 + t)
    #     return xs
    # def transform_feature(self, xs: np.ndarray, single=True):
    #     """
    #     Args:

--- a/deepspeech/frontend/augmentor/speed_perturb.py
+++ b/deepspeech/frontend/augmentor/speed_perturb.py
@@ -79,7 +79,6 @@ class SpeedPerturbAugmentor(AugmentorBase):
            self._rates = np.linspace(
                self._min_rate, self._max_rate, self._num_rates, endpoint=True)
    def randomize_parameters(self):
        if self._num_rates < 0:
            self.speed_rate = self._rng.uniform(self._min_rate, self._max_rate)
@@ -92,8 +91,8 @@ class SpeedPerturbAugmentor(AugmentorBase):
            return
        audio_segment.change_speed(speed_rate)
-    def transform_audio(self, audio_segment,single=True):
+    def transform_audio(self, audio_segment, single=True):
        """Sample a new speed rate from the given range and
        changes the speed of the given audio clip.
@@ -102,7 +101,7 @@ class SpeedPerturbAugmentor(AugmentorBase):
        :param audio_segment: Audio segment to add effects to.
        :type audio_segment: AudioSegment|SpeechSegment
        """
-        if(single):
+        if (single):
            self.randomize_parameters()
        self.apply(audio_segment)

--- a/deepspeech/frontend/featurizer/audio_featurizer.py
+++ b/deepspeech/frontend/featurizer/audio_featurizer.py
@@ -195,7 +195,7 @@ class AudioFeaturizer(object):
        ind = np.where(freqs <= max_freq)[0][-1] + 1
        specgram = np.log(specgram[:ind, :] + eps)
-        specgram = np.transpose(specgram) #T,D
+        specgram = np.transpose(specgram)  #T,D
        return specgram
    def _specgram_real(self, samples, window_size, stride_size, sample_rate):
@@ -299,7 +299,7 @@ class AudioFeaturizer(object):
            ceplifter=22,
            useEnergy=True,
            winfunc='povey')
        mfcc_feat = np.transpose(mfcc_feat)
        if delta_delta:
            mfcc_feat = self._concat_delta_delta(mfcc_feat)

--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -173,7 +173,6 @@ class SpeechCollator():
        self._stride_ms = stride_ms
        self._target_sample_rate = target_sample_rate
        self._speech_featurizer = SpeechFeaturizer(
            unit_type=unit_type,
@@ -229,9 +228,10 @@ class SpeechCollator():
    def randomize_audio_parameters(self):
        self._augmentation_pipeline.randomize_parameters_audio_transform()
    def randomize_feature_parameters(self, n_frames, n_bins):
-        self._augmentation_pipeline.randomize_parameters_feature_transform(n_frames, n_bins)
+        self._augmentation_pipeline.randomize_parameters_feature_transform(
+            n_frames, n_bins)
    def process_feature_and_transform(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.
@@ -252,7 +252,7 @@ class SpeechCollator():
        # Spectrum transform
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)
@@ -261,7 +261,6 @@ class SpeechCollator():
        return specgram, transcript_part
    # def process_utterance(self, audio_file, transcript, single=True):
    #     """Load, augment, featurize and normalize for speech data.
@@ -282,11 +281,10 @@ class SpeechCollator():
    #     # audio augment
    #     self._augmentation_pipeline.transform_audio(speech_segment)
    #     # Spectrum transform
    #     specgram, transcript_part = self._speech_featurizer.featurize(
    #         speech_segment, self._keep_transcription_text)
    #     if self._normalizer:
    #         specgram = self._normalizer.apply(specgram)
@@ -350,14 +348,16 @@ class SpeechCollator():
        padded_texts = pad_sequence(
            texts, padding_value=IGNORE_ID).astype(np.int64)
        text_lens = np.array(text_lens).astype(np.int64)
        #spec augment
-        n_bins=padded_audios.shape[2]
+        n_bins = padded_audios.shape[2]
        self.randomize_feature_parameters(min(audio_lens), n_bins)
        for i in range(len(padded_audios)):
-            if not self._randomize_each_batch: 
+            if not self._randomize_each_batch:
                self.randomize_feature_parameters(audio_lens[i], n_bins)
-            padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i])
+            padded_audios[
+                i] = self._augmentation_pipeline.apply_feature_transform(
+                    padded_audios[i])
        return utts, padded_audios, audio_lens, padded_texts, text_lens