separate

ae566f66 · Haoxin Ma · 60ac4bc2 · ae566f66 · ae566f66 · ae566f66
3 changed files
--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@@ -93,7 +93,29 @@ class AugmentationPipeline():
        self._spec_augmentors, self._spec_rates = self._parse_pipeline_from(
            augmentation_config, 'feature')

-    def transform_audio(self, audio_segment, single=True):
+    def randomize_parameters_audio_transform(self):
+        """Re-draw the random parameters of every audio augmentor.
+
+        Calls ``randomize_parameters()`` on each augmentor of the audio
+        pipeline and nothing else: no audio segment is read or modified by
+        this method, so it takes no arguments besides ``self``.
+        """
+        # The per-augmentor rates are not needed to re-draw parameters.
+        for augmentor in self._augmentors:
+            augmentor.randomize_parameters()
+    
+    def randomize_parameters_feature_transform(self, audio, n_frames=None):
+        """Re-draw the random parameters of every feature augmentor.
+
+        :param audio: (D, T) feature array, or the bin count if n_frames is set.
+        :param n_frames: Frame count; None means read both sizes from audio.
+        """
+        n_bins = audio if n_frames is not None else audio.shape[0]
+        n_frames = n_frames if n_frames is not None else audio.shape[1]
+        for augmentor in self._spec_augmentors:
+            augmentor.randomize_parameters(n_bins, n_frames)
+
+    def apply_audio_transform(self, audio_segment):
        """Run the pre-processing pipeline for data augmentation.

        Note that this is an in-place transformation.
@@ -103,9 +125,9 @@ class AugmentationPipeline():
        """
        for augmentor, rate in zip(self._augmentors, self._rates):
            if self._rng.uniform(0., 1.) < rate:
-                augmentor.transform_audio(audio_segment, single)
+                augmentor.apply(audio_segment)

-    def transform_feature(self, spec_segment, single=True):
+    def apply_feature_transform(self, spec_segment):
        """spectrogram augmentation.
         
        Args:
@@ -113,9 +135,32 @@ class AugmentationPipeline():
        """
        for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
            if self._rng.uniform(0., 1.) < rate:
-                spec_segment = augmentor.transform_feature(spec_segment, single)
+                spec_segment = augmentor.apply(spec_segment)
        return spec_segment

+    # def transform_audio(self, audio_segment, single=True):
+    #     """Run the pre-processing pipeline for data augmentation.
+
+    #     Note that this is an in-place transformation.
+        
+    #     :param audio_segment: Audio segment to process.
+    #     :type audio_segment: AudioSegment|SpeechSegment
+    #     """
+    #     for augmentor, rate in zip(self._augmentors, self._rates):
+    #         if self._rng.uniform(0., 1.) < rate:
+    #             augmentor.transform_audio(audio_segment, single)
+
+    # def transform_feature(self, spec_segment, single=True):
+    #     """spectrogram augmentation.
+         
+    #     Args:
+    #         spec_segment (np.ndarray): audio feature, (D, T).
+    #     """
+    #     for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
+    #         if self._rng.uniform(0., 1.) < rate:
+    #             spec_segment = augmentor.transform_feature(spec_segment, single)
+    #     return spec_segment
+
    def _parse_pipeline_from(self, config_json, aug_type='audio'):
        """Parse the config json to build a augmentation pipelien."""
        assert aug_type in ('audio', 'feature'), aug_type

--- a/deepspeech/frontend/augmentor/spec_augment.py
+++ b/deepspeech/frontend/augmentor/spec_augment.py
@@ -124,9 +124,9 @@ class SpecAugmentor(AugmentorBase):
    def time_warp(xs, W=40):
        raise NotImplementedError
    
-    def randomize_parameters(self, xs):
-        n_bins = xs.shape[0]
-        n_frames = xs.shape[1]
+    def randomize_parameters(self, n_bins, n_frame):
+        # n_bins = xs.shape[0]
+        # n_frames = xs.shape[1]

        self.f=[]
        self.f_0=[]

--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -215,7 +215,21 @@ class SpeechCollator():
        return self._local_data.tar2object[tarpath].extractfile(
            self._local_data.tar2info[tarpath][filename])

-    def process_utterance(self, audio_file, transcript, single=True):
+    def load_audio(self, audio_file, transcript):
+        """Build a SpeechSegment, reading 'tar:' sources out of their archive."""
+        from_tar = isinstance(audio_file, str) and audio_file.startswith('tar:')
+        # 'tar:' paths are resolved to a file object inside the archive first.
+        source = self._subfile_from_tar(audio_file) if from_tar else audio_file
+        segment = SpeechSegment.from_file(source, transcript)
+        return segment
+
+    def randomize_audio_parameters(self):  # re-draw audio augmentor params
+        self._augmentation_pipeline.randomize_parameters_audio_transform()
+    
+    def randomize_feature_parameters(self, n_bins, n_frames):  # forwards to the pipeline
+        self._augmentation_pipeline.randomize_parameters_feature_transform(n_bins, n_frames)
+
+    def process_utterance(self, audio_file, transcript):
        """Load, augment, featurize and normalize for speech data.

        :param audio_file: Filepath or file object of audio file.
@@ -226,25 +240,56 @@ class SpeechCollator():
                 where transcription part could be token ids or text.
        :rtype: tuple of (2darray, list)
        """
-        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
-            speech_segment = SpeechSegment.from_file(
-                self._subfile_from_tar(audio_file), transcript)
-        else:
-            speech_segment = SpeechSegment.from_file(audio_file, transcript)
+        speech_segment = self.load_audio(audio_file, transcript)

-        # audio augment
-        self._augmentation_pipeline.transform_audio(speech_segment)
+        # apply audio augment
+        self._augmentation_pipeline.apply_audio_transform(speech_segment)

+        # Spectrum transform
        specgram, transcript_part = self._speech_featurizer.featurize(
            speech_segment, self._keep_transcription_text)
+        
        if self._normalizer:
            specgram = self._normalizer.apply(specgram)

-        # specgram augment
-
-        specgram = self._augmentation_pipeline.transform_feature(specgram)
+        # # apply specgram augment
+        # specgram = self._augmentation_pipeline.apply_feature_transform(specgram)
        return specgram, transcript_part

+
+    # def process_utterance(self, audio_file, transcript, single=True):
+    #     """Load, augment, featurize and normalize for speech data.
+
+    #     :param audio_file: Filepath or file object of audio file.
+    #     :type audio_file: str | file
+    #     :param transcript: Transcription text.
+    #     :type transcript: str
+    #     :return: Tuple of audio feature tensor and data of transcription part,
+    #              where transcription part could be token ids or text.
+    #     :rtype: tuple of (2darray, list)
+    #     """
+    #     if isinstance(audio_file, str) and audio_file.startswith('tar:'):
+    #         speech_segment = SpeechSegment.from_file(
+    #             self._subfile_from_tar(audio_file), transcript)
+    #     else:
+    #         speech_segment = SpeechSegment.from_file(audio_file, transcript)
+
+    #     # audio augment
+    #     self._augmentation_pipeline.transform_audio(speech_segment)
+
+
+    #     # Spectrum transform
+    #     specgram, transcript_part = self._speech_featurizer.featurize(
+    #         speech_segment, self._keep_transcription_text)
+        
+    #     if self._normalizer:
+    #         specgram = self._normalizer.apply(specgram)
+
+    #     # specgram augment
+
+    #     specgram = self._augmentation_pipeline.transform_feature(specgram)
+    #     return specgram, transcript_part
+
    def __call__(self, batch):
        """batch examples

@@ -269,10 +314,11 @@ class SpeechCollator():
        # print(batch)
        # print(type(batch))
        # print(len(batch))
-        resample=True
+        self.randomize_audio_parameters()
        for utt, audio, text in batch:
-            audio, text = self.process_utterance(audio, text, single=resample)
-            # resample=False
+            if not self.config.randomize_each_batch:
+                self.randomize_audio_parameters()
+            audio, text = self.process_utterance(audio, text)
            #utt
            utts.append(utt)
            # audio
@@ -298,6 +344,15 @@ class SpeechCollator():
        padded_texts = pad_sequence(
            texts, padding_value=IGNORE_ID).astype(np.int64)
        text_lens = np.array(text_lens).astype(np.int64)
+        
+        #spec augment
+        n_bins=padded_audios[0]
+        self.randomize_feature_parameters(n_bins, min(audio_lens))
+        for i in range(len(padded_audios)):
+            if not self.config.randomize_each_batch: 
+                self.randomize_feature_parameters(n_bins, audio_lens[i])
+            padded_audios[i] = self._augmentation_pipeline.apply_feature_transform(padded_audios[i])
+
        return utts, padded_audios, audio_lens, padded_texts, text_lens

    @property