move process utt to collator

279348d7 · Haoxin Ma · 8781ab58 · 279348d7 · 279348d7 · 279348d7
4 changed file
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -165,7 +165,7 @@ class DeepSpeech2Trainer(Trainer):
                sortagrad=config.data.sortagrad,
                shuffle_method=config.data.shuffle_method)

-        collate_fn = SpeechCollator(keep_transcription_text=False)
+        collate_fn = SpeechCollator(config, keep_transcription_text=False)
        self.train_loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,

--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -16,14 +16,22 @@ import numpy as np
 from deepspeech.frontend.utility import IGNORE_ID
 from deepspeech.io.utility import pad_sequence
 from deepspeech.utils.log import Log
+from deepspeech.frontend.augmentor.augmentation import AugmentationPipeline
+from deepspeech.frontend.featurizer.speech_featurizer import SpeechFeaturizer
+from deepspeech.frontend.normalizer import FeatureNormalizer
+from deepspeech.frontend.speech import SpeechSegment
+import io
+import time

 __all__ = ["SpeechCollator"]

 logger = Log(__name__).getlog()

+# namedtupe need global for pickle.
+TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])

 class SpeechCollator():
-    def __init__(self, keep_transcription_text=True):
+    def __init__(self, config, keep_transcription_text=True):
        """
        Padding audio features with zeros to make them have the same shape (or
        a user-defined shape) within one bach.
@@ -32,6 +40,112 @@ class SpeechCollator():
        """
        self._keep_transcription_text = keep_transcription_text

+        if isinstance(config.data.augmentation_config, (str, bytes)):
+            if config.data.augmentation_config:
+                aug_file = io.open(
+                    config.data.augmentation_config, mode='r', encoding='utf8')
+            else:
+                aug_file = io.StringIO(initial_value='{}', newline='')
+        else:
+            aug_file = config.data.augmentation_config
+            assert isinstance(aug_file, io.StringIO)
+
+        self._local_data = TarLocalData(tar2info={}, tar2object={}）
+        self._augmentation_pipeline = AugmentationPipeline(
+            augmentation_config=aug_file.read(), 
+            random_seed=config.data.random_seed)
+        
+        self._normalizer = FeatureNormalizer(
+            config.data.mean_std_filepath) if config.data.mean_std_filepath else None
+
+        self._stride_ms = config.data.stride_ms
+        self._target_sample_rate = config.data.target_sample_rate
+
+        self._speech_featurizer = SpeechFeaturizer(
+            unit_type=config.data.unit_type,
+            vocab_filepath=config.data.vocab_filepath,
+            spm_model_prefix=config.data.spm_model_prefix,
+            specgram_type=config.data.specgram_type,
+            feat_dim=config.data.feat_dim,
+            delta_delta=config.data.delta_delta,
+            stride_ms=config.data.stride_ms,
+            window_ms=config.data.window_ms,
+            n_fft=config.data.n_fft,
+            max_freq=config.data.max_freq,
+            target_sample_rate=config.data.target_sample_rate,
+            use_dB_normalization=config.data.use_dB_normalization,
+            target_dB=config.data.target_dB,
+            dither=config.data.dither)
+
+    def _parse_tar(self, file):
+        """Parse a tar file to get a tarfile object
+        and a map containing tarinfoes
+        """
+        result = {}
+        f = tarfile.open(file)
+        for tarinfo in f.getmembers():
+            result[tarinfo.name] = tarinfo
+        return f, result
+
+    def _subfile_from_tar(self, file):
+        """Get subfile object from tar.
+
+        It will return a subfile object from tar file
+        and cached tar file info for next reading request.
+        """
+        tarpath, filename = file.split(':', 1)[1].split('#', 1)
+        if 'tar2info' not in self._local_data.__dict__:
+            self._local_data.tar2info = {}
+        if 'tar2object' not in self._local_data.__dict__:
+            self._local_data.tar2object = {}
+        if tarpath not in self._local_data.tar2info:
+            object, infoes = self._parse_tar(tarpath)
+            self._local_data.tar2info[tarpath] = infoes
+            self._local_data.tar2object[tarpath] = object
+        return self._local_data.tar2object[tarpath].extractfile(
+            self._local_data.tar2info[tarpath][filename])
+
+    def process_utterance(self, audio_file, transcript):
+        """Load, augment, featurize and normalize for speech data.
+
+        :param audio_file: Filepath or file object of audio file.
+        :type audio_file: str | file
+        :param transcript: Transcription text.
+        :type transcript: str
+        :return: Tuple of audio feature tensor and data of transcription part,
+                 where transcription part could be token ids or text.
+        :rtype: tuple of (2darray, list)
+        """
+        start_time = time.time()
+        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
+            speech_segment = SpeechSegment.from_file(
+                self._subfile_from_tar(audio_file), transcript)
+        else:
+            speech_segment = SpeechSegment.from_file(audio_file, transcript)
+        load_wav_time = time.time() - start_time
+        #logger.debug(f"load wav time: {load_wav_time}")
+
+        # audio augment
+        start_time = time.time()
+        self._augmentation_pipeline.transform_audio(speech_segment)
+        audio_aug_time = time.time() - start_time
+        #logger.debug(f"audio augmentation time: {audio_aug_time}")
+
+        start_time = time.time()
+        specgram, transcript_part = self._speech_featurizer.featurize(
+            speech_segment, self._keep_transcription_text)
+        if self._normalizer:
+            specgram = self._normalizer.apply(specgram)
+        feature_time = time.time() - start_time
+        #logger.debug(f"audio & test feature time: {feature_time}")
+
+        # specgram augment
+        start_time = time.time()
+        specgram = self._augmentation_pipeline.transform_feature(specgram)
+        feature_aug_time = time.time() - start_time
+        #logger.debug(f"audio feature augmentation time: {feature_aug_time}")
+        return specgram, transcript_part
+
    def __call__(self, batch):
        """batch examples

@@ -53,6 +167,7 @@ class SpeechCollator():
        text_lens = []
        utts = []
        for utt, audio, text in batch:
+            audio, text = self.process_utterance(audio, text)
            #utt
            utts.append(utt)
            # audio

--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -34,9 +34,6 @@ __all__ = [

 logger = Log(__name__).getlog()

-# namedtupe need global for pickle.
-TarLocalData = namedtuple('TarLocalData', ['tar2info', 'tar2object'])
-

 class ManifestDataset(Dataset):
    @classmethod
@@ -192,10 +189,6 @@ class ManifestDataset(Dataset):
        self._stride_ms = stride_ms
        self._target_sample_rate = target_sample_rate

-        self._normalizer = FeatureNormalizer(
-            mean_std_filepath) if mean_std_filepath else None
-        self._augmentation_pipeline = AugmentationPipeline(
-            augmentation_config=augmentation_config, random_seed=random_seed)
        self._speech_featurizer = SpeechFeaturizer(
            unit_type=unit_type,
            vocab_filepath=vocab_filepath,
@@ -214,8 +207,6 @@ class ManifestDataset(Dataset):

        self._rng = np.random.RandomState(random_seed)
        self._keep_transcription_text = keep_transcription_text
-        # for caching tar files info
-        self._local_data = TarLocalData(tar2info={}, tar2object={})

        # read manifest
        self._manifest = read_manifest(
@@ -256,74 +247,7 @@ class ManifestDataset(Dataset):
    def stride_ms(self):
        return self._speech_featurizer.stride_ms

-    def _parse_tar(self, file):
-        """Parse a tar file to get a tarfile object
-        and a map containing tarinfoes
-        """
-        result = {}
-        f = tarfile.open(file)
-        for tarinfo in f.getmembers():
-            result[tarinfo.name] = tarinfo
-        return f, result
-
-    def _subfile_from_tar(self, file):
-        """Get subfile object from tar.

-        It will return a subfile object from tar file
-        and cached tar file info for next reading request.
-        """
-        tarpath, filename = file.split(':', 1)[1].split('#', 1)
-        if 'tar2info' not in self._local_data.__dict__:
-            self._local_data.tar2info = {}
-        if 'tar2object' not in self._local_data.__dict__:
-            self._local_data.tar2object = {}
-        if tarpath not in self._local_data.tar2info:
-            object, infoes = self._parse_tar(tarpath)
-            self._local_data.tar2info[tarpath] = infoes
-            self._local_data.tar2object[tarpath] = object
-        return self._local_data.tar2object[tarpath].extractfile(
-            self._local_data.tar2info[tarpath][filename])
-
-    def process_utterance(self, utt, audio_file, transcript):
-        """Load, augment, featurize and normalize for speech data.
-
-        :param audio_file: Filepath or file object of audio file.
-        :type audio_file: str | file
-        :param transcript: Transcription text.
-        :type transcript: str
-        :return: Tuple of audio feature tensor and data of transcription part,
-                 where transcription part could be token ids or text.
-        :rtype: tuple of (2darray, list)
-        """
-        start_time = time.time()
-        if isinstance(audio_file, str) and audio_file.startswith('tar:'):
-            speech_segment = SpeechSegment.from_file(
-                self._subfile_from_tar(audio_file), transcript)
-        else:
-            speech_segment = SpeechSegment.from_file(audio_file, transcript)
-        load_wav_time = time.time() - start_time
-        #logger.debug(f"load wav time: {load_wav_time}")
-
-        # audio augment
-        start_time = time.time()
-        self._augmentation_pipeline.transform_audio(speech_segment)
-        audio_aug_time = time.time() - start_time
-        #logger.debug(f"audio augmentation time: {audio_aug_time}")
-
-        start_time = time.time()
-        specgram, transcript_part = self._speech_featurizer.featurize(
-            speech_segment, self._keep_transcription_text)
-        if self._normalizer:
-            specgram = self._normalizer.apply(specgram)
-        feature_time = time.time() - start_time
-        #logger.debug(f"audio & test feature time: {feature_time}")
-
-        # specgram augment
-        start_time = time.time()
-        specgram = self._augmentation_pipeline.transform_feature(specgram)
-        feature_aug_time = time.time() - start_time
-        #logger.debug(f"audio feature augmentation time: {feature_aug_time}")
-        return utt, specgram, transcript_part

    def _instance_reader_creator(self, manifest):
        """
@@ -336,8 +260,6 @@ class ManifestDataset(Dataset):

        def reader():
            for instance in manifest:
-                # inst = self.process_utterance(instance["feat"],
-                #                               instance["text"])
                inst = self.process_utterance(instance["utt"], instance["feat"],
                                              instance["text"])
                yield inst
@@ -349,6 +271,4 @@ class ManifestDataset(Dataset):

    def __getitem__(self, idx):
        instance = self._manifest[idx]
-        return self.process_utterance(instance["utt"], instance["feat"],
-                                      instance["text"])
-        # return self.process_utterance(instance["feat"], instance["text"])
+        return(instance["utt"], instance["feat"], instance["text"])
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -6,7 +6,7 @@ data:
  mean_std_filepath: data/mean_std.json
  vocab_filepath: data/vocab.txt 
  augmentation_config: conf/augmentation.json
-  batch_size: 4
+  batch_size: 2
  min_input_len: 0.0
  max_input_len: 27.0
  min_output_len: 0.0
@@ -37,7 +37,7 @@ model:
  share_rnn_weights: True 

 training:
-  n_epoch: 20
+  n_epoch: 10
  lr: 1e-5 
  lr_decay: 1.0 
  weight_decay: 1e-06