From 557427736e9f2fba6715cc3ce18b3175a3c42cd8 Mon Sep 17 00:00:00 2001
From: Haoxin Ma <745165806@qq.com>
Date: Fri, 18 Jun 2021 06:41:28 +0000
Subject: [PATCH] move redundant params

---
 deepspeech/exps/deepspeech2/config.py   |  30 +++----
 deepspeech/exps/deepspeech2/model.py    |  14 ++--
 deepspeech/exps/u2/config.py            |  12 +--
 deepspeech/exps/u2/model.py             |  35 ++++----
 deepspeech/io/collator.py               |  36 ++++++--
 deepspeech/io/dataset.py                | 105 +-----------------------
 examples/aishell/s1/conf/conformer.yaml |  14 ++--
 examples/tiny/s0/conf/deepspeech2.yaml  |  10 +--
 examples/tiny/s1/conf/transformer.yaml  |  22 ++---
 9 files changed, 96 insertions(+), 182 deletions(-)

diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py
index faaff1aa..050a50b0 100644
--- a/deepspeech/exps/deepspeech2/config.py
+++ b/deepspeech/exps/deepspeech2/config.py
@@ -21,32 +21,18 @@ _C.data = CN(
         train_manifest="",
         dev_manifest="",
         test_manifest="",
-        unit_type="char",
-        vocab_filepath="",
-        spm_model_prefix="",
-        mean_std_filepath="",
-        augmentation_config="",
         max_duration=float('inf'),
         min_duration=0.0,
     ))
 
-_C.model = CN(
-    dict(
-        num_conv_layers=2,  #Number of stacking convolution layers.
-        num_rnn_layers=3,  #Number of stacking RNN layers.
-        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
-        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
-        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
-    ))
-
 _C.collator =CN(
     dict(
-        augmentation_config="",
-        random_seed=0,
-        mean_std_filepath="",
         unit_type="char",
         vocab_filepath="",
         spm_model_prefix="",
+        mean_std_filepath="",
+        augmentation_config="",
+        random_seed=0,
         specgram_type='linear',  # 'linear', 'mfcc', 'fbank'
         feat_dim=0,  # 'mfcc', 'fbank'
         delta_delta=False,  # 'mfcc', 'fbank'
@@ -65,6 +51,16 @@ _C.collator =CN(
         shuffle_method="batch_shuffle",  # 'batch_shuffle', 'instance_shuffle'
     ))
 
+_C.model = CN(
+    dict(
+        num_conv_layers=2,  #Number of stacking convolution layers.
+        num_rnn_layers=3,  #Number of stacking RNN layers.
+        rnn_layer_size=1024,  #RNN layer size (number of RNN cells).
+        use_gru=True,  #Use gru if set True. Use simple rnn if set False.
+        share_rnn_weights=True  #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported.
+    ))
+
+
 DeepSpeech2Model.params(_C.model)
 
 _C.training = CN(
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index b54192dd..1eefc871 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -143,7 +143,6 @@ class DeepSpeech2Trainer(Trainer):
         train_dataset = ManifestDataset.from_config(config)
 
         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)
 
         if self.parallel:
@@ -165,18 +164,22 @@ class DeepSpeech2Trainer(Trainer):
                 sortagrad=config.collator.sortagrad,
                 shuffle_method=config.collator.shuffle_method)
 
-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+
+
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
+            collate_fn=collate_fn_train,
             num_workers=config.collator.num_workers)
         self.valid_loader = DataLoader(
             dev_dataset,
             batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)
         logger.info("Setup train/valid Dataloader!")
 
 
@@ -324,8 +327,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         # return raw text
 
         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -337,6 +338,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
         test_dataset = ManifestDataset.from_config(config)
 
         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         # return text ord id
         self.test_loader = DataLoader(
             test_dataset,
diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py
index 42725c74..d8735453 100644
--- a/deepspeech/exps/u2/config.py
+++ b/deepspeech/exps/u2/config.py
@@ -17,21 +17,13 @@ from deepspeech.exps.u2.model import U2Tester
 from deepspeech.exps.u2.model import U2Trainer
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.models.u2 import U2Model
+from deepspeech.io.collator import SpeechCollator
 
 _C = CfgNode()
 
 _C.data = ManifestDataset.params()
 
-_C.collator =CfgNode(
-    dict(
-        augmentation_config="",
-        unit_type="char",
-        keep_transcription_text=False,
-        batch_size=32,  # batch size
-        num_workers=0,  # data loader workers
-        sortagrad=False,  # sorted in first epoch when True
-        shuffle_method="batch_shuffle"  # 'batch_shuffle', 'instance_shuffle'
-    ))
+_C.collator = SpeechCollator.params()
 
 _C.model = U2Model.params()
 
diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index 164903e6..836afa36 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -100,7 +100,7 @@ class U2Trainer(Trainer):
 
         if (batch_index + 1) % train_conf.log_interval == 0:
             msg += "train time: {:>.3f}s, ".format(iteration_time)
-            msg += "batch size: {}, ".format(self.config.data.batch_size)
+            msg += "batch size: {}, ".format(self.config.collator.batch_size)
             msg += "accum: {}, ".format(train_conf.accum_grad)
             msg += ', '.join('{}: {:>.6f}'.format(k, v)
                              for k, v in losses_np.items())
@@ -211,51 +211,52 @@ class U2Trainer(Trainer):
     def setup_dataloader(self):
         config = self.config.clone()
         config.defrost()
-        config.data.keep_transcription_text = False
+        config.collator.keep_transcription_text = False
 
         # train/valid dataset, return token ids
         config.data.manifest = config.data.train_manifest
         train_dataset = ManifestDataset.from_config(config)
 
         config.data.manifest = config.data.dev_manifest
-        config.data.augmentation_config = ""
         dev_dataset = ManifestDataset.from_config(config)
 
-        collate_fn = SpeechCollator.from_config(config)
+        collate_fn_train = SpeechCollator.from_config(config)
+        
+        config.collator.augmentation_config = ""
+        collate_fn_dev = SpeechCollator.from_config(config)
+
         if self.parallel:
             batch_sampler = SortagradDistributedBatchSampler(
                 train_dataset,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 num_replicas=None,
                 rank=None,
                 shuffle=True,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         else:
             batch_sampler = SortagradBatchSampler(
                 train_dataset,
                 shuffle=True,
-                batch_size=config.data.batch_size,
+                batch_size=config.collator.batch_size,
                 drop_last=True,
-                sortagrad=config.data.sortagrad,
-                shuffle_method=config.data.shuffle_method)
+                sortagrad=config.collator.sortagrad,
+                shuffle_method=config.collator.shuffle_method)
         self.train_loader = DataLoader(
             train_dataset,
             batch_sampler=batch_sampler,
-            collate_fn=collate_fn,
-            num_workers=config.data.num_workers, )
+            collate_fn=collate_fn_train,
+            num_workers=config.collator.num_workers, )
         self.valid_loader = DataLoader(
             dev_dataset,
-            batch_size=config.data.batch_size,
+            batch_size=config.collator.batch_size,
             shuffle=False,
             drop_last=False,
-            collate_fn=collate_fn)
+            collate_fn=collate_fn_dev)
 
         # test dataset, return raw text
         config.data.manifest = config.data.test_manifest
-        config.data.keep_transcription_text = True
-        config.data.augmentation_config = ""
         # filter test examples, will cause less examples, but no mismatch with training
         # and can use large batch size , save training time, so filter test egs now.
         # config.data.min_input_len = 0.0  # second
@@ -264,9 +265,11 @@ class U2Trainer(Trainer):
         # config.data.max_output_len = float('inf')  # tokens
         # config.data.min_output_input_ratio = 0.00
         # config.data.max_output_input_ratio = float('inf')
+
         test_dataset = ManifestDataset.from_config(config)
         # return text ord id
         config.collator.keep_transcription_text = True
+        config.collator.augmentation_config = ""
         self.test_loader = DataLoader(
             test_dataset,
             batch_size=config.decoding.batch_size,
diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py
index ac817a19..ab1e9165 100644
--- a/deepspeech/io/collator.py
+++ b/deepspeech/io/collator.py
@@ -75,8 +75,8 @@ class SpeechCollator():
         """
         assert 'augmentation_config' in config.collator
         assert 'keep_transcription_text' in config.collator
-        assert 'mean_std_filepath' in config.data
-        assert 'vocab_filepath' in config.data
+        assert 'mean_std_filepath' in config.collator
+        assert 'vocab_filepath' in config.collator
         assert 'specgram_type' in config.collator
         assert 'n_fft' in config.collator
         assert config.collator
@@ -94,9 +94,9 @@ class SpeechCollator():
         speech_collator = cls(
                 aug_file=aug_file,
                 random_seed=0,
-                mean_std_filepath=config.data.mean_std_filepath,
+                mean_std_filepath=config.collator.mean_std_filepath,
                 unit_type=config.collator.unit_type,
-                vocab_filepath=config.data.vocab_filepath,
+                vocab_filepath=config.collator.vocab_filepath,
                 spm_model_prefix=config.collator.spm_model_prefix,
                 specgram_type=config.collator.specgram_type, 
                 feat_dim=config.collator.feat_dim, 
@@ -129,11 +129,31 @@ class SpeechCollator():
                 target_dB=-20,
                 dither=1.0,
                 keep_transcription_text=True):
-        """
-        Padding audio features with zeros to make them have the same shape (or
-        a user-defined shape) within one bach.
+        """Collator that augments and pads speech features into one batch.
 
-        if ``keep_transcription_text`` is False, text is token ids else is raw string.
+        Args:
+            unit_type(str): token unit type, e.g. char, word, spm
+            vocab_filepath (str): vocab file path.
+            mean_std_filepath (str): mean and std file path, e.g. data/mean_std.json
+            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
+            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
+            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
+            window_ms (float, optional): window size in ms. Defaults to 20.0.
+            n_fft (int, optional): fft points for rfft. Defaults to None.
+            max_freq (int, optional): max cut freq. Defaults to None.
+            target_sample_rate (int, optional): target sample rate which is used for training. Defaults to 16000.
+            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
+            feat_dim (int, optional): audio feature dim, used by 'mfcc' or 'fbank'. Defaults to None.
+            delta_delta (bool, optional): audio feature with delta-delta, used by 'fbank' or 'mfcc'. Defaults to False.
+            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
+            target_dB (int, optional): target dB. Defaults to -20.
+            random_seed (int, optional): for random generator. Defaults to 0.
+            keep_transcription_text (bool, optional): if False, text is converted to token ids;
+                if True, text is kept as the raw string (no tokenization). Defaults to True.
+        
+        Behavior: applies augmentations, and pads audio features with zeros
+        so that they have the same shape (or a user-defined shape) within
+        one batch.
         """
         self._keep_transcription_text = keep_transcription_text
 
diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py
index 24d8486a..70383b4d 100644
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -40,15 +40,7 @@ class ManifestDataset(Dataset):
     def params(cls, config: Optional[CfgNode]=None) -> CfgNode:
         default = CfgNode(
             dict(
-                train_manifest="",
-                dev_manifest="",
-                test_manifest="",
                 manifest="",
-                unit_type="char",
-                vocab_filepath="",
-                spm_model_prefix="",
-                mean_std_filepath="",
-                augmentation_config="",
                 max_input_len=27.0,
                 min_input_len=0.0,
                 max_output_len=float('inf'),
@@ -73,25 +65,10 @@ class ManifestDataset(Dataset):
         """
         assert 'manifest' in config.data
         assert config.data.manifest
-        assert 'keep_transcription_text' in config.collator
-
-        if isinstance(config.data.augmentation_config, (str, bytes)):
-            if config.data.augmentation_config:
-                aug_file = io.open(
-                    config.data.augmentation_config, mode='r', encoding='utf8')
-            else:
-                aug_file = io.StringIO(initial_value='{}', newline='')
-        else:
-            aug_file = config.data.augmentation_config
-            assert isinstance(aug_file, io.StringIO)
+
 
         dataset = cls(
             manifest_path=config.data.manifest,
-            unit_type=config.data.unit_type,
-            vocab_filepath=config.data.vocab_filepath,
-            mean_std_filepath=config.data.mean_std_filepath,
-            spm_model_prefix=config.data.spm_model_prefix,
-            augmentation_config=aug_file.read(),
             max_input_len=config.data.max_input_len,
             min_input_len=config.data.min_input_len,
             max_output_len=config.data.max_output_len,
@@ -101,23 +78,8 @@ class ManifestDataset(Dataset):
             )
         return dataset
 
-    
-    def _read_vocab(self, vocab_filepath):
-        """Load vocabulary from file."""
-        vocab_lines = []
-        with open(vocab_filepath, 'r', encoding='utf-8') as file:
-            vocab_lines.extend(file.readlines())
-        vocab_list = [line[:-1] for line in vocab_lines]
-        return vocab_list
-
-
     def __init__(self,
                  manifest_path,
-                 unit_type,
-                 vocab_filepath,
-                 mean_std_filepath,
-                 spm_model_prefix=None,
-                 augmentation_config='{}',
                  max_input_len=float('inf'),
                  min_input_len=0.0,
                  max_output_len=float('inf'),
@@ -128,34 +90,16 @@ class ManifestDataset(Dataset):
 
         Args:
             manifest_path (str): manifest josn file path
-            unit_type(str): token unit type, e.g. char, word, spm
-            vocab_filepath (str): vocab file path.
-            mean_std_filepath (str): mean and std file path, which suffix is *.npy
-            spm_model_prefix (str): spm model prefix, need if `unit_type` is spm.
-            augmentation_config (str, optional): augmentation json str. Defaults to '{}'.
             max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf').
             min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0.
             max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0.
             min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0.
             max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0.
             min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05.
-            stride_ms (float, optional): stride size in ms. Defaults to 10.0.
-            window_ms (float, optional): window size in ms. Defaults to 20.0.
-            n_fft (int, optional): fft points for rfft. Defaults to None.
-            max_freq (int, optional): max cut freq. Defaults to None.
-            target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000.
-            specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'.
-            feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None.
-            delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False.
-            use_dB_normalization (bool, optional): do dB normalization. Defaults to True.
-            target_dB (int, optional): target dB. Defaults to -20.
-            random_seed (int, optional): for random generator. Defaults to 0.
-            keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False.
+        
         """
         super().__init__()
 
-        # self._rng = np.random.RandomState(random_seed)
-
         # read manifest
         self._manifest = read_manifest(
             manifest_path=manifest_path,
@@ -167,51 +111,6 @@ class ManifestDataset(Dataset):
             min_output_input_ratio=min_output_input_ratio)
         self._manifest.sort(key=lambda x: x["feat_shape"][0])
 
-        # self._vocab_list = self._read_vocab(vocab_filepath)
-
-
-    # @property
-    # def manifest(self):
-    #     return self._manifest
-    
-    # @property
-    # def vocab_size(self):
-    #     """Return the vocabulary size.
-
-    #     Returns:
-    #         int: Vocabulary size.
-    #     """
-    #     return len(self._vocab_list)
-
-    # @property
-    # def vocab_list(self):
-    #     """Return the vocabulary in list.
-
-    #     Returns:
-    #         List[str]: 
-    #     """
-    #     return self._vocab_list
-
-    # @property
-    # def vocab_dict(self):
-    #     """Return the vocabulary in dict.
-
-    #     Returns:
-    #         Dict[str, int]: 
-    #     """
-    #     vocab_dict = dict(
-    #         [(token, idx) for (idx, token) in enumerate(self._vocab_list)])
-    #     return vocab_dict
-
-    # @property
-    # def feature_size(self):
-    #     """Return the audio feature size.
-
-    #     Returns:
-    #         int: audio feature size.
-    #     """
-    #     return self._manifest[0]["feat_shape"][-1]
-
 
     def __len__(self):
         return len(self._manifest)
diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml
index b880f858..116c9192 100644
--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -3,17 +3,20 @@ data:
   train_manifest: data/manifest.train
   dev_manifest: data/manifest.dev
   test_manifest: data/manifest.test
-  vocab_filepath: data/vocab.txt 
-  unit_type: 'char'
-  spm_model_prefix: ''
-  augmentation_config: conf/augmentation.json
-  batch_size: 64
   min_input_len: 0.5
   max_input_len: 20.0 # second
   min_output_len: 0.0
   max_output_len: 400.0
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
+
+
+collator:
+  vocab_filepath: data/vocab.txt 
+  unit_type: 'char'
+  spm_model_prefix: ''
+  augmentation_config: conf/augmentation.json
+  batch_size: 64
   raw_wav: True  # use raw_wav or kaldi feature
   specgram_type: fbank #linear, mfcc, fbank
   feat_dim: 80
@@ -32,7 +35,6 @@ data:
   shuffle_method: batch_shuffle
   num_workers: 2
 
-
 # network architecture
 model:
     cmvn_file: "data/mean_std.json"
diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml
index 434cf264..6737d1b7 100644
--- a/examples/tiny/s0/conf/deepspeech2.yaml
+++ b/examples/tiny/s0/conf/deepspeech2.yaml
@@ -2,10 +2,7 @@
 data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
-  test_manifest: data/manifest.tiny
-  mean_std_filepath: data/mean_std.json
-  unit_type: char
-  vocab_filepath: data/vocab.txt 
+  test_manifest: data/manifest.tiny 
   min_input_len: 0.0
   max_input_len: 27.0
   min_output_len: 0.0
@@ -15,6 +12,9 @@ data:
 
 
 collator:
+  mean_std_filepath: data/mean_std.json
+  unit_type: char
+  vocab_filepath: data/vocab.txt
   augmentation_config: conf/augmentation.json
   random_seed: 0
   spm_model_prefix: 
@@ -43,7 +43,7 @@ model:
   share_rnn_weights: True 
 
 training:
-  n_epoch: 23
+  n_epoch: 24
   lr: 1e-5 
   lr_decay: 1.0 
   weight_decay: 1e-06
diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml
index 5e28e4e8..250995fa 100644
--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -3,26 +3,20 @@ data:
   train_manifest: data/manifest.tiny
   dev_manifest: data/manifest.tiny
   test_manifest: data/manifest.tiny
-  vocab_filepath: data/vocab.txt 
-  unit_type: 'spm'
-  spm_model_prefix: 'data/bpe_unigram_200'
-  mean_std_filepath: ""
-  batch_size: 4
   min_input_len: 0.5  # second
   max_input_len: 20.0 # second
   min_output_len: 0.0 # tokens
   max_output_len: 400.0 # tokens
   min_output_input_ratio: 0.05
   max_output_input_ratio: 10.0
-  raw_wav: True  # use raw_wav or kaldi feature
-  sortagrad: True 
-  shuffle_method: batch_shuffle
-  num_workers: 0 #2
-
+  
 collator:
+  vocab_filepath: data/vocab.txt 
+  mean_std_filepath: ""
   augmentation_config: conf/augmentation.json
   random_seed: 0
-  spm_model_prefix: 
+  unit_type: 'spm'
+  spm_model_prefix: 'data/bpe_unigram_200'
   specgram_type: fbank
   feat_dim: 80
   delta_delta: False
@@ -35,6 +29,12 @@ collator:
   target_dB: -20
   dither: 1.0
   keep_transcription_text: False
+  batch_size: 4
+  sortagrad: True 
+  shuffle_method: batch_shuffle
+  num_workers: 0 #2
+  raw_wav: True  # use raw_wav or kaldi feature
+
 
 # network architecture
 model:
-- 
GitLab