diff --git a/deepspeech/exps/deepspeech2/config.py b/deepspeech/exps/deepspeech2/config.py index faaff1aad4206194e4704dc7757ac23203363b7e..050a50b00a70e2a9546886b28d530a6b5e694fe3 100644 --- a/deepspeech/exps/deepspeech2/config.py +++ b/deepspeech/exps/deepspeech2/config.py @@ -21,32 +21,18 @@ _C.data = CN( train_manifest="", dev_manifest="", test_manifest="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - mean_std_filepath="", - augmentation_config="", max_duration=float('inf'), min_duration=0.0, )) -_C.model = CN( - dict( - num_conv_layers=2, #Number of stacking convolution layers. - num_rnn_layers=3, #Number of stacking RNN layers. - rnn_layer_size=1024, #RNN layer size (number of RNN cells). - use_gru=True, #Use gru if set True. Use simple rnn if set False. - share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. - )) - _C.collator =CN( dict( - augmentation_config="", - random_seed=0, - mean_std_filepath="", unit_type="char", vocab_filepath="", spm_model_prefix="", + mean_std_filepath="", + augmentation_config="", + random_seed=0, specgram_type='linear', # 'linear', 'mfcc', 'fbank' feat_dim=0, # 'mfcc', 'fbank' delta_delta=False, # 'mfcc', 'fbank' @@ -65,6 +51,16 @@ _C.collator =CN( shuffle_method="batch_shuffle", # 'batch_shuffle', 'instance_shuffle' )) +_C.model = CN( + dict( + num_conv_layers=2, #Number of stacking convolution layers. + num_rnn_layers=3, #Number of stacking RNN layers. + rnn_layer_size=1024, #RNN layer size (number of RNN cells). + use_gru=True, #Use gru if set True. Use simple rnn if set False. + share_rnn_weights=True #Whether to share input-hidden weights between forward and backward directional RNNs.Notice that for GRU, weight sharing is not supported. 
+ )) + + DeepSpeech2Model.params(_C.model) _C.training = CN( diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py index b54192dd3579777706d38a951109241b355713d7..1eefc871bb6036f85afbcc0a61ca7013a41eb91d 100644 --- a/deepspeech/exps/deepspeech2/model.py +++ b/deepspeech/exps/deepspeech2/model.py @@ -143,7 +143,6 @@ class DeepSpeech2Trainer(Trainer): train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) if self.parallel: @@ -165,18 +164,22 @@ class DeepSpeech2Trainer(Trainer): sortagrad=config.collator.sortagrad, shuffle_method=config.collator.shuffle_method) - collate_fn = SpeechCollator.from_config(config) + collate_fn_train = SpeechCollator.from_config(config) + + + config.collator.augmentation_config = "" + collate_fn_dev = SpeechCollator.from_config(config) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, - collate_fn=collate_fn, + collate_fn=collate_fn_train, num_workers=config.collator.num_workers) self.valid_loader = DataLoader( dev_dataset, batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn) + collate_fn=collate_fn_dev) logger.info("Setup train/valid Dataloader!") @@ -324,8 +327,6 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): # return raw text config.data.manifest = config.data.test_manifest - config.data.keep_transcription_text = True - config.data.augmentation_config = "" # filter test examples, will cause less examples, but no mismatch with training # and can use large batch size , save training time, so filter test egs now. 
# config.data.min_input_len = 0.0 # second @@ -337,6 +338,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer): test_dataset = ManifestDataset.from_config(config) config.collator.keep_transcription_text = True + config.collator.augmentation_config = "" # return text ord id self.test_loader = DataLoader( test_dataset, diff --git a/deepspeech/exps/u2/config.py b/deepspeech/exps/u2/config.py index 42725c74fad3a3e0068dc103a0ffc4188594b713..d8735453c641f452b5516f8a8a07f40acf65b86a 100644 --- a/deepspeech/exps/u2/config.py +++ b/deepspeech/exps/u2/config.py @@ -17,21 +17,13 @@ from deepspeech.exps.u2.model import U2Tester from deepspeech.exps.u2.model import U2Trainer from deepspeech.io.dataset import ManifestDataset from deepspeech.models.u2 import U2Model +from deepspeech.io.collator import SpeechCollator _C = CfgNode() _C.data = ManifestDataset.params() -_C.collator =CfgNode( - dict( - augmentation_config="", - unit_type="char", - keep_transcription_text=False, - batch_size=32, # batch size - num_workers=0, # data loader workers - sortagrad=False, # sorted in first epoch when True - shuffle_method="batch_shuffle" # 'batch_shuffle', 'instance_shuffle' - )) +_C.collator = SpeechCollator.params() _C.model = U2Model.params() diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py index 164903e69fd24688647e74a9df0843ddd31a4e82..836afa361599704a0292f6538a52ec0c680d754b 100644 --- a/deepspeech/exps/u2/model.py +++ b/deepspeech/exps/u2/model.py @@ -100,7 +100,7 @@ class U2Trainer(Trainer): if (batch_index + 1) % train_conf.log_interval == 0: msg += "train time: {:>.3f}s, ".format(iteration_time) - msg += "batch size: {}, ".format(self.config.data.batch_size) + msg += "batch size: {}, ".format(self.config.collator.batch_size) msg += "accum: {}, ".format(train_conf.accum_grad) msg += ', '.join('{}: {:>.6f}'.format(k, v) for k, v in losses_np.items()) @@ -211,51 +211,52 @@ class U2Trainer(Trainer): def setup_dataloader(self): config = self.config.clone() 
config.defrost() - config.data.keep_transcription_text = False + config.collator.keep_transcription_text = False # train/valid dataset, return token ids config.data.manifest = config.data.train_manifest train_dataset = ManifestDataset.from_config(config) config.data.manifest = config.data.dev_manifest - config.data.augmentation_config = "" dev_dataset = ManifestDataset.from_config(config) - collate_fn = SpeechCollator.from_config(config) + collate_fn_train = SpeechCollator.from_config(config) + + config.collator.augmentation_config = "" + collate_fn_dev = SpeechCollator.from_config(config) + if self.parallel: batch_sampler = SortagradDistributedBatchSampler( train_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, num_replicas=None, rank=None, shuffle=True, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) else: batch_sampler = SortagradBatchSampler( train_dataset, shuffle=True, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, drop_last=True, - sortagrad=config.data.sortagrad, - shuffle_method=config.data.shuffle_method) + sortagrad=config.collator.sortagrad, + shuffle_method=config.collator.shuffle_method) self.train_loader = DataLoader( train_dataset, batch_sampler=batch_sampler, - collate_fn=collate_fn, - num_workers=config.data.num_workers, ) + collate_fn=collate_fn_train, + num_workers=config.collator.num_workers, ) self.valid_loader = DataLoader( dev_dataset, - batch_size=config.data.batch_size, + batch_size=config.collator.batch_size, shuffle=False, drop_last=False, - collate_fn=collate_fn) + collate_fn=collate_fn_dev) # test dataset, return raw text config.data.manifest = config.data.test_manifest - config.data.keep_transcription_text = True - config.data.augmentation_config = "" # filter test examples, will cause less examples, but no mismatch with training # 
and can use large batch size , save training time, so filter test egs now. # config.data.min_input_len = 0.0 # second @@ -264,9 +265,11 @@ class U2Trainer(Trainer): # config.data.max_output_len = float('inf') # tokens # config.data.min_output_input_ratio = 0.00 # config.data.max_output_input_ratio = float('inf') + test_dataset = ManifestDataset.from_config(config) # return text ord id config.collator.keep_transcription_text = True + config.collator.augmentation_config = "" self.test_loader = DataLoader( test_dataset, batch_size=config.decoding.batch_size, diff --git a/deepspeech/io/collator.py b/deepspeech/io/collator.py index ac817a192d5c3c29d3b33e31ebb71cb2aba491da..ab1e9165279a44f0b0fa5ce381de7ae5a0f6c6a0 100644 --- a/deepspeech/io/collator.py +++ b/deepspeech/io/collator.py @@ -75,8 +75,8 @@ class SpeechCollator(): """ assert 'augmentation_config' in config.collator assert 'keep_transcription_text' in config.collator - assert 'mean_std_filepath' in config.data - assert 'vocab_filepath' in config.data + assert 'mean_std_filepath' in config.collator + assert 'vocab_filepath' in config.collator assert 'specgram_type' in config.collator assert 'n_fft' in config.collator assert config.collator @@ -94,9 +94,9 @@ class SpeechCollator(): speech_collator = cls( aug_file=aug_file, random_seed=0, - mean_std_filepath=config.data.mean_std_filepath, + mean_std_filepath=config.collator.mean_std_filepath, unit_type=config.collator.unit_type, - vocab_filepath=config.data.vocab_filepath, + vocab_filepath=config.collator.vocab_filepath, spm_model_prefix=config.collator.spm_model_prefix, specgram_type=config.collator.specgram_type, feat_dim=config.collator.feat_dim, @@ -129,11 +129,31 @@ class SpeechCollator(): target_dB=-20, dither=1.0, keep_transcription_text=True): - """ - Padding audio features with zeros to make them have the same shape (or - a user-defined shape) within one bach. 
+ """Speech collator. - if ``keep_transcription_text`` is False, text is token ids else is raw string. + Args: + unit_type (str): token unit type, e.g. char, word, spm. + vocab_filepath (str): vocab file path. + mean_std_filepath (str): mean and std file path, whose suffix is *.npy. + spm_model_prefix (str): spm model prefix, needed if `unit_type` is spm. + augmentation_config (str, optional): augmentation json str. Defaults to '{}'. + stride_ms (float, optional): stride size in ms. Defaults to 10.0. + window_ms (float, optional): window size in ms. Defaults to 20.0. + n_fft (int, optional): fft points for rfft. Defaults to None. + max_freq (int, optional): max cut freq. Defaults to None. + target_sample_rate (int, optional): target sample rate used for training. Defaults to 16000. + specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. + feat_dim (int, optional): audio feature dim, used by 'mfcc' or 'fbank'. Defaults to None. + delta_delta (bool, optional): audio feature with delta-delta, used by 'fbank' or 'mfcc'. Defaults to False. + use_dB_normalization (bool, optional): do dB normalization. Defaults to True. + target_dB (int, optional): target dB. Defaults to -20. + random_seed (int, optional): seed for the random generator. Defaults to 0. + keep_transcription_text (bool, optional): if True (when not in training mode), the text is not tokenized. Defaults to True. + If ``keep_transcription_text`` is False, text is token ids; otherwise it is the raw string. + + Applies augmentations and + pads audio features with zeros to make them have the same shape (or + a user-defined shape) within one batch. 
""" self._keep_transcription_text = keep_transcription_text diff --git a/deepspeech/io/dataset.py b/deepspeech/io/dataset.py index 24d8486a8eba73528b63f13ba028e8ab6a7b7f05..70383b4daeab05409bdf3889b32bf684154fd4d2 100644 --- a/deepspeech/io/dataset.py +++ b/deepspeech/io/dataset.py @@ -40,15 +40,7 @@ class ManifestDataset(Dataset): def params(cls, config: Optional[CfgNode]=None) -> CfgNode: default = CfgNode( dict( - train_manifest="", - dev_manifest="", - test_manifest="", manifest="", - unit_type="char", - vocab_filepath="", - spm_model_prefix="", - mean_std_filepath="", - augmentation_config="", max_input_len=27.0, min_input_len=0.0, max_output_len=float('inf'), @@ -73,25 +65,10 @@ class ManifestDataset(Dataset): """ assert 'manifest' in config.data assert config.data.manifest - assert 'keep_transcription_text' in config.collator - - if isinstance(config.data.augmentation_config, (str, bytes)): - if config.data.augmentation_config: - aug_file = io.open( - config.data.augmentation_config, mode='r', encoding='utf8') - else: - aug_file = io.StringIO(initial_value='{}', newline='') - else: - aug_file = config.data.augmentation_config - assert isinstance(aug_file, io.StringIO) + dataset = cls( manifest_path=config.data.manifest, - unit_type=config.data.unit_type, - vocab_filepath=config.data.vocab_filepath, - mean_std_filepath=config.data.mean_std_filepath, - spm_model_prefix=config.data.spm_model_prefix, - augmentation_config=aug_file.read(), max_input_len=config.data.max_input_len, min_input_len=config.data.min_input_len, max_output_len=config.data.max_output_len, @@ -101,23 +78,8 @@ class ManifestDataset(Dataset): ) return dataset - - def _read_vocab(self, vocab_filepath): - """Load vocabulary from file.""" - vocab_lines = [] - with open(vocab_filepath, 'r', encoding='utf-8') as file: - vocab_lines.extend(file.readlines()) - vocab_list = [line[:-1] for line in vocab_lines] - return vocab_list - - def __init__(self, manifest_path, - unit_type, - vocab_filepath, - 
mean_std_filepath, - spm_model_prefix=None, - augmentation_config='{}', max_input_len=float('inf'), min_input_len=0.0, max_output_len=float('inf'), @@ -128,34 +90,16 @@ class ManifestDataset(Dataset): Args: manifest_path (str): manifest josn file path - unit_type(str): token unit type, e.g. char, word, spm - vocab_filepath (str): vocab file path. - mean_std_filepath (str): mean and std file path, which suffix is *.npy - spm_model_prefix (str): spm model prefix, need if `unit_type` is spm. - augmentation_config (str, optional): augmentation json str. Defaults to '{}'. max_input_len ([type], optional): maximum output seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to float('inf'). min_input_len (float, optional): minimum input seq length, in seconds for raw wav, in frame numbers for feature data. Defaults to 0.0. max_output_len (float, optional): maximum input seq length, in modeling units. Defaults to 500.0. min_output_len (float, optional): minimum input seq length, in modeling units. Defaults to 0.0. max_output_input_ratio (float, optional): maximum output seq length/output seq length ratio. Defaults to 10.0. min_output_input_ratio (float, optional): minimum output seq length/output seq length ratio. Defaults to 0.05. - stride_ms (float, optional): stride size in ms. Defaults to 10.0. - window_ms (float, optional): window size in ms. Defaults to 20.0. - n_fft (int, optional): fft points for rfft. Defaults to None. - max_freq (int, optional): max cut freq. Defaults to None. - target_sample_rate (int, optional): target sample rate which used for training. Defaults to 16000. - specgram_type (str, optional): 'linear', 'mfcc' or 'fbank'. Defaults to 'linear'. - feat_dim (int, optional): audio feature dim, using by 'mfcc' or 'fbank'. Defaults to None. - delta_delta (bool, optional): audio feature with delta-delta, using by 'fbank' or 'mfcc'. Defaults to False. - use_dB_normalization (bool, optional): do dB normalization. Defaults to True. 
- target_dB (int, optional): target dB. Defaults to -20. - random_seed (int, optional): for random generator. Defaults to 0. - keep_transcription_text (bool, optional): True, when not in training mode, will not do tokenizer; Defaults to False. + """ super().__init__() - # self._rng = np.random.RandomState(random_seed) - # read manifest self._manifest = read_manifest( manifest_path=manifest_path, @@ -167,51 +111,6 @@ class ManifestDataset(Dataset): min_output_input_ratio=min_output_input_ratio) self._manifest.sort(key=lambda x: x["feat_shape"][0]) - # self._vocab_list = self._read_vocab(vocab_filepath) - - - # @property - # def manifest(self): - # return self._manifest - - # @property - # def vocab_size(self): - # """Return the vocabulary size. - - # Returns: - # int: Vocabulary size. - # """ - # return len(self._vocab_list) - - # @property - # def vocab_list(self): - # """Return the vocabulary in list. - - # Returns: - # List[str]: - # """ - # return self._vocab_list - - # @property - # def vocab_dict(self): - # """Return the vocabulary in dict. - - # Returns: - # Dict[str, int]: - # """ - # vocab_dict = dict( - # [(token, idx) for (idx, token) in enumerate(self._vocab_list)]) - # return vocab_dict - - # @property - # def feature_size(self): - # """Return the audio feature size. - - # Returns: - # int: audio feature size. 
- # """ - # return self._manifest[0]["feat_shape"][-1] - def __len__(self): return len(self._manifest) diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index b880f858755e1716d4304e4ee475cc8f5190f81b..116c919279134bf7ca7f3aa9c50171ca1488be82 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -3,17 +3,20 @@ data: train_manifest: data/manifest.train dev_manifest: data/manifest.dev test_manifest: data/manifest.test - vocab_filepath: data/vocab.txt - unit_type: 'char' - spm_model_prefix: '' - augmentation_config: conf/augmentation.json - batch_size: 64 min_input_len: 0.5 max_input_len: 20.0 # second min_output_len: 0.0 max_output_len: 400.0 min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 + + +collator: + vocab_filepath: data/vocab.txt + unit_type: 'char' + spm_model_prefix: '' + augmentation_config: conf/augmentation.json + batch_size: 64 raw_wav: True # use raw_wav or kaldi feature specgram_type: fbank #linear, mfcc, fbank feat_dim: 80 @@ -32,7 +35,6 @@ data: shuffle_method: batch_shuffle num_workers: 2 - # network architecture model: cmvn_file: "data/mean_std.json" diff --git a/examples/tiny/s0/conf/deepspeech2.yaml b/examples/tiny/s0/conf/deepspeech2.yaml index 434cf264c2b758711c1dcfa51a7b5cac57a459db..6737d1b75a88fb3b00ca8d7cbb168878de21766b 100644 --- a/examples/tiny/s0/conf/deepspeech2.yaml +++ b/examples/tiny/s0/conf/deepspeech2.yaml @@ -2,10 +2,7 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny - test_manifest: data/manifest.tiny - mean_std_filepath: data/mean_std.json - unit_type: char - vocab_filepath: data/vocab.txt + test_manifest: data/manifest.tiny min_input_len: 0.0 max_input_len: 27.0 min_output_len: 0.0 @@ -15,6 +12,9 @@ data: collator: + mean_std_filepath: data/mean_std.json + unit_type: char + vocab_filepath: data/vocab.txt augmentation_config: conf/augmentation.json random_seed: 0 spm_model_prefix: @@ -43,7 +43,7 @@ 
model: share_rnn_weights: True training: - n_epoch: 23 + n_epoch: 24 lr: 1e-5 lr_decay: 1.0 weight_decay: 1e-06 diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index 5e28e4e87a8e9d5e7b247061b7222d91171bf8c5..250995faadc8b4e668ed717d70b9ebadcdc67b60 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -3,26 +3,20 @@ data: train_manifest: data/manifest.tiny dev_manifest: data/manifest.tiny test_manifest: data/manifest.tiny - vocab_filepath: data/vocab.txt - unit_type: 'spm' - spm_model_prefix: 'data/bpe_unigram_200' - mean_std_filepath: "" - batch_size: 4 min_input_len: 0.5 # second max_input_len: 20.0 # second min_output_len: 0.0 # tokens max_output_len: 400.0 # tokens min_output_input_ratio: 0.05 max_output_input_ratio: 10.0 - raw_wav: True # use raw_wav or kaldi feature - sortagrad: True - shuffle_method: batch_shuffle - num_workers: 0 #2 - + collator: + vocab_filepath: data/vocab.txt + mean_std_filepath: "" augmentation_config: conf/augmentation.json random_seed: 0 - spm_model_prefix: + unit_type: 'spm' + spm_model_prefix: 'data/bpe_unigram_200' specgram_type: fbank feat_dim: 80 delta_delta: False @@ -35,6 +29,12 @@ collator: target_dB: -20 dither: 1.0 keep_transcription_text: False + batch_size: 4 + sortagrad: True + shuffle_method: batch_shuffle + num_workers: 0 #2 + raw_wav: True # use raw_wav or kaldi feature + # network architecture model: