diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py index a01766da64e24b00835264022f092240cacfc8f0..7061d173fad39663e571407b5ed8e8f266103f75 100644 --- a/deepspeech/models/u2.py +++ b/deepspeech/models/u2.py @@ -661,9 +661,7 @@ class U2BaseModel(nn.Layer): xs, offset, required_cache_size, subsampling_cache, elayers_output_cache, conformer_cnn_cache) - # @jit.to_static([ - # paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D] - # ]) + # @jit.to_static def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: """ Export interface for c++ call, apply linear transform and log softmax before ctc @@ -830,6 +828,7 @@ class U2Model(U2BaseModel): Returns: int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc """ + # cmvn if configs['cmvn_file'] is not None: mean, istd = load_cmvn(configs['cmvn_file'], configs['cmvn_file_type']) @@ -839,11 +838,13 @@ class U2Model(U2BaseModel): else: global_cmvn = None + # input & output dim input_dim = configs['input_dim'] vocab_size = configs['output_dim'] assert input_dim != 0, input_dim assert vocab_size != 0, vocab_size + # encoder encoder_type = configs.get('encoder', 'transformer') logger.info(f"U2 Encoder type: {encoder_type}") if encoder_type == 'transformer': @@ -855,17 +856,21 @@ class U2Model(U2BaseModel): else: raise ValueError(f"not support encoder type:{encoder_type}") + # decoder decoder = TransformerDecoder(vocab_size, encoder.output_size(), **configs['decoder_conf']) + + # ctc decoder and ctc loss + model_conf = configs['model_conf'] ctc = CTCDecoder( odim=vocab_size, enc_n_units=encoder.output_size(), blank_id=0, - dropout_rate=0.0, + dropout_rate=model_conf['ctc_dropout_rate'], reduction=True, # sum batch_average=True, # sum / batch_size - grad_norm_type='instance') + grad_norm_type=model_conf['ctc_grad_norm_type']) return vocab_size, encoder, decoder, ctc diff --git a/deepspeech/models/u2_st.py b/deepspeech/models/u2_st.py index 
7dae3745bb97ee17633fd63d3c586a1f6987b86a..6737a549d978bb1f74a518140ec4e5917328ad1a 100644 --- a/deepspeech/models/u2_st.py +++ b/deepspeech/models/u2_st.py @@ -413,26 +413,26 @@ class U2STBaseModel(nn.Layer): best_hyps = best_hyps[:, 1:] return best_hyps - @jit.to_static + # @jit.to_static def subsampling_rate(self) -> int: """ Export interface for c++ call, return subsampling_rate of the model """ return self.encoder.embed.subsampling_rate - @jit.to_static + # @jit.to_static def right_context(self) -> int: """ Export interface for c++ call, return right_context of the model """ return self.encoder.embed.right_context - @jit.to_static + # @jit.to_static def sos_symbol(self) -> int: """ Export interface for c++ call, return sos symbol id of the model """ return self.sos - @jit.to_static + # @jit.to_static def eos_symbol(self) -> int: """ Export interface for c++ call, return eos symbol id of the model """ @@ -468,7 +468,7 @@ class U2STBaseModel(nn.Layer): xs, offset, required_cache_size, subsampling_cache, elayers_output_cache, conformer_cnn_cache) - @jit.to_static + # @jit.to_static def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor: """ Export interface for c++ call, apply linear transform and log softmax before ctc @@ -643,14 +643,16 @@ class U2STModel(U2STBaseModel): decoder = TransformerDecoder(vocab_size, encoder.output_size(), **configs['decoder_conf']) + # ctc decoder and ctc loss + model_conf = configs['model_conf'] ctc = CTCDecoder( odim=vocab_size, enc_n_units=encoder.output_size(), blank_id=0, - dropout_rate=0.0, + dropout_rate=model_conf['ctc_dropout_rate'], reduction=True, # sum batch_average=True, # sum / batch_size - grad_norm_type='instance') + grad_norm_type=model_conf['ctc_grad_norm_type']) return vocab_size, encoder, (st_decoder, decoder, ctc) else: diff --git a/examples/aishell/s1/conf/chunk_conformer.yaml b/examples/aishell/s1/conf/chunk_conformer.yaml index 
3e606788ef86eee601233fab2039923b1ee8cb34..6f8ae135f6210757208dde85cad85e5ee776f381 100644 --- a/examples/aishell/s1/conf/chunk_conformer.yaml +++ b/examples/aishell/s1/conf/chunk_conformer.yaml @@ -76,6 +76,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/aishell/s1/conf/conformer.yaml b/examples/aishell/s1/conf/conformer.yaml index 4b1430c58848a5cac7303518021a6256b52d525d..a4248459c261d442a3e23e18cf538927cda5236a 100644 --- a/examples/aishell/s1/conf/conformer.yaml +++ b/examples/aishell/s1/conf/conformer.yaml @@ -71,6 +71,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/librispeech/s1/conf/chunk_conformer.yaml b/examples/librispeech/s1/conf/chunk_conformer.yaml index 0de1aefee5c4397c74ed2b382ceb4471baadd9c5..92db20f6688daea729cec7000e8ccbe6497ecb03 100644 --- a/examples/librispeech/s1/conf/chunk_conformer.yaml +++ b/examples/librispeech/s1/conf/chunk_conformer.yaml @@ -76,6 +76,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/librispeech/s1/conf/chunk_transformer.yaml b/examples/librispeech/s1/conf/chunk_transformer.yaml index f782a0373680077eab678375b3677c5cfb3340f2..e0bc3135e0a75509da68561079127793a73b41a8 100644 --- a/examples/librispeech/s1/conf/chunk_transformer.yaml +++ b/examples/librispeech/s1/conf/chunk_transformer.yaml @@ -69,6 +69,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/librispeech/s1/conf/conformer.yaml 
b/examples/librispeech/s1/conf/conformer.yaml index 6d825f05b4b1376f8ef9eda8551acc515947dfb2..78be249cb72e2b72f158cdb5f0187f64097edcf3 100644 --- a/examples/librispeech/s1/conf/conformer.yaml +++ b/examples/librispeech/s1/conf/conformer.yaml @@ -72,6 +72,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/librispeech/s2/conf/chunk_conformer.yaml b/examples/librispeech/s2/conf/chunk_conformer.yaml index 0de1aefee5c4397c74ed2b382ceb4471baadd9c5..92db20f6688daea729cec7000e8ccbe6497ecb03 100644 --- a/examples/librispeech/s2/conf/chunk_conformer.yaml +++ b/examples/librispeech/s2/conf/chunk_conformer.yaml @@ -76,6 +76,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/librispeech/s2/conf/chunk_transformer.yaml b/examples/librispeech/s2/conf/chunk_transformer.yaml index f782a0373680077eab678375b3677c5cfb3340f2..e0bc3135e0a75509da68561079127793a73b41a8 100644 --- a/examples/librispeech/s2/conf/chunk_transformer.yaml +++ b/examples/librispeech/s2/conf/chunk_transformer.yaml @@ -69,6 +69,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/librispeech/s2/conf/conformer.yaml b/examples/librispeech/s2/conf/conformer.yaml index 955b6108b8b4861dbc85db97e07f821d3333600f..9a72741350219b3455388d73d354ba9004890530 100644 --- a/examples/librispeech/s2/conf/conformer.yaml +++ b/examples/librispeech/s2/conf/conformer.yaml @@ -72,6 +72,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false 
diff --git a/examples/librispeech/s2/conf/transformer.yaml b/examples/librispeech/s2/conf/transformer.yaml index 4c60913e937bbd95bcd391e98a2b69a1242f63a6..edf5b81dc3a0c5061f76921e50b8095a5515c9d3 100644 --- a/examples/librispeech/s2/conf/transformer.yaml +++ b/examples/librispeech/s2/conf/transformer.yaml @@ -58,6 +58,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/ted_en_zh/t0/conf/transformer.yaml b/examples/ted_en_zh/t0/conf/transformer.yaml index 755e044614f921037bf16b8e65458ed15c85b3e3..1aad86d22f02afc68d9eba0c0cb76406873fbfc4 100644 --- a/examples/ted_en_zh/t0/conf/transformer.yaml +++ b/examples/ted_en_zh/t0/conf/transformer.yaml @@ -68,6 +68,8 @@ model: model_conf: asr_weight: 0.0 ctc_weight: 0.0 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml index bc1f8890d9d784053ce1ed84c9cbb8675f39782b..0144c40d4bf0f2b66bafefbda41acbc1708be170 100644 --- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml +++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml @@ -68,6 +68,8 @@ model: model_conf: asr_weight: 0.5 ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/timit/s1/conf/transformer.yaml b/examples/timit/s1/conf/transformer.yaml index eb191d0b25f703c4e95aa79b750fcdaf8962be41..c3b519968822b888693b8cfb302a3fb7ad4156a6 100644 --- a/examples/timit/s1/conf/transformer.yaml +++ b/examples/timit/s1/conf/transformer.yaml @@ -66,6 +66,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option 
length_normalized_loss: false diff --git a/examples/tiny/s1/conf/chunk_confermer.yaml b/examples/tiny/s1/conf/chunk_confermer.yaml index 96da3d9fb2f2671cc20b7454727ea1ec77ee0630..be2e82f9e675e484f28ef0aead5306e36eefbc33 100644 --- a/examples/tiny/s1/conf/chunk_confermer.yaml +++ b/examples/tiny/s1/conf/chunk_confermer.yaml @@ -76,6 +76,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/tiny/s1/conf/chunk_transformer.yaml b/examples/tiny/s1/conf/chunk_transformer.yaml index 1adb91c46ceded8d646c8a028c73e8a74b8e8fed..93439a85782f8cb08cc3040ae3a80625d9572117 100644 --- a/examples/tiny/s1/conf/chunk_transformer.yaml +++ b/examples/tiny/s1/conf/chunk_transformer.yaml @@ -69,6 +69,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/tiny/s1/conf/conformer.yaml b/examples/tiny/s1/conf/conformer.yaml index b40e77e37993990b51b9817b9b3a68be14a65618..9bb67c44e54a968d2fb6e891592aaf0d0a2e3db4 100644 --- a/examples/tiny/s1/conf/conformer.yaml +++ b/examples/tiny/s1/conf/conformer.yaml @@ -72,6 +72,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false diff --git a/examples/tiny/s1/conf/transformer.yaml b/examples/tiny/s1/conf/transformer.yaml index fd5adbdee359e8bc46185a3836d6d91ad0160795..5127e8c66821a866db27e383a7b8aa91dbab0a0b 100644 --- a/examples/tiny/s1/conf/transformer.yaml +++ b/examples/tiny/s1/conf/transformer.yaml @@ -66,6 +66,8 @@ model: # hybrid CTC/attention model_conf: ctc_weight: 0.3 + ctc_dropout_rate: 0.0 + ctc_grad_norm_type: instance lsm_weight: 0.1 # label smoothing option length_normalized_loss: false