add more ctc conf

890a28f9 · Hui Zhang · 41ed7a18 · 890a28f9 · 890a28f9 · 890a28f9
18 changed files
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -661,9 +661,7 @@ class U2BaseModel(nn.Layer):
            xs, offset, required_cache_size, subsampling_cache,
            elayers_output_cache, conformer_cnn_cache)

-    # @jit.to_static([
-    #         paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'),  # audio feat, [B,T,D]
-    #     ])
+    # @jit.to_static
    def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
        """ Export interface for c++ call, apply linear transform and log
            softmax before ctc
@@ -830,6 +828,7 @@ class U2Model(U2BaseModel):
        Returns:
            int, nn.Layer, nn.Layer, nn.Layer: vocab size, encoder, decoder, ctc
        """
+        # cmvn
        if configs['cmvn_file'] is not None:
            mean, istd = load_cmvn(configs['cmvn_file'],
                                   configs['cmvn_file_type'])
@@ -839,11 +838,13 @@ class U2Model(U2BaseModel):
        else:
            global_cmvn = None

+        # input & output dim
        input_dim = configs['input_dim']
        vocab_size = configs['output_dim']
        assert input_dim != 0, input_dim
        assert vocab_size != 0, vocab_size

+        # encoder
        encoder_type = configs.get('encoder', 'transformer')
        logger.info(f"U2 Encoder type: {encoder_type}")
        if encoder_type == 'transformer':
@@ -855,17 +856,21 @@ class U2Model(U2BaseModel):
        else:
            raise ValueError(f"not support encoder type:{encoder_type}")

+        # decoder
        decoder = TransformerDecoder(vocab_size,
                                     encoder.output_size(),
                                     **configs['decoder_conf'])
+
+        # ctc decoder and ctc loss
+        model_conf = configs['model_conf']
        ctc = CTCDecoder(
            odim=vocab_size,
            enc_n_units=encoder.output_size(),
            blank_id=0,
-            dropout_rate=0.0,
+            dropout_rate=model_conf['ctc_dropout_rate'],
            reduction=True,  # sum
            batch_average=True,  # sum / batch_size
-            grad_norm_type='instance')
+            grad_norm_type=model_conf['ctc_grad_norm_type'])

        return vocab_size, encoder, decoder, ctc


--- a/deepspeech/models/u2_st.py
+++ b/deepspeech/models/u2_st.py
@@ -413,26 +413,26 @@ class U2STBaseModel(nn.Layer):
        best_hyps = best_hyps[:, 1:]
        return best_hyps

-    @jit.to_static
+    # @jit.to_static
    def subsampling_rate(self) -> int:
        """ Export interface for c++ call, return subsampling_rate of the
            model
        """
        return self.encoder.embed.subsampling_rate

-    @jit.to_static
+    # @jit.to_static
    def right_context(self) -> int:
        """ Export interface for c++ call, return right_context of the model
        """
        return self.encoder.embed.right_context

-    @jit.to_static
+    # @jit.to_static
    def sos_symbol(self) -> int:
        """ Export interface for c++ call, return sos symbol id of the model
        """
        return self.sos

-    @jit.to_static
+    # @jit.to_static
    def eos_symbol(self) -> int:
        """ Export interface for c++ call, return eos symbol id of the model
        """
@@ -468,7 +468,7 @@ class U2STBaseModel(nn.Layer):
            xs, offset, required_cache_size, subsampling_cache,
            elayers_output_cache, conformer_cnn_cache)

-    @jit.to_static
+    # @jit.to_static
    def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
        """ Export interface for c++ call, apply linear transform and log
            softmax before ctc
@@ -643,14 +643,16 @@ class U2STModel(U2STBaseModel):
            decoder = TransformerDecoder(vocab_size,
                                         encoder.output_size(),
                                         **configs['decoder_conf'])
+            # ctc decoder and ctc loss
+            model_conf = configs['model_conf']
            ctc = CTCDecoder(
                odim=vocab_size,
                enc_n_units=encoder.output_size(),
                blank_id=0,
-                dropout_rate=0.0,
+                dropout_rate=model_conf['ctc_dropout_rate'],
                reduction=True,  # sum
                batch_average=True,  # sum / batch_size
-                grad_norm_type='instance')
+                grad_norm_type=model_conf['ctc_grad_norm_type'])

            return vocab_size, encoder, (st_decoder, decoder, ctc)
        else:

--- a/examples/aishell/s1/conf/chunk_conformer.yaml
+++ b/examples/aishell/s1/conf/chunk_conformer.yaml
@@ -76,6 +76,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/aishell/s1/conf/conformer.yaml
+++ b/examples/aishell/s1/conf/conformer.yaml
@@ -71,6 +71,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/librispeech/s1/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_conformer.yaml
@@ -76,6 +76,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/librispeech/s1/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s1/conf/chunk_transformer.yaml
@@ -69,6 +69,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/librispeech/s1/conf/conformer.yaml
+++ b/examples/librispeech/s1/conf/conformer.yaml
@@ -72,6 +72,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/librispeech/s2/conf/chunk_conformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_conformer.yaml
@@ -76,6 +76,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/librispeech/s2/conf/chunk_transformer.yaml
+++ b/examples/librispeech/s2/conf/chunk_transformer.yaml
@@ -69,6 +69,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/librispeech/s2/conf/conformer.yaml
+++ b/examples/librispeech/s2/conf/conformer.yaml
@@ -72,6 +72,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/librispeech/s2/conf/transformer.yaml
+++ b/examples/librispeech/s2/conf/transformer.yaml
@@ -58,6 +58,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/ted_en_zh/t0/conf/transformer.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer.yaml
@@ -68,6 +68,8 @@ model:
    model_conf:
        asr_weight: 0.0
        ctc_weight: 0.0
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
+++ b/examples/ted_en_zh/t0/conf/transformer_joint_noam.yaml
@@ -68,6 +68,8 @@ model:
    model_conf:
        asr_weight: 0.5
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/timit/s1/conf/transformer.yaml
+++ b/examples/timit/s1/conf/transformer.yaml
@@ -66,6 +66,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/tiny/s1/conf/chunk_confermer.yaml
+++ b/examples/tiny/s1/conf/chunk_confermer.yaml
@@ -76,6 +76,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/tiny/s1/conf/chunk_transformer.yaml
+++ b/examples/tiny/s1/conf/chunk_transformer.yaml
@@ -69,6 +69,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/tiny/s1/conf/conformer.yaml
+++ b/examples/tiny/s1/conf/conformer.yaml
@@ -72,6 +72,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false


--- a/examples/tiny/s1/conf/transformer.yaml
+++ b/examples/tiny/s1/conf/transformer.yaml
@@ -66,6 +66,8 @@ model:
    # hybrid CTC/attention
    model_conf:
        ctc_weight: 0.3
+        ctc_dropout_rate: 0.0
+        ctc_grad_norm_type: instance
        lsm_weight: 0.1     # label smoothing option
        length_normalized_loss: false