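fix bugs: replace nn.Module/nn.ModuleList with Paddle nn.Layer/nn.LayerList, return the constructed augmentor, drop the train-only guards in AugmentationPipeline, and raise num_workers to 2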

7d133368 · Hui Zhang · 16f4bdc5 · 16f4bdc5 · 7d133368 · 7d133368
12 changed files
--- a/.bashrc
+++ b/.bashrc
-# Locales
-
-export LC_ALL=en_US.UTF-8
-export LANG=en_US.UTF-8
-export LANGUAGE=en_US.UTF-8
-
-# Aliases
-alias nvs="nvidia-smi"
-alias rsync="rsync --progress -raz"
-alias his="history"
--- a/.notebook/u2_confermer_model_wenet.ipynb
+++ b/.notebook/u2_confermer_model_wenet.ipynb
@@ -3431,7 +3431,7 @@
    "        convolution_layer_args = (output_size, cnn_module_kernel, activation,\n",
    "                                  cnn_module_norm, causal)\n",
    "\n",
-    "        self.encoders = nn.ModuleList([\n",
+    "        self.encoders = nn.LayerList([\n",
    "            ConformerEncoderLayer(\n",
    "                size=output_size,\n",
    "                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),\n",

--- a/deepspeech/frontend/augmentor/augmentation.py
+++ b/deepspeech/frontend/augmentor/augmentation.py
@@ -164,8 +164,6 @@ class AugmentationPipeline():
        :param audio_segment: Audio segment to process.
        :type audio_segment: AudioSegmenet|SpeechSegment
        """
-        if not self._train:
-            return
        for augmentor, rate in zip(self._audio_augmentors, self._audio_rates):
            if self._rng.uniform(0., 1.) < rate:
                augmentor.transform_audio(audio_segment)
@@ -176,8 +174,6 @@ class AugmentationPipeline():
        Args:
            spec_segment (np.ndarray): audio feature, (D, T).
        """
-        if not self._train:
-            return
        for augmentor, rate in zip(self._spec_augmentors, self._spec_rates):
            if self._rng.uniform(0., 1.) < rate:
                spec_segment = augmentor.transform_feature(spec_segment)
@@ -217,3 +213,4 @@ class AugmentationPipeline():
            obj = class_obj(self._rng, **params)
        except Exception:
            raise ValueError("Unknown augmentor type [%s]." % augmentor_type)
+        return obj
--- a/deepspeech/io/dataset.py
+++ b/deepspeech/io/dataset.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 from typing import Optional

+import numpy as np
 from paddle.io import Dataset
 from yacs.config import CfgNode


--- a/deepspeech/models/ds2/rnn.py
+++ b/deepspeech/models/ds2/rnn.py
@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """

--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -54,7 +54,7 @@ __all__ = ["U2Model", "U2InferModel"]
 logger = Log(__name__).getlog()


-class U2BaseModel(nn.Module):
+class U2BaseModel(nn.Layer):
    """CTC-Attention hybrid Encoder-Decoder model"""

    @classmethod

--- a/deepspeech/models/u2_st.py
+++ b/deepspeech/models/u2_st.py
@@ -48,7 +48,7 @@ __all__ = ["U2STModel", "U2STInferModel"]
 logger = Log(__name__).getlog()


-class U2STBaseModel(nn.Module):
+class U2STBaseModel(nn.Layer):
    """CTC-Attention hybrid Encoder-Decoder model"""

    @classmethod

--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@@ -33,7 +33,7 @@ logger = Log(__name__).getlog()
 __all__ = ["TransformerDecoder"]


-class TransformerDecoder(nn.Module):
+class TransformerDecoder(nn.Layer):
    """Base class of Transfomer decoder module.
    Args:
        vocab_size: output dim
@@ -86,7 +86,7 @@ class TransformerDecoder(nn.Module):
        self.use_output_layer = use_output_layer
        self.output_layer = nn.Linear(attention_dim, vocab_size)

-        self.decoders = nn.ModuleList([
+        self.decoders = nn.LayerList([
            DecoderLayer(
                size=attention_dim,
                self_attn=MultiHeadedAttention(attention_heads, attention_dim,

--- a/deepspeech/modules/decoder_layer.py
+++ b/deepspeech/modules/decoder_layer.py
@@ -25,15 +25,15 @@ logger = Log(__name__).getlog()
 __all__ = ["DecoderLayer"]


-class DecoderLayer(nn.Module):
+class DecoderLayer(nn.Layer):
    """Single decoder layer module.
    Args:
        size (int): Input dimension.
-        self_attn (nn.Module): Self-attention module instance.
+        self_attn (nn.Layer): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
-        src_attn (nn.Module): Self-attention module instance.
+        src_attn (nn.Layer): Self-attention module instance.
            `MultiHeadedAttention` instance can be used as the argument.
-        feed_forward (nn.Module): Feed-forward module instance.
+        feed_forward (nn.Layer): Feed-forward module instance.
            `PositionwiseFeedForward` instance can be used as the argument.
        dropout_rate (float): Dropout rate.
        normalize_before (bool):
@@ -48,9 +48,9 @@ class DecoderLayer(nn.Module):
    def __init__(
            self,
            size: int,
-            self_attn: nn.Module,
-            src_attn: nn.Module,
-            feed_forward: nn.Module,
+            self_attn: nn.Layer,
+            src_attn: nn.Layer,
+            feed_forward: nn.Layer,
            dropout_rate: float,
            normalize_before: bool=True,
            concat_after: bool=False, ):

--- a/deepspeech/modules/encoder.py
+++ b/deepspeech/modules/encoder.py
@@ -358,7 +358,7 @@ class TransformerEncoder(BaseEncoder):
                         pos_enc_layer_type, normalize_before, concat_after,
                         static_chunk_size, use_dynamic_chunk, global_cmvn,
                         use_dynamic_left_chunk)
-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
            TransformerEncoderLayer(
                size=output_size,
                self_attn=MultiHeadedAttention(attention_heads, output_size,
@@ -438,7 +438,7 @@ class ConformerEncoder(BaseEncoder):
        convolution_layer_args = (output_size, cnn_module_kernel, activation,
                                  cnn_module_norm, causal)

-        self.encoders = nn.ModuleList([
+        self.encoders = nn.LayerList([
            ConformerEncoderLayer(
                size=output_size,
                self_attn=encoder_selfattn_layer(*encoder_selfattn_layer_args),

--- a/deepspeech/modules/rnn.py
+++ b/deepspeech/modules/rnn.py
@@ -297,7 +297,7 @@ class RNNStack(nn.Layer):
                        share_weights=share_rnn_weights))
            i_size = h_size * 2

-        self.rnn_stacks = nn.ModuleList(rnn_stacks)
+        self.rnn_stacks = nn.LayerList(rnn_stacks)

    def forward(self, x: paddle.Tensor, x_len: paddle.Tensor):
        """

--- a/examples/librispeech/s0/conf/deepspeech2.yaml
+++ b/examples/librispeech/s0/conf/deepspeech2.yaml
@@ -32,7 +32,7 @@ collator:
  keep_transcription_text: False
  sortagrad: True 
  shuffle_method: batch_shuffle
-  num_workers: 0
+  num_workers: 2

 model:
  num_conv_layers: 2