add some comments in code

9874fb7d · xiongxinlei · b9eafddd · 9874fb7d · 9874fb7d · 9874fb7d
3 changed file
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -19,34 +19,28 @@ from typing import List
 from typing import Optional
 from typing import Union

-import librosa
-import numpy as np
 import paddle
 import soundfile
 from yacs.config import CfgNode

-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
-from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
-from paddlespeech.vector.io.batch import feature_normalize
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification

 pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "EcapaTdnn_voxceleb12-16k".
+    # e.g. "ecapatdnn_voxceleb12-16k".
    # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
-    "EcapaTdnn_voxceleb12-16k": {
+    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12 --sample_rate 16000 --input ./input.wav"
+    "ecapatdnn_voxceleb12-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
        'md5':
@@ -59,7 +53,7 @@ pretrained_models = {
 }

 model_alias = {
-    "EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
 }


@@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
        self.parser.add_argument(
            "--model",
            type=str,
-            default="EcapaTdnn_voxceleb12",
-            choices=["EcapaTdnn_voxceleb12"],
+            default="ecapatdnn_voxceleb12",
+            choices=["ecapatdnn_voxceleb12"],
            help="Choose model type of asr task.")
        self.parser.add_argument(
            "--task",
@@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
            "--sample_rate",
            type=int,
            default=16000,
-            choices=[16000, 8000],
+            choices=[16000],
            help="Choose the audio sample rate of the model. 8000 or 16000")
        self.parser.add_argument(
            "--ckpt_path",
@@ -175,7 +169,7 @@ class VectorExecutor(BaseExecutor):
    @stats_wrapper
    def __call__(self,
                 audio_file: os.PathLike,
-                 model: str='EcapaTdnn-voxceleb12',
+                 model: str='ecapatdnn-voxceleb12',
                 sample_rate: int=16000,
                 config: os.PathLike=None,
                 ckpt_path: os.PathLike=None,
@@ -197,9 +191,9 @@ class VectorExecutor(BaseExecutor):
    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        support_models = list(pretrained_models.keys())
        assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported, \
-            please choose other models.\n \
-            The support models includes \n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+            'The model "{}" you want to use has not been supported,'\
+            'please choose other models.\n' \
+            'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))

        res_path = os.path.join(MODEL_HOME, tag)
        decompressed_path = download_and_decompress(pretrained_models[tag],
@@ -212,7 +206,7 @@ class VectorExecutor(BaseExecutor):
        return decompressed_path

    def _init_from_path(self,
-                        model_type: str='EcapaTdnn_voxceleb12',
+                        model_type: str='ecapatdnn_voxceleb12',
                        sample_rate: int=16000,
                        cfg_path: Optional[os.PathLike]=None,
                        ckpt_path: Optional[os.PathLike]=None):
@@ -228,8 +222,10 @@ class VectorExecutor(BaseExecutor):
            res_path = self._get_pretrained_path(tag)
            self.res_path = res_path

-            self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+            self.cfg_path = os.path.join(res_path,
+                                         pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@@ -239,7 +235,7 @@ class VectorExecutor(BaseExecutor):
        logger.info(f"start to read the ckpt from {self.ckpt_path}")
        logger.info(f"read the config from {self.cfg_path}")
        logger.info(f"get the res path {self.res_path}")
-        
+
        # stage 2: read and config and init the model body
        self.config = CfgNode(new_allowed=True)
        self.config.merge_from_file(self.cfg_path)
@@ -269,7 +265,7 @@ class VectorExecutor(BaseExecutor):

        feats = self._inputs["feats"]
        lengths = self._inputs["lengths"]
-        logger.info(f"start to do backbone network model forward")
+        logger.info("start to do backbone network model forward")
        logger.info(
            f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")
        # embedding from (1, emb_size, 1) -> (emb_size)

--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy
 import numpy as np
 import paddle
-import numpy
+

 def waveform_collate_fn(batch):
    waveforms = np.stack([item['feat'] for item in batch])
@@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):

    return np.pad(x, pad_width, mode=mode, **kwargs)

+
 def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
    ids = [item['id'] for item in batch]
    lengths = np.asarray([item['feat'].shape[1] for item in batch])
@@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
    """
    assert len(target_shape) == array.ndim
    pads = []  # this contains the abs length of the padding for each dimension.
-    valid_vals = []  # thic contains the relative lengths for each dimension.
-    i = 0 # iterating over target_shape ndims
+    valid_vals = []  # this contains the relative lengths for each dimension.
+    i = 0  # iterating over target_shape ndims
    while i < len(target_shape):
-        assert (
-            target_shape[i] >= array.shape[i]
-        ), "Target shape must be >= original shape for every dim"
+        assert (target_shape[i] >= array.shape[i]
+                ), "Target shape must be >= original shape for every dim"
        pads.append([0, target_shape[i] - array.shape[i]])
        valid_vals.append(array.shape[i] / target_shape[i])
        i += 1
@@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
        # if there is only one array in the batch we simply unsqueeze it.
        return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])

-    if not (
-        any(
-            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
-        )
-    ):
+    if not (any(
+        [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])):
        raise IndexError("All arrays must have same number of dimensions")

    # FIXME we limit the support here: we allow padding of only the last dimension
@@ -149,11 +147,9 @@ def batch_pad_right(arrays, mode="constant", value=0):
    for dim in range(arrays[0].ndim):
        if dim != (arrays[0].ndim - 1):
            if not all(
-                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
-            ):
+                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
                raise EnvironmentError(
-                    "arrays should have same dimensions except for last one"
-                )
+                    "arrays should have same dimensions except for last one")
        max_shape.append(max([x.shape[dim] for x in arrays]))

    batched = []
@@ -161,8 +157,7 @@ def batch_pad_right(arrays, mode="constant", value=0):
    for t in arrays:
        # for each array we apply pad_right_to
        padded, valid_percent = pad_right_to(
-            t, max_shape, mode=mode, value=value
-        )
+            t, max_shape, mode=mode, value=value)
        batched.append(padded)
        valid.append(valid_percent[-1])


--- a/paddlespeech/vector/modules/sid_model.py
+++ b/paddlespeech/vector/modules/sid_model.py
@@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
            lin_blocks=0,
            lin_neurons=192,
            dropout=0.1, ):
-        """_summary_
+        """The speaker identification model, which includes the speaker backbone network
+           and a linear transform to the speaker class number used in training

        Args:
            backbone (Paddle.nn.Layer class): the speaker identification backbone network model
@@ -41,7 +42,7 @@ class SpeakerIdetification(nn.Layer):
            self.dropout = nn.Dropout(dropout)
        else:
            self.dropout = None
-        
+
        # construct the speaker classifer
        input_size = self.backbone.emb_size
        self.blocks = nn.LayerList()
@@ -63,14 +64,14 @@ class SpeakerIdetification(nn.Layer):
           including the speaker embedding model and the classifier model network

        Args:
-            x (Paddle.Tensor): input audio feats, 
+            x (paddle.Tensor): input audio feats, 
                               shape=[batch, dimension, times]
-            lengths (_type_, optional): input audio length.
+            lengths (paddle.Tensor, optional): input audio length.
                                        shape=[batch, times]
                                        Defaults to None.

        Returns:
-            _type_: _description_
+            paddle.Tensor: return the logits of the feats
        """
        # x.shape: (N, C, L)
        x = self.backbone(x, lengths).squeeze(