diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index c942c85029a6d3ab257ccb86a93d2b0f4179b0eb..f1a0e79c57713270e1af131e5f43d81b3c88fd93 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -19,34 +19,28 @@ from typing import List
 from typing import Optional
 from typing import Union
 
-import librosa
-import numpy as np
 import paddle
 import soundfile
 from yacs.config import CfgNode
 
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
-from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
-from paddlespeech.vector.io.batch import feature_normalize
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 
 pretrained_models = {
     # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "EcapaTdnn_voxceleb12-16k".
+    # e.g. "ecapatdnn_voxceleb12-16k".
     # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
-    "EcapaTdnn_voxceleb12-16k": {
+    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
+    "ecapatdnn_voxceleb12-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
         'md5':
@@ -59,7 +53,7 @@ pretrained_models = {
 }
 
 model_alias = {
-    "EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
 }
 
 
@@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
         self.parser.add_argument(
             "--model",
             type=str,
-            default="EcapaTdnn_voxceleb12",
-            choices=["EcapaTdnn_voxceleb12"],
+            default="ecapatdnn_voxceleb12",
+            choices=["ecapatdnn_voxceleb12"],
             help="Choose model type of asr task.")
         self.parser.add_argument(
             "--task",
@@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
             "--sample_rate",
             type=int,
             default=16000,
-            choices=[16000, 8000],
+            choices=[16000],
             help="Choose the audio sample rate of the model. 8000 or 16000")
         self.parser.add_argument(
             "--ckpt_path",
@@ -175,7 +169,7 @@
     @stats_wrapper
     def __call__(self,
                  audio_file: os.PathLike,
-                 model: str='EcapaTdnn-voxceleb12',
+                 model: str='ecapatdnn-voxceleb12',
                  sample_rate: int=16000,
                  config: os.PathLike=None,
                  ckpt_path: os.PathLike=None,
@@ -197,9 +191,9 @@
     def _get_pretrained_path(self, tag: str) -> os.PathLike:
         support_models = list(pretrained_models.keys())
         assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported, \
-                please choose other models.\n \
-                The support models includes \n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+            'The model "{}" you want to use has not been supported, '\
+            'please choose other models.\n' \
+            'The supported models include\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
 
         res_path = os.path.join(MODEL_HOME, tag)
         decompressed_path = download_and_decompress(pretrained_models[tag],
@@ -212,7 +206,7 @@
         return decompressed_path
 
     def _init_from_path(self,
-                        model_type: str='EcapaTdnn_voxceleb12',
+                        model_type: str='ecapatdnn_voxceleb12',
                         sample_rate: int=16000,
                         cfg_path: Optional[os.PathLike]=None,
                         ckpt_path: Optional[os.PathLike]=None):
@@ -228,8 +222,10 @@
             res_path = self._get_pretrained_path(tag)
             self.res_path = res_path
 
-            self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+            self.cfg_path = os.path.join(res_path,
+                                         pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@@ -239,7 +235,7 @@
         logger.info(f"start to read the ckpt from {self.ckpt_path}")
         logger.info(f"read the config from {self.cfg_path}")
         logger.info(f"get the res path {self.res_path}")
-        
+
         # stage 2: read and config and init the model body
         self.config = CfgNode(new_allowed=True)
         self.config.merge_from_file(self.cfg_path)
@@ -269,7 +265,7 @@
         feats = self._inputs["feats"]
         lengths = self._inputs["lengths"]
 
-        logger.info(f"start to do backbone network model forward")
+        logger.info("start to do backbone network model forward")
         logger.info(
             f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")
         # embedding from (1, emb_size, 1) -> (emb_size)
diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py
index 25522ebb234dceae47d4b7b9d8b6602470d5daeb..92ca990cf2dd83f6a22127e15b50885e6809c21f 100644
--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy
 import numpy as np
 import paddle
-import numpy
+
 
 def waveform_collate_fn(batch):
     waveforms = np.stack([item['feat'] for item in batch])
@@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
     return np.pad(x, pad_width, mode=mode, **kwargs)
 
 
+
 def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
     ids = [item['id'] for item in batch]
     lengths = np.asarray([item['feat'].shape[1] for item in batch])
@@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
     """
     assert len(target_shape) == array.ndim
     pads = []  # this contains the abs length of the padding for each dimension.
-    valid_vals = []  # thic contains the relative lengths for each dimension.
-    i = 0  # iterating over target_shape ndims
+    valid_vals = []  # this contains the relative lengths for each dimension.
+    i = 0  # iterating over target_shape ndims
     while i < len(target_shape):
-        assert (
-            target_shape[i] >= array.shape[i]
-        ), "Target shape must be >= original shape for every dim"
+        assert (target_shape[i] >= array.shape[i]
+                ), "Target shape must be >= original shape for every dim"
         pads.append([0, target_shape[i] - array.shape[i]])
         valid_vals.append(array.shape[i] / target_shape[i])
         i += 1
@@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
         # if there is only one array in the batch we simply unsqueeze it.
         return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
 
-    if not (
-        any(
-            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
-        )
-    ):
+    if not (any(
+        [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])):
         raise IndexError("All arrays must have same number of dimensions")
 
     # FIXME we limit the support here: we allow padding of only the last dimension
@@ -149,11 +147,9 @@
     for dim in range(arrays[0].ndim):
         if dim != (arrays[0].ndim - 1):
             if not all(
-                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
-            ):
+                    [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
                 raise EnvironmentError(
-                    "arrays should have same dimensions except for last one"
-                )
+                    "arrays should have same dimensions except for last one")
         max_shape.append(max([x.shape[dim] for x in arrays]))
 
     batched = []
@@ -161,8 +157,7 @@
     for t in arrays:
         # for each array we apply pad_right_to
         padded, valid_percent = pad_right_to(
-            t, max_shape, mode=mode, value=value
-        )
+            t, max_shape, mode=mode, value=value)
         batched.append(padded)
         valid.append(valid_percent[-1])
diff --git a/paddlespeech/vector/modules/sid_model.py b/paddlespeech/vector/modules/sid_model.py
index dc13b2e02b4309ea8b57d15c53634b864b4d8db2..4045f75d1286bf2efc5b9a27f9cef25d715a8690 100644
--- a/paddlespeech/vector/modules/sid_model.py
+++ b/paddlespeech/vector/modules/sid_model.py
@@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
                  lin_blocks=0,
                  lin_neurons=192,
                  dropout=0.1, ):
-        """_summary_
+        """The speaker identification model, which includes the speaker backbone network
+           and a linear transform to the speaker class num in training
 
         Args:
             backbone (Paddle.nn.Layer class): the speaker identification backbone network model
@@ -41,7 +42,7 @@
             self.dropout = nn.Dropout(dropout)
         else:
             self.dropout = None
-        
+
         # construct the speaker classifer
         input_size = self.backbone.emb_size
         self.blocks = nn.LayerList()
@@ -63,14 +64,14 @@
                   including the speaker embedding model and the classifier model network
 
         Args:
-            x (Paddle.Tensor): input audio feats,
+            x (paddle.Tensor): input audio feats,
                                shape=[batch, dimension, times]
-            lengths (_type_, optional): input audio length.
+            lengths (paddle.Tensor, optional): input audio length.
                                shape=[batch, times]
                                Defaults to None.
 
         Returns:
-            _type_: _description_
+            paddle.Tensor: return the logits of the feats
         """
         # x.shape: (N, C, L)
         x = self.backbone(x, lengths).squeeze(
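
Note: a minimal usage sketch of the renamed tag from the Python side. Only the lowercase "ecapatdnn_voxceleb12" tag and the 16 kHz restriction come from this patch; the VectorExecutor import path and the device keyword are assumed from the PaddleSpeech README, and ./input.wav is a placeholder:

# Hedged sketch: assumes the README-style VectorExecutor Python API.
import paddle
from paddlespeech.cli.vector import VectorExecutor

vector_executor = VectorExecutor()
audio_emb = vector_executor(
    model='ecapatdnn_voxceleb12',  # was "EcapaTdnn_voxceleb12" before this patch
    sample_rate=16000,             # 8000 is no longer a valid --sample_rate choice
    config=None,                   # None -> use the config bundled with the model
    ckpt_path=None,                # None -> use the downloaded pretrained checkpoint
    audio_file='./input.wav',      # placeholder path
    device=paddle.get_device())
# The executor squeezes the (1, emb_size, 1) output down to a 1-D embedding.
print(audio_emb.shape)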
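Note: a hypothetical demo of the pad_right_to helper reformatted above. The (padded, valid) return order and the zero-fill behavior are inferred from the function body shown in the hunk, not verified against the full file:

# Hedged sketch of pad_right_to semantics: pad an array up to target_shape on
# the right and report, per dimension, the fraction holding real data.
import numpy as np
from paddlespeech.vector.io.batch import pad_right_to

feat = np.ones((80, 90))  # e.g. (mel bins, frames)
padded, valid = pad_right_to(feat, (80, 100))
print(padded.shape)  # (80, 100): ten zero frames appended on the last axis
print(valid)         # [1.0, 0.9]: dim 0 untouched, dim 1 is 90% real data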