diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
index c942c85029a6d3ab257ccb86a93d2b0f4179b0eb..f1a0e79c57713270e1af131e5f43d81b3c88fd93 100644
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -19,34 +19,28 @@ from typing import List
 from typing import Optional
 from typing import Union
 
-import librosa
-import numpy as np
 import paddle
 import soundfile
 from yacs.config import CfgNode
 
-from paddleaudio.backends import load as load_audio
-from paddleaudio.compliance.librosa import melspectrogram
-from ..download import get_path_from_url
 from ..executor import BaseExecutor
 from ..log import logger
 from ..utils import cli_register
 from ..utils import download_and_decompress
 from ..utils import MODEL_HOME
 from ..utils import stats_wrapper
-from paddlespeech.vector.io.batch import feature_normalize
-from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
-from paddlespeech.s2t.transform.transformation import Transformation
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
-from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.vector.io.batch import feature_normalize
 from paddlespeech.vector.modules.sid_model import SpeakerIdetification
 
 pretrained_models = {
     # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
-    # e.g. "EcapaTdnn_voxceleb12-16k".
+    # e.g. "ecapatdnn_voxceleb12-16k".
     # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
-    # "paddlespeech vector --task spk --model EcapaTdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
-    "EcapaTdnn_voxceleb12-16k": {
+    # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-voxceleb12-16k --sr 16000 --input ./input.wav"
+    "ecapatdnn_voxceleb12-16k": {
         'url':
         'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz',
         'md5':
@@ -59,7 +53,7 @@ pretrained_models = {
 }
 
 model_alias = {
-    "EcapaTdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+    "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
 }
 
 
@@ -75,8 +69,8 @@ class VectorExecutor(BaseExecutor):
         self.parser.add_argument(
             "--model",
             type=str,
-            default="EcapaTdnn_voxceleb12",
-            choices=["EcapaTdnn_voxceleb12"],
+            default="ecapatdnn_voxceleb12",
+            choices=["ecapatdnn_voxceleb12"],
             help="Choose model type of asr task.")
         self.parser.add_argument(
             "--task",
@@ -90,7 +84,7 @@ class VectorExecutor(BaseExecutor):
             "--sample_rate",
             type=int,
             default=16000,
-            choices=[16000, 8000],
+            choices=[16000],
             help="Choose the audio sample rate of the model. 8000 or 16000")
         self.parser.add_argument(
             "--ckpt_path",
@@ -175,7 +169,7 @@
     @stats_wrapper
     def __call__(self,
                  audio_file: os.PathLike,
-                 model: str='EcapaTdnn-voxceleb12',
+                 model: str='ecapatdnn-voxceleb12',
                  sample_rate: int=16000,
                  config: os.PathLike=None,
                  ckpt_path: os.PathLike=None,
@@ -197,9 +191,9 @@
     def _get_pretrained_path(self, tag: str) -> os.PathLike:
         support_models = list(pretrained_models.keys())
         assert tag in pretrained_models, \
-            'The model "{}" you want to use has not been supported, \
-                please choose other models.\n \
-                The support models includes \n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+            'The model "{}" you want to use has not been supported, '\
+            'please choose other models.\n' \
+            'The supported models include\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
 
         res_path = os.path.join(MODEL_HOME, tag)
         decompressed_path = download_and_decompress(pretrained_models[tag],
@@ -212,7 +206,7 @@
         return decompressed_path
 
     def _init_from_path(self,
-                        model_type: str='EcapaTdnn_voxceleb12',
+                        model_type: str='ecapatdnn_voxceleb12',
                         sample_rate: int=16000,
                         cfg_path: Optional[os.PathLike]=None,
                         ckpt_path: Optional[os.PathLike]=None):
@@ -228,8 +222,10 @@
             res_path = self._get_pretrained_path(tag)
             self.res_path = res_path
 
-            self.cfg_path = os.path.join(res_path, pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+            self.cfg_path = os.path.join(res_path,
+                                         pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
         else:
             self.cfg_path = os.path.abspath(cfg_path)
             self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
@@ -239,7 +235,7 @@
         logger.info(f"start to read the ckpt from {self.ckpt_path}")
         logger.info(f"read the config from {self.cfg_path}")
         logger.info(f"get the res path {self.res_path}")
-        
+
         # stage 2: read and config and init the model body
         self.config = CfgNode(new_allowed=True)
         self.config.merge_from_file(self.cfg_path)
@@ -269,7 +265,7 @@
         feats = self._inputs["feats"]
         lengths = self._inputs["lengths"]
 
-        logger.info(f"start to do backbone network model forward")
+        logger.info("start to do backbone network model forward")
         logger.info(
             f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")
         # embedding from (1, emb_size, 1) -> (emb_size)
diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py
index 25522ebb234dceae47d4b7b9d8b6602470d5daeb..92ca990cf2dd83f6a22127e15b50885e6809c21f 100644
--- a/paddlespeech/vector/io/batch.py
+++ b/paddlespeech/vector/io/batch.py
@@ -11,9 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import numpy
 import numpy as np
 import paddle
-import numpy
+
 
 def waveform_collate_fn(batch):
     waveforms = np.stack([item['feat'] for item in batch])
@@ -57,6 +58,7 @@ def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
     return np.pad(x, pad_width, mode=mode, **kwargs)
 
 
+
 def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
     ids = [item['id'] for item in batch]
     lengths = np.asarray([item['feat'].shape[1] for item in batch])
@@ -100,12 +102,11 @@ def pad_right_to(array, target_shape, mode="constant", value=0):
     """
     assert len(target_shape) == array.ndim
     pads = []  # this contains the abs length of the padding for each dimension.
-    valid_vals = []  # thic contains the relative lengths for each dimension.
-    i = 0  # iterating over target_shape ndims
+    valid_vals = []  # this contains the relative lengths for each dimension.
+    i = 0  # iterating over target_shape ndims
     while i < len(target_shape):
-        assert (
-            target_shape[i] >= array.shape[i]
-        ), "Target shape must be >= original shape for every dim"
+        assert (target_shape[i] >= array.shape[i]
+                ), "Target shape must be >= original shape for every dim"
         pads.append([0, target_shape[i] - array.shape[i]])
         valid_vals.append(array.shape[i] / target_shape[i])
         i += 1
@@ -136,11 +137,8 @@ def batch_pad_right(arrays, mode="constant", value=0):
         # if there is only one array in the batch we simply unsqueeze it.
         return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
 
-    if not (
-        any(
-            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]
-        )
-    ):
+    if not (any(
+        [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))])):
         raise IndexError("All arrays must have same number of dimensions")
 
     # FIXME we limit the support here: we allow padding of only the last dimension
@@ -149,11 +147,9 @@
     for dim in range(arrays[0].ndim):
         if dim != (arrays[0].ndim - 1):
             if not all(
-                [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]
-            ):
+                    [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
                 raise EnvironmentError(
-                    "arrays should have same dimensions except for last one"
-                )
+                    "arrays should have same dimensions except for last one")
         max_shape.append(max([x.shape[dim] for x in arrays]))
 
     batched = []
@@ -161,8 +157,7 @@
     for t in arrays:
         # for each array we apply pad_right_to
         padded, valid_percent = pad_right_to(
-            t, max_shape, mode=mode, value=value
-        )
+            t, max_shape, mode=mode, value=value)
         batched.append(padded)
         valid.append(valid_percent[-1])
diff --git a/paddlespeech/vector/modules/sid_model.py b/paddlespeech/vector/modules/sid_model.py
index dc13b2e02b4309ea8b57d15c53634b864b4d8db2..4045f75d1286bf2efc5b9a27f9cef25d715a8690 100644
--- a/paddlespeech/vector/modules/sid_model.py
+++ b/paddlespeech/vector/modules/sid_model.py
@@ -24,7 +24,8 @@ class SpeakerIdetification(nn.Layer):
                  lin_blocks=0,
                  lin_neurons=192,
                  dropout=0.1, ):
-        """_summary_
+        """The speaker identification model, which includes the speaker backbone network
+           and a linear transform to the speaker class num in training
 
         Args:
             backbone (Paddle.nn.Layer class): the speaker identification backbone network model
@@ -41,7 +42,7 @@
             self.dropout = nn.Dropout(dropout)
         else:
             self.dropout = None
-        
+
         # construct the speaker classifer
         input_size = self.backbone.emb_size
         self.blocks = nn.LayerList()
@@ -63,14 +64,14 @@
                   including the speaker embedding model and the classifier model network
 
         Args:
-            x (Paddle.Tensor): input audio feats,
+            x (paddle.Tensor): input audio feats,
                                shape=[batch, dimension, times]
-            lengths (_type_, optional): input audio length.
+            lengths (paddle.Tensor, optional): input audio length.
                                shape=[batch, times]
                                Defaults to None.
 
         Returns:
-            _type_: _description_
+            paddle.Tensor: return the logits of the feats
         """
         # x.shape: (N, C, L)
         x = self.backbone(x, lengths).squeeze(
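
Note: a minimal usage sketch of the renamed tag from the Python side. Only the lowercase "ecapatdnn_voxceleb12" tag and the 16 kHz restriction come from this patch; the VectorExecutor import path and the device keyword are assumed from the PaddleSpeech README, and ./input.wav is a placeholder:

# Hedged sketch: assumes the README-style VectorExecutor Python API.
import paddle
from paddlespeech.cli.vector import VectorExecutor

vector_executor = VectorExecutor()
audio_emb = vector_executor(
    model='ecapatdnn_voxceleb12',  # was "EcapaTdnn_voxceleb12" before this patch
    sample_rate=16000,             # 8000 is no longer a valid --sample_rate choice
    config=None,                   # None -> use the config bundled with the model
    ckpt_path=None,                # None -> use the downloaded pretrained checkpoint
    audio_file='./input.wav',      # placeholder path
    device=paddle.get_device())
# The executor squeezes the (1, emb_size, 1) output down to a 1-D embedding.
print(audio_emb.shape)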
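Note: a hypothetical demo of the pad_right_to helper reformatted above. The (padded, valid) return order and the zero-fill behavior are inferred from the function body shown in the hunk, not verified against the full file:

# Hedged sketch of pad_right_to semantics: pad an array up to target_shape on
# the right and report, per dimension, the fraction holding real data.
import numpy as np
from paddlespeech.vector.io.batch import pad_right_to

feat = np.ones((80, 90))  # e.g. (mel bins, frames)
padded, valid = pad_right_to(feat, (80, 100))
print(padded.shape)  # (80, 100): ten zero frames appended on the last axis
print(valid)         # [1.0, 0.9]: dim 0 untouched, dim 1 is 90% real data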