diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md index 27413bd8d045297a2cfc7ddaa998afc89d410002..7d7180ae9df6ef2c34bd414bfe65ecfc7284fc60 100644 --- a/demos/speaker_verification/README.md +++ b/demos/speaker_verification/README.md @@ -117,6 +117,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav audio_file='./123456789.wav', device=paddle.get_device()) print('Test embedding Result: \n{}'.format(test_emb)) + + # score range [0, 1] score = vector_executor.get_embeddings_score(audio_emb, test_emb) print(f"Eembeddings Score: {score}") ``` diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md index 068802fd36a61f9f84d6c2065f9818c7b6986561..db382f298df74c73ef5fcbd5a3fb64fb2fa1c44f 100644 --- a/demos/speaker_verification/README_cn.md +++ b/demos/speaker_verification/README_cn.md @@ -115,6 +115,8 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav audio_file='./123456789.wav', device=paddle.get_device()) print('Test embedding Result: \n{}'.format(test_emb)) + + # score range [0, 1] score = vector_executor.get_embeddings_score(audio_emb, test_emb) print(f"Eembeddings Score: {score}") ``` diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py index 0e7287cd3614d8964941f6d14179e0ce7f3c4d71..895ff13f4509c7070d2473aebf8ce693a50dbcee 100644 --- a/paddlespeech/vector/models/ecapa_tdnn.py +++ b/paddlespeech/vector/models/ecapa_tdnn.py @@ -79,6 +79,20 @@ class Conv1d(nn.Layer): bias_attr=bias, ) def forward(self, x): + """Do conv1d forward + + Args: + x (paddle.Tensor): [N, C, L] input data, + N is the batch, + C is the data dimension, + L is the time + + Raises: + ValueError: only the "same" padding type is supported + + Returns: + paddle.Tensor: the result of the 1-d convolution + """ if self.padding == "same": x = self._manage_padding(x, self.kernel_size, self.dilation, self.stride) @@ -88,6 +102,20 @@ class Conv1d(nn.Layer):
return self.conv(x) def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int): + """Padding the input data + + Args: + x (paddle.Tensor): [N, C, L] input data + N is the batch, + C is the data dimension, + L is the time + kernel_size (int): 1-d convolution kernel size + dilation (int): 1-d convolution dilation + stride (int): 1-d convolution stride + + Returns: + paddle.Tensor: the padded input data + """ L_in = x.shape[-1] # Detecting input shape padding = self._get_padding_elem(L_in, stride, kernel_size, dilation) # Time padding @@ -101,6 +129,17 @@ class Conv1d(nn.Layer): stride: int, kernel_size: int, dilation: int): + """Calculate the padding value in same mode + + Args: + L_in (int): the time length of the input data + stride (int): 1-d convolution stride + kernel_size (int): 1-d convolution kernel size + dilation (int): 1-d convolution dilation + + Returns: + int: return the padding value in same mode + """ if stride > 1: n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1) L_out = stride * (n_steps - 1) + kernel_size * dilation @@ -245,6 +284,13 @@ class SEBlock(nn.Layer): class AttentiveStatisticsPooling(nn.Layer): def __init__(self, channels, attention_channels=128, global_context=True): + """Compute the speaker verification statistics + The detail info is section 3.1 in https://arxiv.org/pdf/1803.10963.pdf + Args: + channels (int): input data channel or data dimension + attention_channels (int, optional): attention dimension. Defaults to 128. + global_context (bool, optional): whether to use the global context information. Defaults to True. + """ super().__init__() self.eps = 1e-12