Merge pull request #1646 from Honei/develop

[vec]add speaker verification score method

Merge pull request #1646 from Honei/develop
[vec]add speaker verification score method
f500fa8b · Honei_X · GitHub · 1843bed4 · 9b5f7f71 · f500fa8b
5 changed files
--- a/demos/speaker_verification/README.md
+++ b/demos/speaker_verification/README.md
@@ -30,6 +30,11 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
  paddlespeech vector --task spk --input vec.job
  echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
+  paddlespeech vector --task score --input "./85236145389.wav ./123456789.wav"
+  echo -e "demo4 85236145389.wav 85236145389.wav \n demo5 85236145389.wav 123456789.wav" > vec.job
+  paddlespeech vector --task score --input vec.job
  ```
  Usage:
@@ -103,6 +108,19 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
      audio_file='./85236145389.wav',
      device=paddle.get_device())
  print('Audio embedding Result: \n{}'.format(audio_emb))
+  test_emb = vector_executor(
+      model='ecapatdnn_voxceleb12',
+      sample_rate=16000,
+      config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
+      ckpt_path=None,
+      audio_file='./123456789.wav',
+      device=paddle.get_device())
+  print('Test embedding Result: \n{}'.format(test_emb))
+  # cosine similarity score, range [-1, 1]
+  score = vector_executor.get_embeddings_score(audio_emb, test_emb)
+  print(f"Eembeddings Score: {score}")
  ```
  Output：
@@ -149,6 +167,49 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
    -3.3925855    5.079156     7.759716     4.677565     5.8457737
    2.402413     7.7071047    3.9711342   -6.390043     6.1268735
    -3.7760346  -11.118123  ]
+    # get the test embedding
+    Test embedding Result:
+    [ -1.902964     2.0690894   -8.034194     3.5472693    0.18089125
+      6.9085927    1.4097427   -1.9487704  -10.021278    -0.20755845
+      -8.04332      4.344489     2.3200977  -14.306299     5.184692
+    -11.55602     -3.8497238    0.6444722    1.2833948    2.6766639
+      0.5878921    0.7946299    1.7207596    2.5791872   14.998469
+      -1.3385371   15.031221    -0.8006958    1.99287     -9.52007
+      2.435466     4.003221    -4.33817     -4.898601    -5.304714
+    -18.033886    10.790787   -12.784645    -5.641755     2.9761686
+    -10.566622     1.4839455    6.152458    -5.7195854    2.8603241
+      6.112133     8.489869     5.5958056    1.2836679   -1.2293907
+      0.89927405   7.0288725   -2.854029    -0.9782962    5.8255906
+      14.905906    -5.025907     0.7866458   -4.2444224  -16.354029
+      10.521315     0.9604709   -3.3257897    7.144871   -13.592733
+      -8.568869    -1.7953678    0.26313916  10.916714    -6.9374123
+      1.857403    -6.2746415    2.8154466   -7.2338667   -2.293357
+      -0.05452765   5.4287076    5.0849075   -6.690375    -1.6183422
+      3.654291     0.94352573  -9.200294    -5.4749465   -3.5235846
+      1.3420814    4.240421    -2.772944    -2.8451524   16.311104
+      4.2969875   -1.762936   -12.5758915    8.595198    -0.8835239
+      -1.5708797    1.568961     1.1413603    3.5032008   -0.45251232
+      -6.786333    16.89443      5.3366146   -8.789056     0.6355629
+      3.2579517   -3.328322     7.5969577    0.66025066  -6.550468
+      -9.148656     2.020372    -0.4615173    1.1965656   -3.8764873
+      11.6562195   -6.0750933   12.182899     3.2218833    0.81969476
+      5.570001    -3.8459578   -7.205299     7.9262037   -7.6611166
+      -5.249467    -2.2671914    7.2658715  -13.298164     4.821147
+      -2.7263982   11.691089    -3.8918593   -2.838112    -1.0336838
+      -3.8034165    2.8536487   -5.60398     -1.1972581    1.3455094
+      -3.4903061    2.2408795    5.5010734   -3.970756    11.99696
+      -7.8858757    0.43160373  -5.5059714    4.3426995   16.322706
+      11.635366     0.72157705  -9.245714    -3.91465     -4.449838
+      -1.5716927    7.713747    -2.2430465   -6.198303   -13.481864
+      2.8156567   -5.7812386    5.1456156    2.7289324  -14.505571
+      13.270688     3.448231    -7.0659585    4.5886116   -4.466099
+      -0.296428   -11.463529    -2.6076477   14.110243    -6.9725137
+      -1.9962958    2.7119343   19.391657     0.01961198  14.607133
+      -1.6695905   -4.391516     1.3131028   -6.670972    -5.888604
+      12.0612335    5.9285784    3.3715196    1.492534    10.723728
+      -0.95514804 -12.085431  ]
+    # get the score between enroll and test
+    Eembeddings Score: 0.4292638301849365
  ```
 ### 4.Pretrained Models

--- a/demos/speaker_verification/README_cn.md
+++ b/demos/speaker_verification/README_cn.md
@@ -29,6 +29,11 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
  paddlespeech vector --task spk --input vec.job
  echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
+  paddlespeech vector --task score --input "./85236145389.wav ./123456789.wav"
+  echo -e "demo4 85236145389.wav 85236145389.wav \n demo5 85236145389.wav 123456789.wav" > vec.job
+  paddlespeech vector --task score --input vec.job
  ```
  使用方法：
@@ -101,6 +106,19 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
      audio_file='./85236145389.wav',
      device=paddle.get_device())
  print('Audio embedding Result: \n{}'.format(audio_emb))
+  test_emb = vector_executor(
+      model='ecapatdnn_voxceleb12',
+      sample_rate=16000,
+      config=None,  # Set `config` and `ckpt_path` to None to use pretrained model.
+      ckpt_path=None,
+      audio_file='./123456789.wav',
+      device=paddle.get_device())
+  print('Test embedding Result: \n{}'.format(test_emb))
+  # cosine similarity score, range [-1, 1]
+  score = vector_executor.get_embeddings_score(audio_emb, test_emb)
+  print(f"Eembeddings Score: {score}")
  ```
  输出：
@@ -146,6 +164,49 @@ wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
    -3.3925855    5.079156     7.759716     4.677565     5.8457737
    2.402413     7.7071047    3.9711342   -6.390043     6.1268735
    -3.7760346  -11.118123  ]
+    # get the test embedding
+    Test embedding Result:
+    [ -1.902964     2.0690894   -8.034194     3.5472693    0.18089125
+      6.9085927    1.4097427   -1.9487704  -10.021278    -0.20755845
+      -8.04332      4.344489     2.3200977  -14.306299     5.184692
+    -11.55602     -3.8497238    0.6444722    1.2833948    2.6766639
+      0.5878921    0.7946299    1.7207596    2.5791872   14.998469
+      -1.3385371   15.031221    -0.8006958    1.99287     -9.52007
+      2.435466     4.003221    -4.33817     -4.898601    -5.304714
+    -18.033886    10.790787   -12.784645    -5.641755     2.9761686
+    -10.566622     1.4839455    6.152458    -5.7195854    2.8603241
+      6.112133     8.489869     5.5958056    1.2836679   -1.2293907
+      0.89927405   7.0288725   -2.854029    -0.9782962    5.8255906
+      14.905906    -5.025907     0.7866458   -4.2444224  -16.354029
+      10.521315     0.9604709   -3.3257897    7.144871   -13.592733
+      -8.568869    -1.7953678    0.26313916  10.916714    -6.9374123
+      1.857403    -6.2746415    2.8154466   -7.2338667   -2.293357
+      -0.05452765   5.4287076    5.0849075   -6.690375    -1.6183422
+      3.654291     0.94352573  -9.200294    -5.4749465   -3.5235846
+      1.3420814    4.240421    -2.772944    -2.8451524   16.311104
+      4.2969875   -1.762936   -12.5758915    8.595198    -0.8835239
+      -1.5708797    1.568961     1.1413603    3.5032008   -0.45251232
+      -6.786333    16.89443      5.3366146   -8.789056     0.6355629
+      3.2579517   -3.328322     7.5969577    0.66025066  -6.550468
+      -9.148656     2.020372    -0.4615173    1.1965656   -3.8764873
+      11.6562195   -6.0750933   12.182899     3.2218833    0.81969476
+      5.570001    -3.8459578   -7.205299     7.9262037   -7.6611166
+      -5.249467    -2.2671914    7.2658715  -13.298164     4.821147
+      -2.7263982   11.691089    -3.8918593   -2.838112    -1.0336838
+      -3.8034165    2.8536487   -5.60398     -1.1972581    1.3455094
+      -3.4903061    2.2408795    5.5010734   -3.970756    11.99696
+      -7.8858757    0.43160373  -5.5059714    4.3426995   16.322706
+      11.635366     0.72157705  -9.245714    -3.91465     -4.449838
+      -1.5716927    7.713747    -2.2430465   -6.198303   -13.481864
+      2.8156567   -5.7812386    5.1456156    2.7289324  -14.505571
+      13.270688     3.448231    -7.0659585    4.5886116   -4.466099
+      -0.296428   -11.463529    -2.6076477   14.110243    -6.9725137
+      -1.9962958    2.7119343   19.391657     0.01961198  14.607133
+      -1.6695905   -4.391516     1.3131028   -6.670972    -5.888604
+      12.0612335    5.9285784    3.3715196    1.492534    10.723728
+      -0.95514804 -12.085431  ]
+    # get the score between enroll and test
+    Eembeddings Score: 0.4292638301849365
  ```
 ### 4.预训练模型

--- a/demos/speaker_verification/run.sh
+++ b/demos/speaker_verification/run.sh
 #!/bin/bash
 wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/123456789.wav
 # vector
 paddlespeech vector --task spk --input ./85236145389.wav
+paddlespeech vector --task score --input "./85236145389.wav ./123456789.wav"
--- a/paddlespeech/cli/vector/infer.py
+++ b/paddlespeech/cli/vector/infer.py
@@ -15,6 +15,7 @@ import argparse
 import os
 import sys
 from collections import OrderedDict
+from typing import Dict
 from typing import List
 from typing import Optional
 from typing import Union
@@ -79,7 +80,7 @@ class VectorExecutor(BaseExecutor):
            "--task",
            type=str,
            default="spk",
-            choices=["spk"],
+            choices=["spk", "score"],
            help="task type in vector domain")
        self.parser.add_argument(
            "--input",
@@ -147,13 +148,40 @@ class VectorExecutor(BaseExecutor):
        logger.info(f"task source: {task_source}")
        # stage 3: process the audio one by one
+        # act according to the task type
        task_result = OrderedDict()
        has_exceptions = False
        for id_, input_ in task_source.items():
            try:
-                res = self(input_, model, sample_rate, config, ckpt_path,
+                # extract the speaker audio embedding
-                           device)
+                if parser_args.task == "spk":
-                task_result[id_] = res
+                    logger.info("do vector spk task")
+                    res = self(input_, model, sample_rate, config, ckpt_path,
+                               device)
+                    task_result[id_] = res
+                elif parser_args.task == "score":
+                    logger.info("do vector score task")
+                    logger.info(f"input content {input_}")
+                    if len(input_.split()) != 2:
+                        logger.error(
+                            f"vector score task input {input_} wav num is not two,"
+                            "that is {len(input_.split())}")
+                        sys.exit(-1)
+                    # get the enroll and test embedding
+                    enroll_audio, test_audio = input_.split()
+                    logger.info(
+                        f"score task, enroll audio: {enroll_audio}, test audio: {test_audio}"
+                    )
+                    enroll_embedding = self(enroll_audio, model, sample_rate,
+                                            config, ckpt_path, device)
+                    test_embedding = self(test_audio, model, sample_rate,
+                                          config, ckpt_path, device)
+                    # get the score
+                    res = self.get_embeddings_score(enroll_embedding,
+                                                    test_embedding)
+                    task_result[id_] = res
            except Exception as e:
                has_exceptions = True
                task_result[id_] = f'{e.__class__.__name__}: {e}'
@@ -172,6 +200,49 @@ class VectorExecutor(BaseExecutor):
        else:
            return True
+    def _get_job_contents(
+            self, job_input: os.PathLike) -> Dict[str, Union[str, os.PathLike]]:
+        """
+        Parse a job input file into an ordered mapping of job id to job value.
+        Every line is stripped and blank lines are skipped. The text before the
+        first space is the key; everything after that space is the value (an
+        empty string when the line contains no space). A repeated key keeps the
+        value of its last occurrence.
+        Refactor from the Executor._get_job_contents
+        Args:
+            job_input (os.PathLike): The job input file.
+        Returns:
+            Dict[str, str]: Contents of job input.
+        """
+        parsed = OrderedDict()
+        with open(job_input) as job_file:
+            for raw in job_file:
+                entry = raw.strip()
+                if entry:
+                    # split once, at the first space only
+                    job_id, _, payload = entry.partition(' ')
+                    parsed[job_id] = payload
+        return parsed
+    def get_embeddings_score(self, enroll_embedding, test_embedding):
+        """Score an enroll embedding against a test embedding with cosine similarity.
+        The paddle.nn.CosineSimilarity(axis=0) scorer is created on first use
+        and kept on the instance as `score_func` for later calls.
+        Args:
+            enroll_embedding (numpy.array): shape: (emb_size), enroll audio embedding
+            test_embedding (numpy.array): shape: (emb_size), test audio embedding
+        Returns:
+            score: the score between enroll embedding and test embedding,
+                   taken from the result tensor with `.item()`
+        """
+        scorer_missing = not hasattr(self, "score_func")
+        if scorer_missing:
+            self.score_func = paddle.nn.CosineSimilarity(axis=0)
+            logger.info("create the cosine score function ")
+        enroll_tensor = paddle.to_tensor(enroll_embedding)
+        test_tensor = paddle.to_tensor(test_embedding)
+        similarity = self.score_func(enroll_tensor, test_tensor)
+        return similarity.item()
    @stats_wrapper
    def __call__(self,
                 audio_file: os.PathLike,

--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -79,6 +79,20 @@ class Conv1d(nn.Layer):
            bias_attr=bias, )
    def forward(self, x):
+        """Do conv1d forward
+        Args:
+            x (paddle.Tensor): [N, C, L] input data, 
+                                N is the batch,
+                                C is the data dimension, 
+                                L is the time
+        Raises:
+            ValueError: only support the same padding type
+        Returns:
+            paddle.Tensor: the value of conv1d
+        """
        if self.padding == "same":
            x = self._manage_padding(x, self.kernel_size, self.dilation,
                                     self.stride)
@@ -88,6 +102,20 @@ class Conv1d(nn.Layer):
        return self.conv(x)
    def _manage_padding(self, x, kernel_size: int, dilation: int, stride: int):
+        """Padding the input data
+        Args:
+            x (paddle.Tensor): [N, C, L] input data
+                                N is the batch,
+                                C is the data dimension, 
+                                L is the time
+            kernel_size (int): 1-d convolution kernel size
+            dilation (int): 1-d convolution dilation
+            stride (int): 1-d convolution stride
+        Returns:
+            paddle.Tensor: the padded input data
+        """
        L_in = x.shape[-1]  # Detecting input shape
        padding = self._get_padding_elem(L_in, stride, kernel_size,
                                         dilation)  # Time padding
@@ -101,6 +129,17 @@ class Conv1d(nn.Layer):
                          stride: int,
                          kernel_size: int,
                          dilation: int):
+        """Calculate the padding value in same mode
+        Args:
+            L_in (int): the time length (last dimension) of the input data
+            stride (int): 1-d convolution stride
+            kernel_size (int): 1-d convolution kernel size
+            dilation (int): 1-d convolution dilation
+        Returns:
+            int: return the padding value in same mode
+        """
        if stride > 1:
            n_steps = math.ceil(((L_in - kernel_size * dilation) / stride) + 1)
            L_out = stride * (n_steps - 1) + kernel_size * dilation
@@ -245,6 +284,13 @@ class SEBlock(nn.Layer):
 class AttentiveStatisticsPooling(nn.Layer):
    def __init__(self, channels, attention_channels=128, global_context=True):
+        """Compute the speaker verification statistics
+           The detail info is section 3.1 in https://arxiv.org/pdf/1709.01507.pdf 
+        Args:
+            channels (int): input data channel or data dimension
+            attention_channels (int, optional): attention dimension. Defaults to 128.
+            global_context (bool, optional): If use the global context information. Defaults to True.
+        """
        super().__init__()
        self.eps = 1e-12