PaddlePaddle / DeepSpeech
Commit 435e86b3 (unverified)
Authored by Hui Zhang on May 05, 2022; committed by GitHub on May 05, 2022

Merge pull request #1835 from Honei/vec_server

[vec][server] add vector server

Parents: 8850955d, 10da21a7

18 changed files with 754 additions and 11 deletions (+754, -11)
Changed files:
- demos/speaker_verification/README.md (+1, -1)
- demos/speaker_verification/README_cn.md (+3, -3)
- demos/streaming_asr_server/websocket_client.py (+6, -1)
- paddlespeech/cli/vector/infer.py (+5, -1)
- paddlespeech/server/README_cn.md (+20, -0)
- paddlespeech/server/bin/paddlespeech_client.py (+106, -1)
- paddlespeech/server/conf/application.yaml (+12, -1)
- paddlespeech/server/conf/vector_application.yaml (+32, -0, new file)
- paddlespeech/server/engine/asr/online/asr_engine.py (+8, -0)
- paddlespeech/server/engine/engine_factory.py (+3, -0)
- paddlespeech/server/engine/vector/__init__.py (+0, -0, new file)
- paddlespeech/server/engine/vector/python/__init__.py (+0, -0, new file)
- paddlespeech/server/engine/vector/python/vector_engine.py (+200, -0, new file)
- paddlespeech/server/restful/api.py (+3, -1)
- paddlespeech/server/restful/request.py (+41, -1)
- paddlespeech/server/restful/response.py (+62, -1)
- paddlespeech/server/restful/vector_api.py (+151, -0, new file)
- paddlespeech/server/utils/audio_handler.py (+101, -0)
demos/speaker_verification/README.md (+1, -1)

@@ -14,7 +14,7 @@ see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/doc
 You can choose one way from easy, medium and hard to install paddlespeech.

 ### 2. Prepare Input File
-The input of this demo should be a WAV file (`.wav`), and the sample rate must be the same as the model.
+The input of this cli demo should be a WAV file (`.wav`), and the sample rate must be the same as the model.

 Here are sample files for this demo that can be downloaded:
 ```bash
demos/speaker_verification/README_cn.md (+3, -3)

@@ -4,16 +4,16 @@
 ## 介绍
 声纹识别是一项用计算机程序自动提取说话人特征的技术。
-这个 demo 是一个从给定音频文件提取说话人特征,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。
+这个 demo 是从一个给定音频文件中提取说话人特征,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。

 ## 使用方法
 ### 1. 安装
 请看 [安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
-你可以从 easy,medium,hard 三中方式中选择一种方式安装。
+你可以从 easy,medium,hard 三种方式中选择一种方式安装。

 ### 2. 准备输入
-这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
+声纹 cli demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。

 可以下载此 demo 的示例音频:
 ```bash
demos/streaming_asr_server/websocket_client.py (+6, -1)

@@ -28,6 +28,7 @@ def main(args):
     handler = ASRWsAudioHandler(
         args.server_ip,
         args.port,
+        endpoint=args.endpoint,
         punc_server_ip=args.punc_server_ip,
         punc_server_port=args.punc_server_port)
     loop = asyncio.get_event_loop()

@@ -69,7 +70,11 @@ if __name__ == "__main__":
         default=8091,
         dest="punc_server_port",
         help='Punctuation server port')
+    parser.add_argument(
+        "--endpoint",
+        type=str,
+        default="/paddlespeech/asr/streaming",
+        help="ASR websocket endpoint")
     parser.add_argument(
         "--wavfile",
         action="store",
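For reference, a minimal sketch of driving the updated client from Python rather than the CLI. The wav path is a placeholder, the punctuation server arguments are left unset, and it assumes `handler.run()` is the coroutine the script ultimately drives with the event loop:

```python
import asyncio

from paddlespeech.server.utils.audio_handler import ASRWsAudioHandler

# Build the streaming ASR websocket client; the new `endpoint` keyword
# selects the websocket route instead of a hard-coded path.
handler = ASRWsAudioHandler(
    "127.0.0.1",
    8090,
    endpoint="/paddlespeech/asr/streaming",
    punc_server_ip=None,        # optional punctuation server
    punc_server_port=None)

# Drive the async client the same way websocket_client.py does.
loop = asyncio.get_event_loop()
result = loop.run_until_complete(handler.run("./zh.wav"))  # placeholder wav path
print(result)
```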
paddlespeech/cli/vector/infer.py (+5, -1)

@@ -272,7 +272,8 @@ class VectorExecutor(BaseExecutor):
                         model_type: str='ecapatdnn_voxceleb12',
                         sample_rate: int=16000,
                         cfg_path: Optional[os.PathLike]=None,
-                        ckpt_path: Optional[os.PathLike]=None):
+                        ckpt_path: Optional[os.PathLike]=None,
+                        task=None):
        """Init the neural network from the model path
        Args:

@@ -284,8 +285,10 @@ class VectorExecutor(BaseExecutor):
                Defaults to None.
            ckpt_path (Optional[os.PathLike], optional): the pretrained model path, which is stored in the disk.
                Defaults to None.
+            task (str, optional): the model task type
        """
        # stage 0: avoid to init the mode again
+        self.task = task
        if hasattr(self, "model"):
            logger.info("Model has been initialized")
            return

@@ -434,6 +437,7 @@ class VectorExecutor(BaseExecutor):
        if self.sample_rate != 16000 and self.sample_rate != 8000:
            logger.error(
                "invalid sample rate, please input --sr 8000 or --sr 16000")
+            logger.error(f"The model sample rate: {self.sample_rate}, the external sample rate is: {sample_rate}")
            return False

        if isinstance(audio_file, (str, os.PathLike)):
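The `task` argument lets the server-side wrapper reuse this initializer for the speaker-verification task. For orientation, a sketch of calling the CLI executor directly; the keyword names follow the speaker_verification demo and should be treated as assumptions if your version differs:

```python
import paddle

from paddlespeech.cli.vector.infer import VectorExecutor

vector_executor = VectorExecutor()

# Extract a speaker embedding from a 16 kHz wav file; the sample file name
# comes from the demo README above.
audio_emb = vector_executor(
    model='ecapatdnn_voxceleb12',
    sample_rate=16000,
    config=None,
    ckpt_path=None,
    audio_file='./85236145389.wav',
    device=paddle.get_device())
print(audio_emb.shape)
```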
paddlespeech/server/README_cn.md (+20, -0)

@@ -63,3 +63,23 @@ paddlespeech_server start --config_file conf/tts_online_application.yaml
 ```
 paddlespeech_client tts_online --server_ip 127.0.0.1 --port 8092 --input "您好,欢迎使用百度飞桨深度学习框架!" --output output.wav
 ```
+
+## 声纹识别
+### 启动声纹识别服务
+```
+paddlespeech_server start --config_file conf/vector_application.yaml
+```
+
+### 获取说话人音频声纹
+```
+paddlespeech_client vector --task spk --server_ip 127.0.0.1 --port 8090 --input 85236145389.wav
+```
+
+### 两个说话人音频声纹打分
+```
+paddlespeech_client vector --task score --server_ip 127.0.0.1 --port 8090 --enroll 123456789.wav --test 85236145389.wav
+```
\ No newline at end of file
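The two `paddlespeech_client vector` commands above talk to the REST routes added later in this PR. A rough plain-Python equivalent of the scoring command, assuming the server started from `conf/vector_application.yaml` is listening on 127.0.0.1:8090 and using the sample wav names from the README:

```python
import base64
import json

import requests


def wav_to_base64(path: str) -> str:
    # The vector service expects the raw wav bytes as a base64 string.
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode("utf8")


payload = {
    "enroll_audio": wav_to_base64("123456789.wav"),
    "test_audio": wav_to_base64("85236145389.wav"),
    "task": "score",
    "audio_format": "wav",
    "sample_rate": 16000,
}
resp = requests.post(
    "http://127.0.0.1:8090/paddlespeech/vector/score", data=json.dumps(payload))
print(resp.json()["result"]["score"])
```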
paddlespeech/server/bin/paddlespeech_client.py (+106, -1)

@@ -35,7 +35,7 @@ from paddlespeech.server.utils.util import wav2base64

 __all__ = [
     'TTSClientExecutor', 'TTSOnlineClientExecutor', 'ASRClientExecutor',
-    'ASROnlineClientExecutor', 'CLSClientExecutor'
+    'ASROnlineClientExecutor', 'CLSClientExecutor', 'VectorClientExecutor'
 ]

@@ -583,3 +583,108 @@ class TextClientExecutor(BaseExecutor):
        response_dict = res.json()
        punc_text = response_dict["result"]["punc_text"]
        return punc_text

The new `VectorClientExecutor` is appended after `TextClientExecutor`:

@cli_client_register(
    name='paddlespeech_client.vector', description='visit the vector service')
class VectorClientExecutor(BaseExecutor):
    def __init__(self):
        super(VectorClientExecutor, self).__init__()
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech_client.vector', add_help=True)
        self.parser.add_argument(
            '--server_ip', type=str, default='127.0.0.1', help='server ip')
        self.parser.add_argument(
            '--port', type=int, default=8090, help='server port')
        self.parser.add_argument(
            '--input',
            type=str,
            default=None,
            help='sentence to be process by text server.')
        self.parser.add_argument(
            '--task',
            type=str,
            default="spk",
            choices=["spk", "score"],
            help="The vector service task")
        self.parser.add_argument(
            "--enroll", type=str, default=None, help="The enroll audio")
        self.parser.add_argument(
            "--test", type=str, default=None, help="The test audio")

    def execute(self, argv: List[str]) -> bool:
        """Execute the request from the argv.
        Args:
            argv (List): the request arguments
        Returns:
            str: the request flag
        """
        args = self.parser.parse_args(argv)
        input_ = args.input
        server_ip = args.server_ip
        port = args.port
        task = args.task

        try:
            time_start = time.time()
            res = self(
                input=input_,
                server_ip=server_ip,
                port=port,
                enroll_audio=args.enroll,
                test_audio=args.test,
                task=task)
            time_end = time.time()
            logger.info(f"The vector: {res}")
            logger.info("Response time %f s." % (time_end - time_start))
            return True
        except Exception as e:
            logger.error("Failed to extract vector.")
            logger.error(e)
            return False

    @stats_wrapper
    def __call__(self,
                 input: str,
                 server_ip: str="127.0.0.1",
                 port: int=8090,
                 audio_format: str="wav",
                 sample_rate: int=16000,
                 enroll_audio: str=None,
                 test_audio: str=None,
                 task="spk"):
        """
        Python API to call the vector executor.
        Args:
            input (str): the request audio data
            server_ip (str, optional): the server ip. Defaults to "127.0.0.1".
            port (int, optional): the server port. Defaults to 8090.
            audio_format (str, optional): audio format. Defaults to "wav".
            sample_rate (int, optional): audio sample rate. Defaults to 16000.
            enroll_audio (str, optional): enroll audio data. Defaults to None.
            test_audio (str, optional): test audio data. Defaults to None.
            task (str, optional): the task type, "spk" or "score". Defaults to "spk".
        Returns:
            str: the audio embedding or score between enroll and test audio
        """
        if task == "spk":
            from paddlespeech.server.utils.audio_handler import VectorHttpHandler
            logger.info("vector http client start")
            logger.info(f"the input audio: {input}")
            handler = VectorHttpHandler(server_ip=server_ip, port=port)
            res = handler.run(input, audio_format, sample_rate)
            return res
        elif task == "score":
            from paddlespeech.server.utils.audio_handler import VectorScoreHttpHandler
            logger.info("vector score http client start")
            logger.info(
                f"enroll audio: {enroll_audio}, test audio: {test_audio}")
            handler = VectorScoreHttpHandler(server_ip=server_ip, port=port)
            res = handler.run(enroll_audio, test_audio, audio_format,
                              sample_rate)
            logger.info(f"The vector score is: {res}")
        else:
            logger.error(f"Sorry, we have not support such task {task}")
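A short usage sketch of this new Python API, assuming a vector server is already running on 127.0.0.1:8090 and reusing the sample wav names from the server README:

```python
from paddlespeech.server.bin.paddlespeech_client import VectorClientExecutor

vectorclient_executor = VectorClientExecutor()

# task="spk": extract and return the speaker embedding of one utterance.
res = vectorclient_executor(
    input="85236145389.wav",
    server_ip="127.0.0.1",
    port=8090,
    task="spk")
print(res)

# task="score": score an enrollment utterance against a test utterance.
# Note that in this version the score branch only logs the result.
vectorclient_executor(
    input=None,
    enroll_audio="123456789.wav",
    test_audio="85236145389.wav",
    server_ip="127.0.0.1",
    port=8090,
    task="score")
```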
paddlespeech/server/conf/application.yaml (+12, -1)

@@ -11,7 +11,7 @@ port: 8090
 # protocol = ['websocket', 'http'] (only one can be selected).
 # http only support offline engine type.
 protocol: 'http'
-engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python']
+engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']

 #################################################################################

@@ -166,4 +166,15 @@ text_python:
     cfg_path: # [optional]
     ckpt_path: # [optional]
     vocab_file: # [optional]
     device: # set 'gpu:id' or 'cpu'
+
+
+################################### Vector ######################################
+################### Vector task: spk; engine_type: python #######################
+vector_python:
+    task: spk
+    model_type: 'ecapatdnn_voxceleb12'
+    sample_rate: 16000
+    cfg_path: # [optional]
+    ckpt_path: # [optional]
+    device: # set 'gpu:id' or 'cpu'
\ No newline at end of file
paddlespeech/server/conf/vector_application.yaml (new file, +32)

# This is the parameter configuration file for PaddleSpeech Serving.

#################################################################################
#                             SERVER SETTING                                    #
#################################################################################
host: 0.0.0.0
port: 8090

# The task format in the engin_list is: <speech task>_<engine type>
# protocol = ['http'] (only one can be selected).
# http only support offline engine type.
protocol: 'http'
engine_list: ['vector_python']

#################################################################################
#                                ENGINE CONFIG                                  #
#################################################################################

################################### Vector ######################################
################### Vector task: spk; engine_type: python #######################
vector_python:
    task: spk
    model_type: 'ecapatdnn_voxceleb12'
    sample_rate: 16000
    cfg_path: # [optional]
    ckpt_path: # [optional]
    device: # set 'gpu:id' or 'cpu'
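Besides `paddlespeech_server start --config_file conf/vector_application.yaml`, the server can also be launched from Python. The sketch below relies on the `ServerExecutor` entry point documented in PaddleSpeech's server demos, not on anything in this diff, so treat the import and call signature as assumptions:

```python
# Assumed entry point from PaddleSpeech's server demos; not part of this diff.
from paddlespeech.server.bin.paddlespeech_server import ServerExecutor

server_executor = ServerExecutor()
server_executor(
    config_file="./paddlespeech/server/conf/vector_application.yaml",
    log_file="./log/paddlespeech.log")
```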
paddlespeech/server/engine/asr/online/asr_engine.py (+8, -0)

@@ -13,6 +13,7 @@
 # limitations under the License.
 import copy
 import os
+import time
 from typing import Optional

 import numpy as np

@@ -153,6 +154,12 @@ class PaddleASRConnectionHanddler:
        self.n_shift = self.preprocess_conf.process[0]['n_shift']

    def extract_feat(self, samples):
+        # we compute the elapsed time of first char occuring
+        # and we record the start time at the first pcm sample arraving
+        # if self.first_char_occur_elapsed is not None:
+        #     self.first_char_occur_elapsed = time.time()
+
        if "deepspeech2online" in self.model_type:
            # self.reamined_wav stores all the samples,
            # include the original remained_wav and this package samples

@@ -290,6 +297,7 @@ class PaddleASRConnectionHanddler:
        self.chunk_num = 0
        self.global_frame_offset = 0
        self.result_transcripts = ['']
+        self.first_char_occur_elapsed = None

    def decode(self, is_finished=False):
        if "deepspeech2online" in self.model_type:
paddlespeech/server/engine/engine_factory.py (+3, -0)

@@ -49,5 +49,8 @@ class EngineFactory(object):
        elif engine_name.lower() == 'text' and engine_type.lower() == 'python':
            from paddlespeech.server.engine.text.python.text_engine import TextEngine
            return TextEngine()
+        elif engine_name.lower() == 'vector' and engine_type.lower() == 'python':
+            from paddlespeech.server.engine.vector.python.vector_engine import VectorEngine
+            return VectorEngine()
        else:
            return None
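The factory maps an `(engine_name, engine_type)` pair from `engine_list` (for example `vector_python`) to an engine instance. A sketch of the dispatch, assuming the factory's entry point is a `get_engine(engine_name, engine_type)` method as the branch structure suggests:

```python
from paddlespeech.server.engine.engine_factory import EngineFactory

# 'vector_python' in engine_list splits into name='vector', type='python',
# which now resolves to the new VectorEngine.
engine = EngineFactory.get_engine(engine_name='vector', engine_type='python')
print(type(engine).__name__)  # expected: VectorEngine
```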
paddlespeech/server/engine/vector/__init__.py (new file, empty)

paddlespeech/server/engine/vector/python/__init__.py (new file, empty)
paddlespeech/server/engine/vector/python/vector_engine.py (new file, +200)

The engine and per-connection handler for the vector service:

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import io
import sys  # needed by the sys.exit call in extract_audio_embedding
from collections import OrderedDict

import numpy as np
import paddle
from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram

from paddlespeech.cli.log import logger
from paddlespeech.cli.vector.infer import VectorExecutor
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.vector.io.batch import feature_normalize


class PaddleVectorConnectionHandler:
    def __init__(self, vector_engine):
        """The PaddleSpeech Vector Server Connection Handler
           This connection process every server request
        Args:
            vector_engine (VectorEngine): The Vector engine
        """
        super().__init__()
        logger.info(
            "Create PaddleVectorConnectionHandler to process the vector request")
        self.vector_engine = vector_engine
        self.executor = self.vector_engine.executor
        self.task = self.vector_engine.executor.task
        self.model = self.vector_engine.executor.model
        self.config = self.vector_engine.executor.config
        self._inputs = OrderedDict()
        self._outputs = OrderedDict()

    @paddle.no_grad()
    def run(self, audio_data, task="spk"):
        """The connection process the http request audio
        Args:
            audio_data (bytes): base64.b64decode
        Returns:
            numpy.ndarray: the extracted speaker embedding
        """
        logger.info(
            f"start to extract the do vector {self.task} from the http request")
        if self.task == "spk" and task == "spk":
            embedding = self.extract_audio_embedding(audio_data)
            return embedding
        else:
            logger.error(
                "The request task is not matched with server model task")
            logger.error(
                f"The server model task is: {self.task}, but the request task is: {task}"
            )

        return np.array([
            0.0,
        ])

    @paddle.no_grad()
    def get_enroll_test_score(self, enroll_audio, test_audio):
        """Get the enroll and test audio score
        Args:
            enroll_audio (str): the base64 format enroll audio
            test_audio (str): the base64 format test audio
        Returns:
            float: the score between enroll and test audio
        """
        logger.info("start to extract the enroll audio embedding")
        enroll_emb = self.extract_audio_embedding(enroll_audio)

        logger.info("start to extract the test audio embedding")
        test_emb = self.extract_audio_embedding(test_audio)

        logger.info(
            "start to get the score between the enroll and test embedding")
        score = self.executor.get_embeddings_score(enroll_emb, test_emb)

        logger.info(f"get the enroll vs test score: {score}")
        return score

    @paddle.no_grad()
    def extract_audio_embedding(self, audio: str, sample_rate: int=16000):
        """extract the audio embedding
        Args:
            audio (str): the audio data
            sample_rate (int, optional): the audio sample rate. Defaults to 16000.
        """
        # we can not reuse the cache io.BytesIO(audio) data,
        # because the soundfile will change the io.BytesIO(audio) to the end
        # thus we should convert the base64 string to io.BytesIO when we need the audio data
        if not self.executor._check(io.BytesIO(audio), sample_rate):
            logger.info("check the audio sample rate occurs error")
            return np.array([0.0])

        waveform, sr = load_audio(io.BytesIO(audio))
        logger.info(f"load the audio sample points, shape is: {waveform.shape}")

        # stage 2: get the audio feat
        # Note: Now we only support fbank feature
        try:
            feats = melspectrogram(
                x=waveform,
                sr=self.config.sr,
                n_mels=self.config.n_mels,
                window_size=self.config.window_size,
                hop_length=self.config.hop_size)
            logger.info(f"extract the audio feats, shape is: {feats.shape}")
        except Exception as e:
            logger.info(f"feats occurs exception {e}")
            sys.exit(-1)

        feats = paddle.to_tensor(feats).unsqueeze(0)
        # in inference period, the lengths is all one without padding
        lengths = paddle.ones([1])

        # stage 3: we do feature normalize,
        #          Now we assume that the feats must do normalize
        feats = feature_normalize(feats, mean_norm=True, std_norm=False)

        # stage 4: store the feats and length in the _inputs,
        #          which will be used in other function
        logger.info(f"feats shape: {feats.shape}")
        logger.info("audio extract the feats success")

        logger.info("start to extract the audio embedding")
        embedding = self.model.backbone(feats, lengths).squeeze().numpy()
        logger.info(f"embedding size: {embedding.shape}")

        return embedding


class VectorServerExecutor(VectorExecutor):
    def __init__(self):
        """The wrapper for VectorExecutor
        """
        super().__init__()
        pass


class VectorEngine(BaseEngine):
    def __init__(self):
        """The Vector Engine
        """
        super(VectorEngine, self).__init__()
        logger.info("Create the VectorEngine Instance")

    def init(self, config: dict):
        """Init the Vector Engine
        Args:
            config (dict): The server configuation
        Returns:
            bool: The engine instance flag
        """
        logger.info("Init the vector engine")
        try:
            self.config = config
            if self.config.device:
                self.device = self.config.device
            else:
                self.device = paddle.get_device()

            paddle.set_device(self.device)
            logger.info(f"Vector Engine set the device: {self.device}")
        except BaseException as e:
            logger.error(
                "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
            )
            logger.error(
                "Initialize Vector server engine Failed on device: %s." %
                (self.device))
            return False

        self.executor = VectorServerExecutor()

        self.executor._init_from_path(
            model_type=config.model_type,
            cfg_path=config.cfg_path,
            ckpt_path=config.ckpt_path,
            task=config.task)

        logger.info("Init the Vector engine successfully")
        return True
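A rough sketch of how these server-side pieces fit together at runtime. The `SimpleNamespace` config is a hypothetical stand-in for the parsed `vector_python` section of `vector_application.yaml`; the real server builds an attribute-style config from the YAML file:

```python
from types import SimpleNamespace

from paddlespeech.server.engine.vector.python.vector_engine import (
    PaddleVectorConnectionHandler, VectorEngine)

# Stand-in for the parsed `vector_python` config section (hypothetical object).
config = SimpleNamespace(
    task="spk",
    model_type="ecapatdnn_voxceleb12",
    sample_rate=16000,
    cfg_path=None,
    ckpt_path=None,
    device=None)

# The engine is created and initialized once at server start-up...
engine = VectorEngine()
assert engine.init(config)

# ...and every HTTP request gets a connection handler wrapping that engine.
handler = PaddleVectorConnectionHandler(engine)
with open("85236145389.wav", "rb") as f:
    audio_bytes = f.read()  # the REST layer receives this base64-encoded and decodes it
embedding = handler.run(audio_bytes, task="spk")
print(embedding.shape)
```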
paddlespeech/server/restful/api.py (+3, -1)

@@ -21,7 +21,7 @@ from paddlespeech.server.restful.asr_api import router as asr_router
 from paddlespeech.server.restful.cls_api import router as cls_router
 from paddlespeech.server.restful.text_api import router as text_router
 from paddlespeech.server.restful.tts_api import router as tts_router
+from paddlespeech.server.restful.vector_api import router as vec_router

 _router = APIRouter()

@@ -43,6 +43,8 @@ def setup_router(api_list: List):
            _router.include_router(cls_router)
        elif api_name == 'text':
            _router.include_router(text_router)
+        elif api_name.lower() == 'vector':
+            _router.include_router(vec_router)
        else:
            logger.error(
                f"PaddleSpeech has not support such service: {api_name}")
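With the vector router registered, enabling the service is just a matter of listing it when the REST app is assembled, for example (assuming `setup_router` returns the assembled `APIRouter`):

```python
from paddlespeech.server.restful.api import setup_router

# 'vector' now resolves to the router defined in restful/vector_api.py.
api_router = setup_router(['asr', 'tts', 'cls', 'text', 'vector'])
```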
paddlespeech/server/restful/request.py (+41, -1)

@@ -15,7 +15,10 @@ from typing import Optional
 from pydantic import BaseModel

-__all__ = ['ASRRequest', 'TTSRequest', 'CLSRequest']
+__all__ = [
+    'ASRRequest', 'TTSRequest', 'CLSRequest', 'VectorRequest',
+    'VectorScoreRequest'
+]

 #****************************************************************************************/

@@ -85,3 +88,40 @@ class CLSRequest(BaseModel):
 #****************************************************************************************/
 class TextRequest(BaseModel):
     text: str

The following request models are added:

#****************************************************************************************/
#************************************ Vector request ************************************/
#****************************************************************************************/
class VectorRequest(BaseModel):
    """
    request body example
    {
        "audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...",
        "task": "spk",
        "audio_format": "wav",
        "sample_rate": 16000,
    }
    """
    audio: str
    task: str
    audio_format: str
    sample_rate: int


class VectorScoreRequest(BaseModel):
    """
    request body example
    {
        "enroll_audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...",
        "test_audio": "exSI6ICJlbiIsCgkgICAgInBvc2l0aW9uIjogImZhbHNlIgoJf...",
        "task": "score",
        "audio_format": "wav",
        "sample_rate": 16000,
    }
    """
    enroll_audio: str
    test_audio: str
    task: str
    audio_format: str
    sample_rate: int
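A small sketch of building a request body with these models; pydantic validates the field types, and `.json()` yields the payload the `/paddlespeech/vector` route expects:

```python
import base64

from paddlespeech.server.restful.request import VectorRequest

with open("85236145389.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf8")

req = VectorRequest(
    audio=audio_b64, task="spk", audio_format="wav", sample_rate=16000)
body = req.json()  # serialized request body for the vector endpoint
```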
paddlespeech/server/restful/response.py (+62, -1)

@@ -15,7 +15,10 @@ from typing import List
 from pydantic import BaseModel

-__all__ = ['ASRResponse', 'TTSResponse', 'CLSResponse']
+__all__ = [
+    'ASRResponse', 'TTSResponse', 'CLSResponse', 'TextResponse',
+    'VectorResponse', 'VectorScoreResponse'
+]

 class Message(BaseModel):

@@ -129,6 +132,11 @@ class CLSResponse(BaseModel):
     result: CLSResult

+#****************************************************************************************/
+#************************************ Text response **************************************/
+#****************************************************************************************/
 class TextResult(BaseModel):
     punc_text: str

@@ -153,6 +161,59 @@ class TextResponse(BaseModel):
     result: TextResult

The following response models are added:

#****************************************************************************************/
#************************************ Vector response **************************************/
#****************************************************************************************/
class VectorResult(BaseModel):
    vec: list


class VectorResponse(BaseModel):
    """
    response example
    {
        "success": true,
        "code": 0,
        "message": {
            "description": "success"
        },
        "result": {
            "vec": [1.0, 1.0]
        }
    }
    """
    success: bool
    code: int
    message: Message
    result: VectorResult


class VectorScoreResult(BaseModel):
    score: float


class VectorScoreResponse(BaseModel):
    """
    response example
    {
        "success": true,
        "code": 0,
        "message": {
            "description": "success"
        },
        "result": {
            "score": 1.0
        }
    }
    """
    success: bool
    code: int
    message: Message
    result: VectorScoreResult

(unchanged context follows)
#****************************************************************************************/
#********************************** Error response **************************************/
#****************************************************************************************/
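Conversely, a client can validate a returned payload against these models and read fields through typed attributes. A small sketch, assuming `Message` carries the `description` field shown in the docstring examples:

```python
from paddlespeech.server.restful.response import VectorScoreResponse

payload = {
    "success": True,
    "code": 200,
    "message": {"description": "success"},
    "result": {"score": 0.45},
}
resp = VectorScoreResponse(**payload)  # pydantic parses the nested dicts
print(resp.result.score)
```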
paddlespeech/server/restful/vector_api.py (new file, +151)

The REST routes for the vector service:

# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import base64
import traceback
from typing import Union

import numpy as np
from fastapi import APIRouter

from paddlespeech.cli.log import logger
from paddlespeech.server.engine.engine_pool import get_engine_pool
from paddlespeech.server.engine.vector.python.vector_engine import PaddleVectorConnectionHandler
from paddlespeech.server.restful.request import VectorRequest
from paddlespeech.server.restful.request import VectorScoreRequest
from paddlespeech.server.restful.response import ErrorResponse
from paddlespeech.server.restful.response import VectorResponse
from paddlespeech.server.restful.response import VectorScoreResponse
from paddlespeech.server.utils.errors import ErrorCode
from paddlespeech.server.utils.errors import failed_response
from paddlespeech.server.utils.exception import ServerBaseException

router = APIRouter()


@router.get('/paddlespeech/vector/help')
def help():
    """help
    Returns:
        json: The /paddlespeech/vector api response content
    """
    response = {
        "success": "True",
        "code": 200,
        "message": {
            "global": "success"
        },
        "vector": [2.3, 3.5, 5.5, 6.2, 2.8, 1.2, 0.3, 3.6]
    }
    return response


@router.post(
    "/paddlespeech/vector", response_model=Union[VectorResponse, ErrorResponse])
def vector(request_body: VectorRequest):
    """vector api
    Args:
        request_body (VectorRequest): the vector request body
    Returns:
        json: the vector response body
    """
    try:
        # 1. get the audio data
        #    the audio must be base64 format
        audio_data = base64.b64decode(request_body.audio)

        # 2. get single engine from engine pool
        #    and we use the vector_engine to create a connection handler to process the request
        engine_pool = get_engine_pool()
        vector_engine = engine_pool['vector']
        connection_handler = PaddleVectorConnectionHandler(vector_engine)

        # 3. we use the connection handler to process the audio
        audio_vec = connection_handler.run(audio_data, request_body.task)

        # 4. we need the result of the vector instance be numpy.ndarray
        if not isinstance(audio_vec, np.ndarray):
            logger.error(
                f"the vector type is not numpy.array, that is: {type(audio_vec)}"
            )
            error_reponse = ErrorResponse()
            error_reponse.message.description = f"the vector type is not numpy.array, that is: {type(audio_vec)}"
            return error_reponse

        response = {
            "success": True,
            "code": 200,
            "message": {
                "description": "success"
            },
            "result": {
                "vec": audio_vec.tolist()
            }
        }

    except ServerBaseException as e:
        response = failed_response(e.error_code, e.msg)
    except BaseException:
        response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
        traceback.print_exc()

    return response


@router.post(
    "/paddlespeech/vector/score",
    response_model=Union[VectorScoreResponse, ErrorResponse])
def score(request_body: VectorScoreRequest):
    """vector score api
    Args:
        request_body (VectorScoreRequest): the vector score request body
    Returns:
        json: the vector score response body
    """
    try:
        # 1. get the audio data
        #    the audio must be base64 format
        enroll_data = base64.b64decode(request_body.enroll_audio)
        test_data = base64.b64decode(request_body.test_audio)

        # 2. get single engine from engine pool
        #    and we use the vector_engine to create a connection handler to process the request
        engine_pool = get_engine_pool()
        vector_engine = engine_pool['vector']
        connection_handler = PaddleVectorConnectionHandler(vector_engine)

        # 3. we use the connection handler to process the audio
        score = connection_handler.get_enroll_test_score(enroll_data, test_data)

        response = {
            "success": True,
            "code": 200,
            "message": {
                "description": "success"
            },
            "result": {
                "score": score
            }
        }

    except ServerBaseException as e:
        response = failed_response(e.error_code, e.msg)
    except BaseException:
        response = failed_response(ErrorCode.SERVER_UNKOWN_ERR)
        traceback.print_exc()

    return response
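A minimal client-side sketch for the embedding route, assuming the vector server is running on 127.0.0.1:8090; the score route works the same way, as sketched earlier under the server README:

```python
import base64
import json

import requests

with open("85236145389.wav", "rb") as f:
    audio_b64 = base64.b64encode(f.read()).decode("utf8")

payload = {
    "audio": audio_b64,
    "task": "spk",
    "audio_format": "wav",
    "sample_rate": 16000,
}
resp = requests.post(
    "http://127.0.0.1:8090/paddlespeech/vector", data=json.dumps(payload))
print(len(resp.json()["result"]["vec"]))  # embedding dimension
```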
paddlespeech/server/utils/audio_handler.py (+101, -0)

@@ -142,6 +142,7 @@ class ASRWsAudioHandler:
            return ""

        # 1. send websocket handshake protocal
+        start_time = time.time()
        async with websockets.connect(self.url) as ws:
            # 2. server has already received handshake protocal
            # client start to send the command

@@ -187,7 +188,14 @@ class ASRWsAudioHandler:
                if self.punc_server:
                    msg["result"] = self.punc_server.run(msg["result"])

+            # 6. logging the final result and comptute the statstics
+            elapsed_time = time.time() - start_time
+            audio_info = soundfile.info(wavfile_path)
            logger.info("client final receive msg={}".format(msg))
+            logger.info(
+                f"audio duration: {audio_info.duration}, elapsed time: {elapsed_time}, RTF={elapsed_time/audio_info.duration}"
+            )
            result = msg

            return result

@@ -456,3 +464,96 @@ class TTSHttpHandler:
        self.stream.stop_stream()
        self.stream.close()
        self.p.terminate()

Two new HTTP handlers are appended for the vector service:

class VectorHttpHandler:
    def __init__(self, server_ip=None, port=None):
        """The Vector client http request
        Args:
            server_ip (str, optional): the http vector server ip. Defaults to "127.0.0.1".
            port (int, optional): the http vector server port. Defaults to 8090.
        """
        super().__init__()
        self.server_ip = server_ip
        self.port = port
        if server_ip is None or port is None:
            self.url = None
        else:
            self.url = 'http://' + self.server_ip + ":" + str(
                self.port) + '/paddlespeech/vector'

    def run(self, input, audio_format, sample_rate, task="spk"):
        """Call the http vector server to process the audio
        Args:
            input (str): the audio file path
            audio_format (str): the audio format
            sample_rate (int): the audio sample rate
        Returns:
            list: the audio vector
        """
        if self.url is None:
            logger.error("No vector server, please input valid ip and port")
            return ""

        audio = wav2base64(input)
        data = {
            "audio": audio,
            "task": task,
            "audio_format": audio_format,
            "sample_rate": sample_rate,
        }

        logger.info(self.url)
        res = requests.post(url=self.url, data=json.dumps(data))

        return res.json()


class VectorScoreHttpHandler:
    def __init__(self, server_ip=None, port=None):
        """The Vector score client http request
        Args:
            server_ip (str, optional): the http vector server ip. Defaults to "127.0.0.1".
            port (int, optional): the http vector server port. Defaults to 8090.
        """
        super().__init__()
        self.server_ip = server_ip
        self.port = port
        if server_ip is None or port is None:
            self.url = None
        else:
            self.url = 'http://' + self.server_ip + ":" + str(
                self.port) + '/paddlespeech/vector/score'

    def run(self, enroll_audio, test_audio, audio_format, sample_rate):
        """Call the http vector server to score the enroll and test audio
        Args:
            enroll_audio (str): the enroll audio file path
            test_audio (str): the test audio file path
            audio_format (str): the audio format
            sample_rate (int): the audio sample rate
        Returns:
            dict: the score response
        """
        if self.url is None:
            logger.error("No vector server, please input valid ip and port")
            return ""

        enroll_audio = wav2base64(enroll_audio)
        test_audio = wav2base64(test_audio)
        data = {
            "enroll_audio": enroll_audio,
            "test_audio": test_audio,
            "task": "score",
            "audio_format": audio_format,
            "sample_rate": sample_rate,
        }

        res = requests.post(url=self.url, data=json.dumps(data))

        return res.json()
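Finally, a usage sketch of the two new handlers, again assuming a running vector server and the sample wavs from the README:

```python
from paddlespeech.server.utils.audio_handler import (VectorHttpHandler,
                                                     VectorScoreHttpHandler)

# Query the embedding endpoint for one wav file.
handler = VectorHttpHandler(server_ip="127.0.0.1", port=8090)
res = handler.run("85236145389.wav", audio_format="wav", sample_rate=16000)
print(res["result"]["vec"][:5])

# Score an enrollment wav against a test wav.
score_handler = VectorScoreHttpHandler(server_ip="127.0.0.1", port=8090)
res = score_handler.run("123456789.wav", "85236145389.wav", "wav", 16000)
print(res["result"]["score"])
```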