Merge pull request #1761 from Honei/develop

[asr][weboscket]fix the streaming asr server bug, server client

Merge pull request #1761 from Honei/develop
[asr][weboscket]fix the streaming asr server bug, server client
f8cb0c8e · Hui Zhang · GitHub · 20dcd671 · 9760d70b · f8cb0c8e
6 changed file
--- a/demos/README.md
+++ b/demos/README.md
@@ -11,6 +11,7 @@ The directory containes many speech applications in multi scenarios.
 * punctuation_restoration - restore punctuation from raw text
 * speech recogintion - recognize text of an audio file 
 * speech server - Server for Speech Task, e.g. ASR,TTS,CLS
+* streaming asr server - receive audio stream from websocket, and recognize to transcript.
 * speech translation - end to end speech translation  
 * story talker - book reader based on OCR and TTS  
 * style_fs2 - multi style control for FastSpeech2 model  

--- a/demos/README_cn.md
+++ b/demos/README_cn.md
@@ -11,6 +11,7 @@
 * 标点恢复 - 通常作为语音识别的文本后处理任务，为一段无标点的纯文本添加相应的标点符号。
 * 语音识别 - 识别一段音频中包含的语音文字。
 * 语音服务 - 离线语音服务，包括ASR、TTS、CLS等
+* 流式语音识别服务 - 流式输入语音数据流识别音频中的文字
 * 语音翻译 - 实时识别音频中的语言，并同时翻译成目标语言。
 * 会说话的故事书 - 基于 OCR 和语音合成的会说话的故事书。
 * 个性化语音合成 - 基于 FastSpeech2 模型的个性化语音合成。 

--- a/demos/streaming_asr_server/README.md
+++ b/demos/streaming_asr_server/README.md
@@ -40,8 +40,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
  paddlespeech_server start --help
  ```
  Arguments:
-  - `config_file`: yaml file of the app, defalut: ./conf/ws_conformer_application.yaml
+  - `config_file`: yaml file of the app, defalut: `./conf/application.yaml`
-  - `log_file`: log file. Default: ./log/paddlespeech.log
+  - `log_file`: log file. Default: `./log/paddlespeech.log`
  Output:
  ```bash

--- a/demos/streaming_asr_server/README_cn.md
+++ b/demos/streaming_asr_server/README_cn.md
@@ -40,8 +40,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
  paddlespeech_server start --help
  ```
  参数:
-  - `config_file`: 服务的配置文件，默认： ./conf/ws_conformer_application.yaml
+  - `config_file`: 服务的配置文件，默认： `./conf/application.yaml`
-  - `log_file`: log 文件. 默认：./log/paddlespeech.log
+  - `log_file`: log 文件. 默认：`./log/paddlespeech.log`
  输出:
  ```bash

--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+import logging
+import numpy as np
+import soundfile
+import websockets
+from paddlespeech.cli.log import logger
+class ASRAudioHandler:
+    def __init__(self, url="127.0.0.1", port=8090):
+        """PaddleSpeech Online ASR Server Client  audio handler
+           Online asr server use the websocket protocal
+        Args:
+            url (str, optional): the server ip. Defaults to "127.0.0.1".
+            port (int, optional): the server port. Defaults to 8090.
+        """
+        self.url = url
+        self.port = port
+        self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
+    def read_wave(self, wavfile_path: str):
+        """read the audio file from specific wavfile path
+        Args:
+            wavfile_path (str): the audio wavfile, 
+                                 we assume that audio sample rate matches the model
+        Yields:
+            numpy.array: the samall package audio pcm data
+        """
+        samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
+        x_len = len(samples)
+        chunk_size = 85 * 16  #80ms, sample_rate = 16kHz
+        if x_len % chunk_size != 0:
+            padding_len_x = chunk_size - x_len % chunk_size
+        else:
+            padding_len_x = 0
+        padding = np.zeros((padding_len_x), dtype=samples.dtype)
+        padded_x = np.concatenate([samples, padding], axis=0)
+        assert (x_len + padding_len_x) % chunk_size == 0
+        num_chunk = (x_len + padding_len_x) / chunk_size
+        num_chunk = int(num_chunk)
+        for i in range(0, num_chunk):
+            start = i * chunk_size
+            end = start + chunk_size
+            x_chunk = padded_x[start:end]
+            yield x_chunk
+    async def run(self, wavfile_path: str):
+        """Send a audio file to online server
+        Args:
+            wavfile_path (str): audio path
+        Returns:
+            str: the final asr result
+        """
+        logging.info("send a message to the server")
+        # 1. send websocket handshake protocal
+        async with websockets.connect(self.url) as ws:
+            # 2. server has already received handshake protocal
+            # client start to send the command
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "start",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
+            await ws.send(audio_info)
+            msg = await ws.recv()
+            logger.info("receive msg={}".format(msg))
+            # 3. send chunk audio data to engine
+            for chunk_data in self.read_wave(wavfile_path):
+                await ws.send(chunk_data.tobytes())
+                msg = await ws.recv()
+                msg = json.loads(msg)
+                logger.info("receive msg={}".format(msg))
+            # 4. we must send finished signal to the server
+            audio_info = json.dumps(
+                {
+                    "name": "test.wav",
+                    "signal": "end",
+                    "nbest": 5
+                },
+                sort_keys=True,
+                indent=4,
+                separators=(',', ': '))
+            await ws.send(audio_info)
+            msg = await ws.recv()
+            # 5. decode the bytes to str
+            msg = json.loads(msg)
+            logger.info("final receive msg={}".format(msg))
+            result = msg
+            return result
--- a/paddlespeech/server/ws/asr_socket.py
+++ b/paddlespeech/server/ws/asr_socket.py
@@ -20,50 +20,52 @@ from starlette.websockets import WebSocketState as WebSocketState
 from paddlespeech.server.engine.asr.online.asr_engine import PaddleASRConnectionHanddler
 from paddlespeech.server.engine.engine_pool import get_engine_pool
-from paddlespeech.server.utils.buffer import ChunkBuffer
-from paddlespeech.server.utils.vad import VADAudio
 router = APIRouter()
 @router.websocket('/ws/asr')
 async def websocket_endpoint(websocket: WebSocket):
+    """PaddleSpeech Online ASR Server api
+    Args:
+        websocket (WebSocket): the websocket instance
+    """
+    #1. the interface wait to accept the websocket protocal header
+    #   and only we receive the header, it establish the connection with specific thread
    await websocket.accept()
+    #2. if we accept the websocket headers, we will get the online asr engine instance
    engine_pool = get_engine_pool()
    asr_engine = engine_pool['asr']
-    connection_handler = None
-    # init buffer
-    # each websocekt connection has its own chunk buffer
-    chunk_buffer_conf = asr_engine.config.chunk_buffer_conf
-    chunk_buffer = ChunkBuffer(
-        window_n=chunk_buffer_conf.window_n,
-        shift_n=chunk_buffer_conf.shift_n,
-        window_ms=chunk_buffer_conf.window_ms,
-        shift_ms=chunk_buffer_conf.shift_ms,
-        sample_rate=chunk_buffer_conf.sample_rate,
-        sample_width=chunk_buffer_conf.sample_width)
-    # init vad
+    #3. each websocket connection, we will create an PaddleASRConnectionHanddler to process such audio
-    vad_conf = asr_engine.config.get('vad_conf', None)
+    #   and each connection has its own connection instance to process the request
-    if vad_conf:
+    #   and only if client send the start signal, we create the PaddleASRConnectionHanddler instance
-        vad = VADAudio(
+    connection_handler = None
-            aggressiveness=vad_conf['aggressiveness'],
-            rate=vad_conf['sample_rate'],
-            frame_duration_ms=vad_conf['frame_duration_ms'])
    try:
+        #4. we do a loop to process the audio package by package according the protocal
+        #   and only if the client send finished signal, we will break the loop
        while True:
            # careful here, changed the source code from starlette.websockets
+            # 4.1 we wait for the client signal for the specific action
            assert websocket.application_state == WebSocketState.CONNECTED
            message = await websocket.receive()
            websocket._raise_on_disconnect(message)
+            #4.2 text for the action command and bytes for pcm data
            if "text" in message:
+                # we first parse the specific command
                message = json.loads(message["text"])
                if 'signal' not in message:
                    resp = {"status": "ok", "message": "no valid json data"}
                    await websocket.send_json(resp)
+                # start command, we create the PaddleASRConnectionHanddler instance to process the audio data
+                # end command, we process the all the last audio pcm and return the final result
+                #              and we break the loop
                if message['signal'] == 'start':
                    resp = {"status": "ok", "signal": "server_ready"}
                    # do something at begining here
@@ -72,6 +74,7 @@ async def websocket_endpoint(websocket: WebSocket):
                    await websocket.send_json(resp)
                elif message['signal'] == 'end':
                    # reset single  engine for an new connection
+                    # and we will destroy the connection
                    connection_handler.decode(is_finished=True)
                    connection_handler.rescoring()
                    asr_results = connection_handler.get_result()
@@ -88,12 +91,17 @@ async def websocket_endpoint(websocket: WebSocket):
                    resp = {"status": "ok", "message": "no valid json data"}
                    await websocket.send_json(resp)
            elif "bytes" in message:
+                # bytes for the pcm data
                message = message["bytes"]
+                # we extract the remained audio pcm 
+                # and decode for the result in this package data
                connection_handler.extract_feat(message)
                connection_handler.decode(is_finished=False)
                asr_results = connection_handler.get_result()
+                # return the current period result
+                # if the engine create the vad instance, this connection will have many period results 
                resp = {'asr_results': asr_results}
                await websocket.send_json(resp)
    except WebSocketDisconnect: