change sr, test=doc

5b06b76e · lym0302 · a0d1888c · 5b06b76e · 5b06b76e · 5b06b76e
6 changed files
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@@ -192,23 +192,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
        self.parser.add_argument(
            '--spk_id', type=int, default=0, help='Speaker id')
        self.parser.add_argument(
-            '--speed',
-            type=float,
-            default=1.0,
-            help='Audio speed, the value should be set between 0 and 3')
-        self.parser.add_argument(
-            '--volume',
-            type=float,
-            default=1.0,
-            help='Audio volume, the value should be set between 0 and 3')
-        self.parser.add_argument(
-            '--sample_rate',
-            type=int,
-            default=0,
-            choices=[0, 8000, 16000],
-            help='Sampling rate, the default is the same as the model')
-        self.parser.add_argument(
-            '--output', type=str, default=None, help='Synthesized audio file')
+            '--output', type=str, default=None, help='Client saves synthesized audio')
        self.parser.add_argument(
            "--play", type=bool, help="whether to play audio", default=False)

@@ -219,9 +203,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
        port = args.port
        protocol = args.protocol
        spk_id = args.spk_id
-        speed = args.speed
-        volume = args.volume
-        sample_rate = args.sample_rate
        output = args.output
        play = args.play

@@ -232,9 +213,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
                port=port,
                protocol=protocol,
                spk_id=spk_id,
-                speed=speed,
-                volume=volume,
-                sample_rate=sample_rate,
                output=output,
                play=play)
            return True
@@ -250,9 +228,6 @@ class TTSOnlineClientExecutor(BaseExecutor):
                 port: int=8092,
                 protocol: str="http",
                 spk_id: int=0,
-                 speed: float=1.0,
-                 volume: float=1.0,
-                 sample_rate: int=0,
                 output: str=None,
                 play: bool=False):
        """
@@ -264,7 +239,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
            from paddlespeech.server.utils.audio_handler import TTSHttpHandler
            handler = TTSHttpHandler(server_ip, port, play)
            first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = handler.run(
-                input, spk_id, speed, volume, sample_rate, output)
+                input, spk_id, output)
            delay_time_list = compute_delay(receive_time_list,
                                            chunk_duration_list)

@@ -274,7 +249,7 @@ class TTSOnlineClientExecutor(BaseExecutor):
            handler = TTSWsHandler(server_ip, port, play)
            loop = asyncio.get_event_loop()
            first_response, final_response, duration, save_audio_success, receive_time_list, chunk_duration_list = loop.run_until_complete(
-                handler.run(input, output))
+                handler.run(input, spk_id, output))
            delay_time_list = compute_delay(receive_time_list,
                                            chunk_duration_list)


--- a/paddlespeech/server/engine/engine_factory.py
+++ b/paddlespeech/server/engine/engine_factory.py
@@ -13,7 +13,7 @@
 # limitations under the License.
 from typing import Text

-from ..utils.log import logger
+from paddlespeech.cli.log import logger

 __all__ = ['EngineFactory']


--- a/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/onnx/tts_engine.py
@@ -19,6 +19,8 @@ from typing import Optional

 import numpy as np
 import paddle
+import librosa
+from scipy import signal

 from paddlespeech.cli.log import logger
 from paddlespeech.cli.tts.infer import TTSExecutor
@@ -30,6 +32,8 @@ from paddlespeech.server.utils.util import denorm
 from paddlespeech.server.utils.util import get_chunks
 from paddlespeech.t2s.frontend import English
 from paddlespeech.t2s.frontend.zh_frontend import Frontend
+from paddlespeech.server.utils.audio_process import change_speed
+from paddlespeech.server.utils.exception import ServerBaseException

 __all__ = ['TTSEngine', 'PaddleTTSConnectionHandler']

@@ -64,6 +68,8 @@ class TTSServerExecutor(TTSExecutor):
                 self, 'am_postnet_sess'))) and hasattr(self, 'voc_inference'):
            logger.info('Models had been initialized.')
            return
+
+        
        # am
        am_tag = am + '-' + lang
        if am == "fastspeech2_csmsc_onnx":
@@ -213,6 +219,8 @@ class TTSEngine(BaseEngine):
            self.config.voc_sample_rate == self.config.am_sample_rate
        ), "The sample rate of AM and Vocoder model are different, please check model."

+        self.sample_rate = self.config.voc_sample_rate
+
        try:
            if self.config.am_sess_conf.device is not None:
                self.device = self.config.am_sess_conf.device
@@ -441,33 +449,16 @@ class PaddleTTSConnectionHandler:

        self.final_response_time = time.time() - frontend_st

-    def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
-        # Convert byte to text
-        if text_bese64:
-            text_bytes = base64.b64decode(text_bese64)  # base64 to bytes
-        text = text_bytes.decode('utf-8')  # bytes to text
-
-        return text

    def run(self,
            sentence: str,
-            spk_id: int=0,
-            speed: float=1.0,
-            volume: float=1.0,
-            sample_rate: int=0,
-            save_path: str=None):
+            spk_id: int=0):
        """ run include inference and postprocess.

        Args:
            sentence (str): text to be synthesized
            spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
-            speed (float, optional): speed. Defaults to 1.0.
-            volume (float, optional): volume. Defaults to 1.0.
-            sample_rate (int, optional): target sample rate for synthesized audio, 
-            0 means the same as the model sampling rate. Defaults to 0.
-            save_path (str, optional): The save path of the synthesized audio. 
-            None means do not save audio. Defaults to None.
-
+            
        Returns:
            wav_base64: The base64 format of the synthesized audio.
        """
@@ -488,7 +479,7 @@ class PaddleTTSConnectionHandler:
            yield wav_base64

        wav_all = np.concatenate(wav_list, axis=0)
-        duration = len(wav_all) / self.config.voc_sample_rate
+        duration = len(wav_all) / self.tts_engine.sample_rate
        logger.info(f"sentence: {sentence}")
        logger.info(f"The durations of audio is: {duration} s")
        logger.info(f"first response time: {self.first_response_time} s")
@@ -496,4 +487,4 @@ class PaddleTTSConnectionHandler:
        logger.info(f"RTF: {self.final_response_time / duration}")
        logger.info(
            f"Other info: front time: {self.frontend_time} s, first am infer time: {self.first_am_infer} s, first voc infer time: {self.first_voc_infer} s,"
-        )
+        )
\ No newline at end of file
--- a/paddlespeech/server/engine/tts/online/python/tts_engine.py
+++ b/paddlespeech/server/engine/tts/online/python/tts_engine.py
@@ -276,6 +276,13 @@ class TTSEngine(BaseEngine):
            logger.error(e)
            return False

+
+        assert (
+            self.executor.am_config.fs == self.executor.voc_config.fs
+        ), "The sample rate of AM and Vocoder model are different, please check model."
+
+        self.sample_rate = self.executor.am_config.fs
+
        self.am_block = self.config.am_block
        self.am_pad = self.config.am_pad
        self.voc_block = self.config.voc_block
@@ -458,33 +465,16 @@ class PaddleTTSConnectionHandler:
                )

        self.final_response_time = time.time() - frontend_st
-
-    def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
-        # Convert byte to text
-        if text_bese64:
-            text_bytes = base64.b64decode(text_bese64)  # base64 to bytes
-        text = text_bytes.decode('utf-8')  # bytes to text
-
-        return text
+        

    def run(self,
            sentence: str,
-            spk_id: int=0,
-            speed: float=1.0,
-            volume: float=1.0,
-            sample_rate: int=0,
-            save_path: str=None):
+            spk_id: int=0,):
        """ run include inference and postprocess.

        Args:
            sentence (str): text to be synthesized
            spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
-            speed (float, optional): speed. Defaults to 1.0.
-            volume (float, optional): volume. Defaults to 1.0.
-            sample_rate (int, optional): target sample rate for synthesized audio, 
-            0 means the same as the model sampling rate. Defaults to 0.
-            save_path (str, optional): The save path of the synthesized audio. 
-            None means do not save audio. Defaults to None.

        Returns:
            wav_base64: The base64 format of the synthesized audio.
@@ -507,7 +497,7 @@ class PaddleTTSConnectionHandler:
            yield wav_base64

        wav_all = np.concatenate(wav_list, axis=0)
-        duration = len(wav_all) / self.executor.am_config.fs
+        duration = len(wav_all) / self.tts_engine.sample_rate

        logger.info(f"sentence: {sentence}")
        logger.info(f"The durations of audio is: {duration} s")

--- a/paddlespeech/server/utils/audio_handler.py
+++ b/paddlespeech/server/utils/audio_handler.py
@@ -266,6 +266,12 @@ class TTSWsHandler:
        self.url = "ws://" + self.server + ":" + str(
            self.port) + "/paddlespeech/tts/streaming"
        self.play = play
+
+        # get model sample rate
+        self.url_get_sr = "http://" + str(self.server) + ":" + str(
+            self.port) + "/paddlespeech/tts/streaming/samplerate"
+        self.sample_rate = requests.get(self.url_get_sr).json()["sample_rate"]
+
        if self.play:
            import pyaudio
            self.buffer = b''
@@ -273,7 +279,7 @@ class TTSWsHandler:
            self.stream = self.p.open(
                format=self.p.get_format_from_width(2),
                channels=1,
-                rate=24000,
+                rate=self.sample_rate,
                output=True)
            self.mutex = threading.Lock()
            self.start_play = True
@@ -293,12 +299,16 @@ class TTSWsHandler:
            self.buffer = b''
            self.mutex.release()

-    async def run(self, text: str, output: str=None):
+    async def run(self, 
+                  text: str,
+                  spk_id=0,
+                  output: str=None):
        """Send a text to online server

        Args:
            text (str): sentence to be synthesized
-            output (str): save audio path
+            spk_id (int, optional): speaker id. Defaults to 0.
+            output (str, optional): client save audio path. Defaults to None.
        """
        all_bytes = b''
        receive_time_list = []
@@ -315,8 +325,13 @@ class TTSWsHandler:
            session = msg["session"]

            # 3. send speech synthesis request 
-            text_base64 = str(base64.b64encode((text).encode('utf-8')), "UTF8")
-            request = json.dumps({"text": text_base64})
+            #text_base64 = str(base64.b64encode((text).encode('utf-8')), "UTF8")
+            params = {
+                "text": text,
+                "spk_id": spk_id,
+            }
+
+            request = json.dumps(params)
            st = time.time()
            await ws.send(request)
            logging.info("send a message to the server")
@@ -341,10 +356,10 @@ class TTSWsHandler:
                # Rerutn last packet normally, no audio information
                elif status == 2:
                    final_response = time.time() - st
-                    duration = len(all_bytes) / 2.0 / 24000
+                    duration = len(all_bytes) / 2.0 / self.sample_rate

                    if output is not None:
-                        save_audio_success = save_audio(all_bytes, output)
+                        save_audio_success = save_audio(all_bytes, output, self.sample_rate)
                    else:
                        save_audio_success = False

@@ -362,7 +377,7 @@ class TTSWsHandler:
                    receive_time_list.append(time.time())
                    audio = message["audio"]
                    audio = base64.b64decode(audio)  # bytes
-                    chunk_duration_list.append(len(audio) / 2.0 / 24000)
+                    chunk_duration_list.append(len(audio) / 2.0 / self.sample_rate)
                    all_bytes += audio
                    if self.play:
                        self.mutex.acquire()
@@ -403,19 +418,26 @@ class TTSHttpHandler:
            self.port) + "/paddlespeech/tts/streaming"
        self.play = play

+        # get model sample rate
+        self.url_get_sr = "http://" + str(self.server) + ":" + str(
+            self.port) + "/paddlespeech/tts/streaming/samplerate"
+        self.sample_rate = requests.get(self.url_get_sr).json()["sample_rate"]
+
        if self.play:
            import pyaudio
            self.buffer = b''
            self.p = pyaudio.PyAudio()
+            self.start_play = True
+            self.max_fail = 50
+
            self.stream = self.p.open(
                format=self.p.get_format_from_width(2),
                channels=1,
-                rate=24000,
+                rate=self.sample_rate,
                output=True)
            self.mutex = threading.Lock()
-            self.start_play = True
            self.t = threading.Thread(target=self.play_audio)
-            self.max_fail = 50
+            
        logger.info(f"endpoint: {self.url}")

    def play_audio(self):
@@ -433,28 +455,19 @@ class TTSHttpHandler:
    def run(self,
            text: str,
            spk_id=0,
-            speed=1.0,
-            volume=1.0,
-            sample_rate=0,
            output: str=None):
        """Send a text to tts online server

        Args:
            text (str): sentence to be synthesized.
            spk_id (int, optional): speaker id. Defaults to 0.
-            speed (float, optional): audio speed. Defaults to 1.0.
-            volume (float, optional): audio volume. Defaults to 1.0.
-            sample_rate (int, optional): audio sample rate, 0 means the same as model. Defaults to 0.
-            output (str, optional): save audio path. Defaults to None.
+            output (str, optional): client save audio path. Defaults to None.
        """
+    
        # 1. Create request
        params = {
            "text": text,
            "spk_id": spk_id,
-            "speed": speed,
-            "volume": volume,
-            "sample_rate": sample_rate,
-            "save_path": output
        }

        all_bytes = b''
@@ -482,14 +495,14 @@ class TTSHttpHandler:
                    self.t.start()
                    self.start_play = False
            all_bytes += audio
-            chunk_duration_list.append(len(audio) / 2.0 / 24000)
+            chunk_duration_list.append(len(audio) / 2.0 / self.sample_rate)

        final_response = time.time() - st
-        duration = len(all_bytes) / 2.0 / 24000
+        duration = len(all_bytes) / 2.0 / self.sample_rate
        html.close()  # when stream=True

        if output is not None:
-            save_audio_success = save_audio(all_bytes, output)
+            save_audio_success = save_audio(all_bytes, output, self.sample_rate)
        else:
            save_audio_success = False


--- a/paddlespeech/server/utils/onnx_infer.py
+++ b/paddlespeech/server/utils/onnx_infer.py
@@ -16,7 +16,7 @@ from typing import Optional

 import onnxruntime as ort

-from .log import logger
+from paddlespeech.cli.log import logger


 def get_sess(model_path: Optional[os.PathLike]=None, sess_conf: dict=None):