Commit d65b63b2 (unverified)
Authored Apr 10, 2022 by Hui Zhang; committed via GitHub on Apr 10, 2022
Parents: 664cc9ca, 1a3c811f

Merge pull request #1652 from lym0302/tts_stream

[server] add stream tts server

Showing 16 changed files with 949 additions and 21 deletions (+949 -21)
Changed files:
  paddlespeech/server/bin/paddlespeech_server.py                  +8    -2
  paddlespeech/server/conf/tts_online_application.yaml            +46   -0
  paddlespeech/server/engine/asr/online/asr_engine.py             +2    -16
  paddlespeech/server/engine/engine_factory.py                    +3    -0
  paddlespeech/server/engine/tts/online/__init__.py               +13   -0
  paddlespeech/server/engine/tts/online/tts_engine.py             +220  -0
  paddlespeech/server/restful/tts_api.py                          +12   -0
  paddlespeech/server/tests/tts/offline/http_client.py            +6    -2
  paddlespeech/server/tests/tts/online/http_client.py             +100  -0
  paddlespeech/server/tests/tts/online/http_client_playaudio.py   +112  -0
  paddlespeech/server/tests/tts/online/ws_client.py               +126  -0
  paddlespeech/server/tests/tts/online/ws_client_playaudio.py     +160  -0
  paddlespeech/server/utils/audio_process.py                      +37   -0
  paddlespeech/server/utils/util.py                               +40   -0
  paddlespeech/server/ws/api.py                                   +2    -1
  paddlespeech/server/ws/tts_socket.py                            +62   -0

paddlespeech/server/bin/paddlespeech_server.py
@@ -23,8 +23,9 @@ from ..util import cli_server_register
 from ..util import stats_wrapper
 from paddlespeech.cli.log import logger
 from paddlespeech.server.engine.engine_pool import init_engine_pool
-from paddlespeech.server.restful.api import setup_router
+from paddlespeech.server.restful.api import setup_router as setup_http_router
 from paddlespeech.server.utils.config import get_config
+from paddlespeech.server.ws.api import setup_router as setup_ws_router

 __all__ = ['ServerExecutor', 'ServerStatsExecutor']
@@ -63,7 +64,12 @@ class ServerExecutor(BaseExecutor):
         """
         # init api
         api_list = list(engine.split("_")[0] for engine in config.engine_list)
-        api_router = setup_router(api_list)
+        if config.protocol == "websocket":
+            api_router = setup_ws_router(api_list)
+        elif config.protocol == "http":
+            api_router = setup_http_router(api_list)
+        else:
+            raise Exception("unsupported protocol")
         app.include_router(api_router)

         if not init_engine_pool(config):

paddlespeech/server/conf/tts_online_application.yaml (new file, mode 100644)
# This is the parameter configuration file for PaddleSpeech Serving.

#################################################################################
#                             SERVER SETTING                                    #
#################################################################################
host: 127.0.0.1
port: 8092

# The task format in the engine_list is: <speech task>_<engine type>
# task choices = ['asr_online', 'tts_online']
# protocol = ['websocket', 'http'] (only one can be selected).
protocol: 'http'
engine_list: ['tts_online']


#################################################################################
#                               ENGINE CONFIG                                   #
#################################################################################
################################### TTS #########################################
################### speech task: tts; engine_type: online #######################
tts_online:
    # am (acoustic model) choices=['fastspeech2_csmsc']
    am: 'fastspeech2_csmsc'
    am_config:
    am_ckpt:
    am_stat:
    phones_dict:
    tones_dict:
    speaker_dict:
    spk_id: 0

    # voc (vocoder) choices=['mb_melgan_csmsc']
    voc: 'mb_melgan_csmsc'
    voc_config:
    voc_ckpt:
    voc_stat:

    # others
    lang: 'zh'
    device:          # set 'gpu:id' or 'cpu'
    am_block: 42
    am_pad: 12
    voc_block: 14
    voc_pad: 14
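
For reference, a minimal sketch (not part of the commit) of how this file is consumed. It assumes that get_config, imported in paddlespeech_server.py above, simply loads the YAML into an attribute-accessible config object, which is how config.protocol and config.engine_list are used in the changed ServerExecutor code.

# Minimal sketch; assumes get_config() returns an attribute-accessible config.
from paddlespeech.server.utils.config import get_config

config = get_config("paddlespeech/server/conf/tts_online_application.yaml")
print(config.protocol)     # 'http'         -> ServerExecutor picks setup_http_router
print(config.engine_list)  # ['tts_online'] -> EngineFactory builds the online TTSEngine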

paddlespeech/server/engine/asr/online/asr_engine.py
@@ -27,6 +27,7 @@ from paddlespeech.s2t.frontend.speech import SpeechSegment
 from paddlespeech.s2t.modules.ctc import CTCDecoder
 from paddlespeech.s2t.utils.utility import UpdateConfig
 from paddlespeech.server.engine.base_engine import BaseEngine
+from paddlespeech.server.utils.audio_process import pcm2float
 from paddlespeech.server.utils.paddle_predictor import init_predictor

 __all__ = ['ASREngine']
@@ -222,21 +223,6 @@ class ASRServerExecutor(ASRExecutor):
         else:
             raise Exception("invalid model name")

-    def _pcm16to32(self, audio):
-        """pcm int16 to float32
-
-        Args:
-            audio(numpy.array): numpy.int16
-
-        Returns:
-            audio(numpy.array): numpy.float32
-        """
-        if audio.dtype == np.int16:
-            audio = audio.astype("float32")
-            bits = np.iinfo(np.int16).bits
-            audio = audio / (2**(bits - 1))
-        return audio
-
     def extract_feat(self, samples, sample_rate):
         """extract feat
@@ -249,7 +235,7 @@ class ASRServerExecutor(ASRExecutor):
             x_chunk_lens (numpy.array): shape[B]
         """
         # pcm16 -> pcm 32
-        samples = self._pcm16to32(samples)
+        samples = pcm2float(samples)
         # read audio
         speech_segment = SpeechSegment.from_pcm(

paddlespeech/server/engine/engine_factory.py
@@ -34,6 +34,9 @@ class EngineFactory(object):
         elif engine_name == 'tts' and engine_type == 'python':
             from paddlespeech.server.engine.tts.python.tts_engine import TTSEngine
             return TTSEngine()
+        elif engine_name == 'tts' and engine_type == 'online':
+            from paddlespeech.server.engine.tts.online.tts_engine import TTSEngine
+            return TTSEngine()
         elif engine_name == 'cls' and engine_type == 'inference':
             from paddlespeech.server.engine.cls.paddleinference.cls_engine import CLSEngine
             return CLSEngine()

paddlespeech/server/engine/tts/online/__init__.py (new file, mode 100644)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

paddlespeech/server/engine/tts/online/tts_engine.py (new file, mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import base64
import time

import numpy as np
import paddle

from paddlespeech.cli.log import logger
from paddlespeech.cli.tts.infer import TTSExecutor
from paddlespeech.server.engine.base_engine import BaseEngine
from paddlespeech.server.utils.audio_process import float2pcm
from paddlespeech.server.utils.util import get_chunks

__all__ = ['TTSEngine']


class TTSServerExecutor(TTSExecutor):
    def __init__(self):
        super().__init__()
        pass

    @paddle.no_grad()
    def infer(self,
              text: str,
              lang: str='zh',
              am: str='fastspeech2_csmsc',
              spk_id: int=0,
              am_block: int=42,
              am_pad: int=12,
              voc_block: int=14,
              voc_pad: int=14, ):
        """
        Model inference and result stored in self.output.
        """
        am_name = am[:am.rindex('_')]
        am_dataset = am[am.rindex('_') + 1:]
        get_tone_ids = False
        merge_sentences = False
        frontend_st = time.time()
        if lang == 'zh':
            input_ids = self.frontend.get_input_ids(
                text,
                merge_sentences=merge_sentences,
                get_tone_ids=get_tone_ids)
            phone_ids = input_ids["phone_ids"]
            if get_tone_ids:
                tone_ids = input_ids["tone_ids"]
        elif lang == 'en':
            input_ids = self.frontend.get_input_ids(
                text, merge_sentences=merge_sentences)
            phone_ids = input_ids["phone_ids"]
        else:
            print("lang should in {'zh', 'en'}!")
        self.frontend_time = time.time() - frontend_st

        for i in range(len(phone_ids)):
            am_st = time.time()
            part_phone_ids = phone_ids[i]
            # am
            if am_name == 'speedyspeech':
                part_tone_ids = tone_ids[i]
                mel = self.am_inference(part_phone_ids, part_tone_ids)
            # fastspeech2
            else:
                # multi speaker
                if am_dataset in {"aishell3", "vctk"}:
                    mel = self.am_inference(
                        part_phone_ids, spk_id=paddle.to_tensor(spk_id))
                else:
                    mel = self.am_inference(part_phone_ids)
            am_et = time.time()

            # voc streaming
            voc_upsample = self.voc_config.n_shift
            mel_chunks = get_chunks(mel, voc_block, voc_pad, "voc")
            chunk_num = len(mel_chunks)
            voc_st = time.time()
            for i, mel_chunk in enumerate(mel_chunks):
                sub_wav = self.voc_inference(mel_chunk)
                front_pad = min(i * voc_block, voc_pad)

                if i == 0:
                    sub_wav = sub_wav[:voc_block * voc_upsample]
                elif i == chunk_num - 1:
                    sub_wav = sub_wav[front_pad * voc_upsample:]
                else:
                    sub_wav = sub_wav[front_pad * voc_upsample:(
                        front_pad + voc_block) * voc_upsample]

                yield sub_wav


class TTSEngine(BaseEngine):
    """TTS server engine

    Args:
        metaclass: Defaults to Singleton.
    """

    def __init__(self, name=None):
        """Initialize TTS server engine
        """
        super(TTSEngine, self).__init__()

    def init(self, config: dict) -> bool:
        self.executor = TTSServerExecutor()
        self.config = config
        assert "fastspeech2_csmsc" in config.am and (
            config.voc == "hifigan_csmsc-zh" or config.voc == "mb_melgan_csmsc"
        ), 'Please check config, am support: fastspeech2, voc support: hifigan_csmsc-zh or mb_melgan_csmsc.'

        try:
            if self.config.device:
                self.device = self.config.device
            else:
                self.device = paddle.get_device()
            paddle.set_device(self.device)
        except Exception as e:
            logger.error(
                "Set device failed, please check if device is already used and the parameter 'device' in the yaml file"
            )
            logger.error("Initialize TTS server engine Failed on device: %s." %
                         (self.device))
            return False

        try:
            self.executor._init_from_path(
                am=self.config.am,
                am_config=self.config.am_config,
                am_ckpt=self.config.am_ckpt,
                am_stat=self.config.am_stat,
                phones_dict=self.config.phones_dict,
                tones_dict=self.config.tones_dict,
                speaker_dict=self.config.speaker_dict,
                voc=self.config.voc,
                voc_config=self.config.voc_config,
                voc_ckpt=self.config.voc_ckpt,
                voc_stat=self.config.voc_stat,
                lang=self.config.lang)
        except Exception as e:
            logger.error("Failed to get model related files.")
            logger.error("Initialize TTS server engine Failed on device: %s." %
                         (self.device))
            return False

        self.am_block = self.config.am_block
        self.am_pad = self.config.am_pad
        self.voc_block = self.config.voc_block
        self.voc_pad = self.config.voc_pad

        logger.info("Initialize TTS server engine successfully on device: %s." %
                    (self.device))
        return True

    def preprocess(self, text_bese64: str=None, text_bytes: bytes=None):
        # Convert byte to text
        if text_bese64:
            text_bytes = base64.b64decode(text_bese64)  # base64 to bytes
        text = text_bytes.decode('utf-8')  # bytes to text

        return text

    def run(self,
            sentence: str,
            spk_id: int=0,
            speed: float=1.0,
            volume: float=1.0,
            sample_rate: int=0,
            save_path: str=None):
        """ run include inference and postprocess.

        Args:
            sentence (str): text to be synthesized
            spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
            speed (float, optional): speed. Defaults to 1.0.
            volume (float, optional): volume. Defaults to 1.0.
            sample_rate (int, optional): target sample rate for synthesized audio,
                0 means the same as the model sampling rate. Defaults to 0.
            save_path (str, optional): The save path of the synthesized audio.
                None means do not save audio. Defaults to None.

        Returns:
            wav_base64: The base64 format of the synthesized audio.
        """
        lang = self.config.lang
        wav_list = []

        for wav in self.executor.infer(
                text=sentence,
                lang=lang,
                am=self.config.am,
                spk_id=spk_id,
                am_block=self.am_block,
                am_pad=self.am_pad,
                voc_block=self.voc_block,
                voc_pad=self.voc_pad):
            # wav type: <class 'numpy.ndarray'> float32, convert to pcm (base64)
            wav = float2pcm(wav)  # float32 to int16
            wav_bytes = wav.tobytes()  # to bytes
            wav_base64 = base64.b64encode(wav_bytes).decode('utf8')  # to base64
            wav_list.append(wav)

            yield wav_base64

        wav_all = np.concatenate(wav_list, axis=0)
        logger.info("The durations of audio is: {} s".format(
            len(wav_all) / self.executor.am_config.fs))
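
The slicing in the vocoder loop above trims the pad regions so that the streamed pieces concatenate to exactly the number of samples a non-streaming pass over the same mel would produce. A standalone sketch (not part of the commit) that replaces the vocoder with a toy upsampler of factor n_shift and reuses the chunk rule of get_chunks(..., "voc") makes the bookkeeping easy to check:

# Toy check of the chunk/trim bookkeeping; assumes the vocoder output length is
# (end - start) * n_shift for a chunk covering mel frames [start, end).
import numpy as np

n_shift = 300                 # voc_upsample in the loop above
voc_block, voc_pad = 14, 14
mel_len = 100                 # mel frames for one sentence

n = int(np.ceil(mel_len / voc_block))
total = 0
for i in range(n):
    start = max(0, i * voc_block - voc_pad)           # same rule as get_chunks
    end = min((i + 1) * voc_block + voc_pad, mel_len)
    sub_wav_len = (end - start) * n_shift             # toy "vocoder" output
    front_pad = min(i * voc_block, voc_pad)
    if i == 0:
        kept = min(voc_block, end - start) * n_shift  # sub_wav[:voc_block * upsample]
    elif i == n - 1:
        kept = sub_wav_len - front_pad * n_shift      # sub_wav[front_pad * upsample:]
    else:
        kept = voc_block * n_shift                    # middle slice
    total += kept

assert total == mel_len * n_shift  # streamed pieces cover every frame exactly once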

paddlespeech/server/restful/tts_api.py
@@ -15,6 +15,7 @@ import traceback
 from typing import Union

 from fastapi import APIRouter
+from fastapi.responses import StreamingResponse

 from paddlespeech.cli.log import logger
 from paddlespeech.server.engine.engine_pool import get_engine_pool
@@ -125,3 +126,14 @@ def tts(request_body: TTSRequest):
         traceback.print_exc()

     return response
+
+
+@router.post("/paddlespeech/streaming/tts")
+async def stream_tts(request_body: TTSRequest):
+    text = request_body.text
+
+    engine_pool = get_engine_pool()
+    tts_engine = engine_pool['tts']
+    logger.info("Get tts engine successfully.")
+
+    return StreamingResponse(tts_engine.run(sentence=text))
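
The new endpoint wraps the TTSEngine.run generator in a StreamingResponse, so a client simply consumes the HTTP body chunk by chunk; each chunk is base64-encoded int16 PCM. A minimal client sketch, following the pattern of paddlespeech/server/tests/tts/online/http_client.py added in this commit (it assumes the server is running locally with the default tts_online_application.yaml, i.e. port 8092 and 24 kHz output):

# Minimal streaming client sketch for /paddlespeech/streaming/tts.
import base64
import json

import requests

url = "http://127.0.0.1:8092/paddlespeech/streaming/tts"
params = {
    "text": "您好，欢迎使用语音合成服务。",
    "spk_id": 0,
    "speed": 1.0,
    "volume": 1.0,
    "sample_rate": 0,
    "save_path": ""
}

buffer = b""
resp = requests.post(url, json.dumps(params), stream=True)
for chunk in resp.iter_content(chunk_size=1024):
    buffer += base64.b64decode(chunk)  # base64 text -> int16 PCM bytes
print("received %.2f s of audio" % (len(buffer) / 2 / 24000))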

paddlespeech/server/tests/tts/test_client.py → paddlespeech/server/tests/tts/offline/http_client.py (renamed)
@@ -33,7 +33,8 @@ def tts_client(args):
         text: A sentence to be synthesized
         outfile: Synthetic audio file
     """
-    url = 'http://127.0.0.1:8090/paddlespeech/tts'
+    url = "http://" + str(args.server) + ":" + str(
+        args.port) + "/paddlespeech/tts"
     request = {
         "text": args.text,
         "spk_id": args.spk_id,
@@ -72,7 +73,7 @@ if __name__ == "__main__":
     parser.add_argument(
         '--text',
         type=str,
-        default="你好，欢迎使用语音合成服务",
+        default="您好，欢迎使用语音合成服务。",
         help='A sentence to be synthesized')
     parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
     parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
@@ -88,6 +89,9 @@ if __name__ == "__main__":
         type=str,
         default="./out.wav",
         help='Synthesized audio file')
+    parser.add_argument(
+        "--server", type=str, help="server ip", default="127.0.0.1")
+    parser.add_argument("--port", type=int, help="server port", default=8090)
     args = parser.parse_args()

     st = time.time()

paddlespeech/server/tests/tts/online/http_client.py (new file, mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import base64
import json
import os
import time

import requests

from paddlespeech.server.utils.audio_process import pcm2wav


def save_audio(buffer, audio_path) -> bool:
    if args.save_path.endswith("pcm"):
        with open(args.save_path, "wb") as f:
            f.write(buffer)
    elif args.save_path.endswith("wav"):
        with open("./tmp.pcm", "wb") as f:
            f.write(buffer)
        pcm2wav("./tmp.pcm", audio_path, channels=1, bits=16, sample_rate=24000)
        os.system("rm ./tmp.pcm")
    else:
        print("Only supports saved audio format is pcm or wav")
        return False

    return True


def test(args):
    params = {
        "text": args.text,
        "spk_id": args.spk_id,
        "speed": args.speed,
        "volume": args.volume,
        "sample_rate": args.sample_rate,
        "save_path": ''
    }

    buffer = b''
    flag = 1
    url = "http://" + str(args.server) + ":" + str(
        args.port) + "/paddlespeech/streaming/tts"
    st = time.time()
    html = requests.post(url, json.dumps(params), stream=True)
    for chunk in html.iter_content(chunk_size=1024):
        chunk = base64.b64decode(chunk)  # bytes
        if flag:
            first_response = time.time() - st
            print(f"首包响应：{first_response} s")
            flag = 0
        buffer += chunk

    final_response = time.time() - st
    duration = len(buffer) / 2.0 / 24000

    print(f"尾包响应：{final_response} s")
    print(f"音频时长：{duration} s")
    print(f"RTF: {final_response / duration}")

    if args.save_path is not None:
        if save_audio(buffer, args.save_path):
            print("音频保存至：", args.save_path)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--text',
        type=str,
        default="您好，欢迎使用语音合成服务。",
        help='A sentence to be synthesized')
    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
    parser.add_argument(
        '--volume', type=float, default=1.0, help='Audio volume')
    parser.add_argument(
        '--sample_rate',
        type=int,
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    parser.add_argument(
        "--save_path", type=str, help="save audio path", default=None)
    args = parser.parse_args()

    test(args)

paddlespeech/server/tests/tts/online/http_client_playaudio.py (new file, mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import base64
import json
import threading
import time

import pyaudio
import requests

mutex = threading.Lock()
buffer = b''
p = pyaudio.PyAudio()
stream = p.open(
    format=p.get_format_from_width(2), channels=1, rate=24000, output=True)
max_fail = 50


def play_audio():
    global stream
    global buffer
    global max_fail
    while True:
        if not buffer:
            max_fail -= 1
            time.sleep(0.05)
            if max_fail < 0:
                break
        mutex.acquire()
        stream.write(buffer)
        buffer = b''
        mutex.release()


def test(args):
    global mutex
    global buffer
    params = {
        "text": args.text,
        "spk_id": args.spk_id,
        "speed": args.speed,
        "volume": args.volume,
        "sample_rate": args.sample_rate,
        "save_path": ''
    }

    all_bytes = 0.0
    t = threading.Thread(target=play_audio)
    flag = 1
    url = "http://" + str(args.server) + ":" + str(
        args.port) + "/paddlespeech/streaming/tts"
    st = time.time()
    html = requests.post(url, json.dumps(params), stream=True)
    for chunk in html.iter_content(chunk_size=1024):
        mutex.acquire()
        chunk = base64.b64decode(chunk)  # bytes
        buffer += chunk
        mutex.release()
        if flag:
            first_response = time.time() - st
            print(f"首包响应：{first_response} s")
            flag = 0
            t.start()
        all_bytes += len(chunk)

    final_response = time.time() - st
    duration = all_bytes / 2 / 24000

    print(f"尾包响应：{final_response} s")
    print(f"音频时长：{duration} s")
    print(f"RTF: {final_response / duration}")

    t.join()
    stream.stop_stream()
    stream.close()
    p.terminate()


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--text',
        type=str,
        default="您好，欢迎使用语音合成服务。",
        help='A sentence to be synthesized')
    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
    parser.add_argument(
        '--volume', type=float, default=1.0, help='Audio volume')
    parser.add_argument(
        '--sample_rate',
        type=int,
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    args = parser.parse_args()

    test(args)

paddlespeech/server/tests/tts/online/ws_client.py (new file, mode 100644)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import _thread as thread
import argparse
import base64
import json
import ssl
import time

import websocket

flag = 1
st = 0.0
all_bytes = b''


class WsParam(object):
    # 初始化
    def __init__(self, text, server="127.0.0.1", port=8090):
        self.server = server
        self.port = port
        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
        self.text = text

    # 生成url
    def create_url(self):
        return self.url


def on_message(ws, message):
    global flag
    global st
    global all_bytes

    try:
        message = json.loads(message)
        audio = message["audio"]
        audio = base64.b64decode(audio)  # bytes
        status = message["status"]
        all_bytes += audio

        if status == 0:
            print("create successfully.")
        elif status == 1:
            if flag:
                print(f"首包响应：{time.time() - st} s")
                flag = 0
        elif status == 2:
            final_response = time.time() - st
            duration = len(all_bytes) / 2.0 / 24000
            print(f"尾包响应：{final_response} s")
            print(f"音频时长：{duration} s")
            print(f"RTF: {final_response / duration}")
            with open("./out.pcm", "wb") as f:
                f.write(all_bytes)
            print("ws is closed")
            ws.close()
        else:
            print("infer error")

    except Exception as e:
        print("receive msg,but parse exception:", e)


# 收到websocket错误的处理
def on_error(ws, error):
    print("### error:", error)


# 收到websocket关闭的处理
def on_close(ws):
    print("### closed ###")


# 收到websocket连接建立的处理
def on_open(ws):
    def run(*args):
        global st
        text_base64 = str(
            base64.b64encode((wsParam.text).encode('utf-8')), "UTF8")
        d = {"text": text_base64}
        d = json.dumps(d)
        print("Start sending text data")
        st = time.time()
        ws.send(d)

    thread.start_new_thread(run, ())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        help="A sentence to be synthesized",
        default="您好，欢迎使用语音合成服务。")
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    args = parser.parse_args()

    print("***************************************")
    print("Server ip: ", args.server)
    print("Server port: ", args.port)
    print("Sentence to be synthesized: ", args.text)
    print("***************************************")

    wsParam = WsParam(text=args.text, server=args.server, port=args.port)

    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(
        wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

paddlespeech/server/tests/tts/online/ws_client_playaudio.py (new file, mode 100644)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import _thread as thread
import argparse
import base64
import json
import ssl
import threading
import time

import pyaudio
import websocket

mutex = threading.Lock()
buffer = b''
p = pyaudio.PyAudio()
stream = p.open(
    format=p.get_format_from_width(2), channels=1, rate=24000, output=True)
flag = 1
st = 0.0
all_bytes = 0.0


class WsParam(object):
    # 初始化
    def __init__(self, text, server="127.0.0.1", port=8090):
        self.server = server
        self.port = port
        self.url = "ws://" + self.server + ":" + str(self.port) + "/ws/tts"
        self.text = text

    # 生成url
    def create_url(self):
        return self.url


def play_audio():
    global stream
    global buffer
    while True:
        time.sleep(0.05)
        if not buffer:  # buffer 为空
            break
        mutex.acquire()
        stream.write(buffer)
        buffer = b''
        mutex.release()


t = threading.Thread(target=play_audio)


def on_message(ws, message):
    global flag
    global t
    global buffer
    global st
    global all_bytes

    try:
        message = json.loads(message)
        audio = message["audio"]
        audio = base64.b64decode(audio)  # bytes
        status = message["status"]
        all_bytes += len(audio)

        if status == 0:
            print("create successfully.")
        elif status == 1:
            mutex.acquire()
            buffer += audio
            mutex.release()
            if flag:
                print(f"首包响应：{time.time() - st} s")
                flag = 0
                print("Start playing audio")
                t.start()
        elif status == 2:
            final_response = time.time() - st
            duration = all_bytes / 2 / 24000
            print(f"尾包响应：{final_response} s")
            print(f"音频时长：{duration} s")
            print(f"RTF: {final_response / duration}")
            print("ws is closed")
            ws.close()
        else:
            print("infer error")

    except Exception as e:
        print("receive msg,but parse exception:", e)


# 收到websocket错误的处理
def on_error(ws, error):
    print("### error:", error)


# 收到websocket关闭的处理
def on_close(ws):
    print("### closed ###")


# 收到websocket连接建立的处理
def on_open(ws):
    def run(*args):
        global st
        text_base64 = str(
            base64.b64encode((wsParam.text).encode('utf-8')), "UTF8")
        d = {"text": text_base64}
        d = json.dumps(d)
        print("Start sending text data")
        st = time.time()
        ws.send(d)

    thread.start_new_thread(run, ())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--text",
        type=str,
        help="A sentence to be synthesized",
        default="您好，欢迎使用语音合成服务。")
    parser.add_argument(
        "--server", type=str, help="server ip", default="127.0.0.1")
    parser.add_argument("--port", type=int, help="server port", default=8092)
    args = parser.parse_args()

    print("***************************************")
    print("Server ip: ", args.server)
    print("Server port: ", args.port)
    print("Sentence to be synthesized: ", args.text)
    print("***************************************")

    wsParam = WsParam(text=args.text, server=args.server, port=args.port)

    websocket.enableTrace(False)
    wsUrl = wsParam.create_url()
    ws = websocket.WebSocketApp(
        wsUrl, on_message=on_message, on_error=on_error, on_close=on_close)
    ws.on_open = on_open
    ws.run_forever(sslopt={"cert_reqs": ssl.CERT_NONE})

    t.join()
    print("End of playing audio")
    stream.stop_stream()
    stream.close()
    p.terminate()

paddlespeech/server/utils/audio_process.py
@@ -103,3 +103,40 @@ def change_speed(sample_raw, speed_rate, sample_rate):
         sample_rate_in=sample_rate).squeeze(-1).astype(np.float32).copy()

     return sample_speed
+
+
+def float2pcm(sig, dtype='int16'):
+    """Convert floating point signal with a range from -1 to 1 to PCM.
+
+    Args:
+        sig (array): Input array, must have floating point type.
+        dtype (str, optional): Desired (integer) data type. Defaults to 'int16'.
+
+    Returns:
+        numpy.ndarray: Integer data, scaled and clipped to the range of the given
+    """
+    sig = np.asarray(sig)
+    if sig.dtype.kind != 'f':
+        raise TypeError("'sig' must be a float array")
+    dtype = np.dtype(dtype)
+    if dtype.kind not in 'iu':
+        raise TypeError("'dtype' must be an integer type")
+
+    i = np.iinfo(dtype)
+    abs_max = 2**(i.bits - 1)
+    offset = i.min + abs_max
+    return (sig * abs_max + offset).clip(i.min, i.max).astype(dtype)
+
+
+def pcm2float(data):
+    """pcm int16 to float32
+
+    Args:
+        audio(numpy.array): numpy.int16
+
+    Returns:
+        audio(numpy.array): numpy.float32
+    """
+    if data.dtype == np.int16:
+        data = data.astype("float32")
+        bits = np.iinfo(np.int16).bits
+        data = data / (2**(bits - 1))
+    return data
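
A quick round-trip sketch of the two helpers (not part of the commit): float2pcm scales float32 values in [-1, 1) to int16, and pcm2float maps them back by dividing by 2**15, so values that do not hit the clipping boundary survive exactly.

import numpy as np

from paddlespeech.server.utils.audio_process import float2pcm, pcm2float

wav_f32 = np.array([-1.0, -0.5, 0.0, 0.5], dtype=np.float32)
pcm = float2pcm(wav_f32)               # int16: [-32768, -16384, 0, 16384]
back = pcm2float(pcm)                  # float32: [-1.0, -0.5, 0.0, 0.5]
print(pcm.dtype, back.dtype)           # int16 float32
print(np.array_equal(back, wav_f32))   # True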

paddlespeech/server/utils/util.py
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the
 import base64
+import math


 def wav2base64(wav_file: str):
@@ -31,3 +32,42 @@ def self_check():
     """ self check resource
     """
     return True
+
+
+def denorm(data, mean, std):
+    """stream am model need to denorm
+    """
+    return data * std + mean
+
+
+def get_chunks(data, block_size, pad_size, step):
+    """Divide data into multiple chunks
+
+    Args:
+        data (tensor): data
+        block_size (int): [description]
+        pad_size (int): [description]
+        step (str): set "am" or "voc", generate chunk for step am or vocoder(voc)
+
+    Returns:
+        list: chunks list
+    """
+    if step == "am":
+        data_len = data.shape[1]
+    elif step == "voc":
+        data_len = data.shape[0]
+    else:
+        print("Please set correct type to get chunks, am or voc")
+
+    chunks = []
+    n = math.ceil(data_len / block_size)
+    for i in range(n):
+        start = max(0, i * block_size - pad_size)
+        end = min((i + 1) * block_size + pad_size, data_len)
+        if step == "am":
+            chunks.append(data[:, start:end, :])
+        elif step == "voc":
+            chunks.append(data[start:end, :])
+        else:
+            print("Please set correct type to get chunks, am or voc")
+
+    return chunks
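
A short worked example of the boundaries get_chunks produces for the vocoder path (not part of the commit); a numpy array is sufficient here because only .shape and slicing are used:

import numpy as np

from paddlespeech.server.utils.util import get_chunks

mel = np.zeros((10, 80))          # 10 mel frames, 80 bins
chunks = get_chunks(mel, block_size=4, pad_size=2, step="voc")
print([c.shape[0] for c in chunks])
# [6, 8, 4]: each block of 4 frames is extended by up to pad_size=2 frames of
# context on each side; the padded context is trimmed again in
# TTSServerExecutor.infer before the sub-waveforms are streamed out.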

paddlespeech/server/ws/api.py
@@ -16,6 +16,7 @@ from typing import List
 from fastapi import APIRouter

 from paddlespeech.server.ws.asr_socket import router as asr_router
+from paddlespeech.server.ws.tts_socket import router as tts_router

 _router = APIRouter()
@@ -31,7 +32,7 @@ def setup_router(api_list: List):
         if api_name == 'asr':
             _router.include_router(asr_router)
         elif api_name == 'tts':
-            pass
+            _router.include_router(tts_router)
         else:
             pass

paddlespeech/server/ws/tts_socket.py (new file, mode 100644)
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json

from fastapi import APIRouter
from fastapi import WebSocket
from fastapi import WebSocketDisconnect
from starlette.websockets import WebSocketState as WebSocketState

from paddlespeech.cli.log import logger
from paddlespeech.server.engine.engine_pool import get_engine_pool

router = APIRouter()


@router.websocket('/ws/tts')
async def websocket_endpoint(websocket: WebSocket):
    await websocket.accept()

    try:
        # careful here, changed the source code from starlette.websockets
        assert websocket.application_state == WebSocketState.CONNECTED
        message = await websocket.receive()
        websocket._raise_on_disconnect(message)

        # get engine
        engine_pool = get_engine_pool()
        tts_engine = engine_pool['tts']

        # 获取 message 并转文本
        message = json.loads(message["text"])
        text_bese64 = message["text"]
        sentence = tts_engine.preprocess(text_bese64=text_bese64)

        # run
        wav_generator = tts_engine.run(sentence)

        while True:
            try:
                tts_results = next(wav_generator)
                resp = {"status": 1, "audio": tts_results}
                await websocket.send_json(resp)
                logger.info("streaming audio...")
            except StopIteration as e:
                resp = {"status": 2, "audio": ''}
                await websocket.send_json(resp)
                logger.info("Complete the transmission of audio streams")
                break

    except WebSocketDisconnect:
        pass