Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
79c064fe
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
79c064fe
编写于
2月 15, 2022
作者:
H
Hui Zhang
提交者:
GitHub
2月 15, 2022
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #1446 from lym0302/tts-server3
[server] add params type
上级
12195378
37d9dc5a
变更
6
隐藏空白更改
内联
并排
Showing
6 changed files
with
216 additions
and
82 deletions
+216
-82
speechserving/speechserving/conf/tts/tts.yaml
speechserving/speechserving/conf/tts/tts.yaml
+0
-6
speechserving/speechserving/conf/tts/tts_pd.yaml
speechserving/speechserving/conf/tts/tts_pd.yaml
+0
-6
speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
...ng/speechserving/engine/tts/paddleinference/tts_engine.py
+39
-38
speechserving/speechserving/engine/tts/python/tts_engine.py
speechserving/speechserving/engine/tts/python/tts_engine.py
+48
-31
speechserving/speechserving/utils/audio_process.py
speechserving/speechserving/utils/audio_process.py
+19
-1
speechserving/tests/tts/test_client.py
speechserving/tests/tts/test_client.py
+110
-0
未找到文件。
speechserving/speechserving/conf/tts/tts.yaml
浏览文件 @
79c064fe
# This is the parameter configuration file for TTS server.
##################################################################
# TTS SERVER SETTING #
##################################################################
host
:
'
0.0.0.0'
port
:
8692
##################################################################
# ACOUSTIC MODEL SETTING #
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
...
...
speechserving/speechserving/conf/tts/tts_pd.yaml
浏览文件 @
79c064fe
# This is the parameter configuration file for TTS server.
# These are the static models that support paddle inference.
##################################################################
# TTS SERVER SETTING #
##################################################################
host
:
'
0.0.0.0'
port
:
8692
##################################################################
# ACOUSTIC MODEL SETTING #
# am choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
...
...
speechserving/speechserving/engine/tts/paddleinference/tts_engine.py
浏览文件 @
79c064fe
...
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
base64
import
io
import
os
...
...
@@ -21,7 +20,6 @@ import librosa
import
numpy
as
np
import
paddle
import
soundfile
as
sf
import
yaml
from
engine.base_engine
import
BaseEngine
from
scipy.io
import
wavfile
...
...
@@ -32,6 +30,7 @@ from paddlespeech.cli.utils import MODEL_HOME
from
paddlespeech.t2s.frontend
import
English
from
paddlespeech.t2s.frontend.zh_frontend
import
Frontend
from
utils.audio_process
import
change_speed
from
utils.config
import
get_config
from
utils.errors
import
ErrorCode
from
utils.exception
import
ServerBaseException
from
utils.paddle_predictor
import
init_predictor
...
...
@@ -118,14 +117,7 @@ pretrained_models = {
class
TTSServerExecutor
(
TTSExecutor
):
def
__init__
(
self
):
super
().
__init__
()
self
.
parser
=
argparse
.
ArgumentParser
(
prog
=
'paddlespeech.tts'
,
add_help
=
True
)
self
.
parser
.
add_argument
(
'--conf'
,
type
=
str
,
default
=
'./conf/tts/tts_pd.yaml'
,
help
=
'Configuration parameters.'
)
pass
def
_get_pretrained_path
(
self
,
tag
:
str
)
->
os
.
PathLike
:
"""
...
...
@@ -224,7 +216,10 @@ class TTSServerExecutor(TTSExecutor):
self
.
voc_sample_rate
=
voc_sample_rate
self
.
voc_res_path
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
self
.
voc_model
))
assert
(
self
.
voc_sample_rate
==
self
.
am_sample_rate
)
assert
(
self
.
voc_sample_rate
==
self
.
am_sample_rate
),
"The sample rate of AM and Vocoder model are different, please check model."
# Init body.
with
open
(
self
.
phones_dict
,
"r"
)
as
f
:
phn_id
=
[
line
.
strip
().
split
()
for
line
in
f
.
readlines
()]
...
...
@@ -339,31 +334,31 @@ class TTSEngine(BaseEngine):
metaclass: Defaults to Singleton.
"""
def
__init__
(
self
,
name
=
None
):
def
__init__
(
self
):
"""Initialize TTS server engine
"""
super
(
TTSEngine
,
self
).
__init__
()
self
.
executor
=
TTSServerExecutor
()
config_path
=
self
.
executor
.
parser
.
parse_args
().
conf
with
open
(
config_path
,
'rt'
)
as
f
:
self
.
conf_dict
=
yaml
.
safe_load
(
f
)
def
init
(
self
,
config_file
:
str
):
self
.
executor
=
TTSServerExecutor
()
self
.
config_file
=
config_file
self
.
config
=
get_config
(
config_file
)
self
.
executor
.
_init_from_path
(
am
=
self
.
conf
_dict
[
"am"
]
,
am_model
=
self
.
conf
_dict
[
"am_model"
]
,
am_params
=
self
.
conf
_dict
[
"am_params"
]
,
am_sample_rate
=
self
.
conf
_dict
[
"am_sample_rate"
]
,
phones_dict
=
self
.
conf
_dict
[
"phones_dict"
]
,
tones_dict
=
self
.
conf
_dict
[
"tones_dict"
]
,
speaker_dict
=
self
.
conf
_dict
[
"speaker_dict"
]
,
voc
=
self
.
conf
_dict
[
"voc"
]
,
voc_model
=
self
.
conf
_dict
[
"voc_model"
]
,
voc_params
=
self
.
conf
_dict
[
"voc_params"
]
,
voc_sample_rate
=
self
.
conf
_dict
[
"voc_sample_rate"
]
,
lang
=
self
.
conf
_dict
[
"lang"
]
,
am_predictor_conf
=
self
.
conf
_dict
[
"am_predictor_conf"
]
,
voc_predictor_conf
=
self
.
conf
_dict
[
"voc_predictor_conf"
]
,
)
am
=
self
.
conf
ig
.
am
,
am_model
=
self
.
conf
ig
.
am_model
,
am_params
=
self
.
conf
ig
.
am_params
,
am_sample_rate
=
self
.
conf
ig
.
am_sample_rate
,
phones_dict
=
self
.
conf
ig
.
phones_dict
,
tones_dict
=
self
.
conf
ig
.
tones_dict
,
speaker_dict
=
self
.
conf
ig
.
speaker_dict
,
voc
=
self
.
conf
ig
.
voc
,
voc_model
=
self
.
conf
ig
.
voc_model
,
voc_params
=
self
.
conf
ig
.
voc_params
,
voc_sample_rate
=
self
.
conf
ig
.
voc_sample_rate
,
lang
=
self
.
conf
ig
.
lang
,
am_predictor_conf
=
self
.
conf
ig
.
am_predictor_conf
,
voc_predictor_conf
=
self
.
conf
ig
.
voc_predictor_conf
,
)
logger
.
info
(
"Initialize TTS server engine successfully."
)
...
...
@@ -382,6 +377,13 @@ class TTSEngine(BaseEngine):
target_fs (int): target audio sample rate
volume (float): target volume
speed (float): target speed
Raises:
ServerBaseException: Throws an exception if the change speed unsuccessfully.
Returns:
target_fs: target sample rate for synthesized audio.
wav_base64: The base64 format of the synthesized audio.
"""
# transform sample_rate
...
...
@@ -440,21 +442,20 @@ class TTSEngine(BaseEngine):
save_path (str, optional): The save path of the synthesized audio. Defaults to None.
Raises:
ServerBaseException:
Exception
ServerBaseException:
Exception
ServerBaseException:
Throws an exception if tts inference unsuccessfully.
ServerBaseException:
Throws an exception if postprocess unsuccessfully.
Returns:
lang, target_sample_rate, wav_base64
lang: model language
target_sample_rate: target sample rate for synthesized audio.
wav_base64: The base64 format of the synthesized audio.
"""
lang
=
self
.
conf
_dict
[
"lang"
]
lang
=
self
.
conf
ig
.
lang
try
:
self
.
executor
.
infer
(
text
=
sentence
,
lang
=
lang
,
am
=
self
.
conf_dict
[
"am"
],
spk_id
=
spk_id
)
text
=
sentence
,
lang
=
lang
,
am
=
self
.
config
.
am
,
spk_id
=
spk_id
)
except
:
raise
ServerBaseException
(
ErrorCode
.
SERVER_INTERNAL_ERR
,
"tts infer failed."
)
...
...
speechserving/speechserving/engine/tts/python/tts_engine.py
浏览文件 @
79c064fe
...
...
@@ -11,20 +11,19 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
base64
import
io
import
librosa
import
numpy
as
np
import
soundfile
as
sf
import
yaml
from
engine.base_engine
import
BaseEngine
from
scipy.io
import
wavfile
from
paddlespeech.cli.log
import
logger
from
paddlespeech.cli.tts.infer
import
TTSExecutor
from
utils.audio_process
import
change_speed
from
utils.config
import
get_config
from
utils.errors
import
ErrorCode
from
utils.exception
import
ServerBaseException
...
...
@@ -34,14 +33,7 @@ __all__ = ['TTSEngine']
class
TTSServerExecutor
(
TTSExecutor
):
def
__init__
(
self
):
super
().
__init__
()
self
.
parser
=
argparse
.
ArgumentParser
(
prog
=
'paddlespeech.tts'
,
add_help
=
True
)
self
.
parser
.
add_argument
(
'--conf'
,
type
=
str
,
default
=
'./conf/tts/tts.yaml'
,
help
=
'Configuration parameters.'
)
pass
class
TTSEngine
(
BaseEngine
):
...
...
@@ -55,25 +47,25 @@ class TTSEngine(BaseEngine):
"""Initialize TTS server engine
"""
super
(
TTSEngine
,
self
).
__init__
()
self
.
executor
=
TTSServerExecutor
()
config_path
=
self
.
executor
.
parser
.
parse_args
().
conf
with
open
(
config_path
,
'rt'
)
as
f
:
self
.
conf_dict
=
yaml
.
safe_load
(
f
)
def
init
(
self
,
config_file
:
str
):
self
.
executor
=
TTSServerExecutor
()
self
.
config_file
=
config_file
self
.
config
=
get_config
(
config_file
)
self
.
executor
.
_init_from_path
(
am
=
self
.
conf
_dict
[
"am"
]
,
am_config
=
self
.
conf
_dict
[
"am_config"
]
,
am_ckpt
=
self
.
conf
_dict
[
"am_ckpt"
]
,
am_stat
=
self
.
conf
_dict
[
"am_stat"
]
,
phones_dict
=
self
.
conf
_dict
[
"phones_dict"
]
,
tones_dict
=
self
.
conf
_dict
[
"tones_dict"
]
,
speaker_dict
=
self
.
conf
_dict
[
"speaker_dict"
]
,
voc
=
self
.
conf
_dict
[
"voc"
]
,
voc_config
=
self
.
conf
_dict
[
"voc_config"
]
,
voc_ckpt
=
self
.
conf
_dict
[
"voc_ckpt"
]
,
voc_stat
=
self
.
conf
_dict
[
"voc_stat"
]
,
lang
=
self
.
conf
_dict
[
"lang"
]
)
am
=
self
.
conf
ig
.
am
,
am_config
=
self
.
conf
ig
.
am_config
,
am_ckpt
=
self
.
conf
ig
.
am_ckpt
,
am_stat
=
self
.
conf
ig
.
am_stat
,
phones_dict
=
self
.
conf
ig
.
phones_dict
,
tones_dict
=
self
.
conf
ig
.
tones_dict
,
speaker_dict
=
self
.
conf
ig
.
speaker_dict
,
voc
=
self
.
conf
ig
.
voc
,
voc_config
=
self
.
conf
ig
.
voc_config
,
voc_ckpt
=
self
.
conf
ig
.
voc_ckpt
,
voc_stat
=
self
.
conf
ig
.
voc_stat
,
lang
=
self
.
conf
ig
.
lang
)
logger
.
info
(
"Initialize TTS server engine successfully."
)
...
...
@@ -92,6 +84,13 @@ class TTSEngine(BaseEngine):
target_fs (int): target audio sample rate
volume (float): target volume
speed (float): target speed
Raises:
ServerBaseException: Throws an exception if the change speed unsuccessfully.
Returns:
target_fs: target sample rate for synthesized audio.
wav_base64: The base64 format of the synthesized audio.
"""
# transform sample_rate
...
...
@@ -137,15 +136,33 @@ class TTSEngine(BaseEngine):
volume
:
float
=
1.0
,
sample_rate
:
int
=
0
,
save_path
:
str
=
None
):
""" run include inference and postprocess.
Args:
sentence (str): text to be synthesized
spk_id (int, optional): speaker id for multi-speaker speech synthesis. Defaults to 0.
speed (float, optional): speed. Defaults to 1.0.
volume (float, optional): volume. Defaults to 1.0.
sample_rate (int, optional): target sample rate for synthesized audio,
0 means the same as the model sampling rate. Defaults to 0.
save_path (str, optional): The save path of the synthesized audio.
None means do not save audio. Defaults to None.
Raises:
ServerBaseException: Throws an exception if tts inference unsuccessfully.
ServerBaseException: Throws an exception if postprocess unsuccessfully.
Returns:
lang: model language
target_sample_rate: target sample rate for synthesized audio.
wav_base64: The base64 format of the synthesized audio.
"""
lang
=
self
.
conf
_dict
[
"lang"
]
lang
=
self
.
conf
ig
.
lang
try
:
self
.
executor
.
infer
(
text
=
sentence
,
lang
=
lang
,
am
=
self
.
conf_dict
[
"am"
],
spk_id
=
spk_id
)
text
=
sentence
,
lang
=
lang
,
am
=
self
.
config
.
am
,
spk_id
=
spk_id
)
except
:
raise
ServerBaseException
(
ErrorCode
.
SERVER_INTERNAL_ERR
,
"tts infer failed."
)
...
...
speechserving/speechserving/utils/audio_process.py
浏览文件 @
79c064fe
...
...
@@ -15,8 +15,17 @@ import wave
import
numpy
as
np
from
paddlespeech.cli.log
import
logger
def
wav2pcm
(
wavfile
,
pcmfile
,
data_type
=
np
.
int16
):
""" Save the wav file as a pcm file
Args:
wavfile (str): wav file path
pcmfile (str): pcm file save path
data_type (type, optional): pcm sample type. Defaults to np.int16.
"""
with
open
(
wavfile
,
"rb"
)
as
f
:
f
.
seek
(
0
)
f
.
read
(
44
)
...
...
@@ -25,12 +34,21 @@ def wav2pcm(wavfile, pcmfile, data_type=np.int16):
def
pcm2wav
(
pcm_file
,
wav_file
,
channels
=
1
,
bits
=
16
,
sample_rate
=
16000
):
"""Save the pcm file as a wav file
Args:
pcm_file (str): pcm file path
wav_file (str): wav file save path
channels (int, optional): audio channel. Defaults to 1.
bits (int, optional): Bit depth. Defaults to 16.
sample_rate (int, optional): sample rate. Defaults to 16000.
"""
pcmf
=
open
(
pcm_file
,
'rb'
)
pcmdata
=
pcmf
.
read
()
pcmf
.
close
()
if
bits
%
8
!=
0
:
raise
ValueE
rror
(
"bits % 8 must == 0. now bits:"
+
str
(
bits
))
logger
.
e
rror
(
"bits % 8 must == 0. now bits:"
+
str
(
bits
))
wavfile
=
wave
.
open
(
wav_file
,
'wb'
)
wavfile
.
setnchannels
(
channels
)
...
...
speechserving/tests/tts/test_client.py
0 → 100644
浏览文件 @
79c064fe
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
argparse
import
base64
import
io
import
json
import
os
import
random
import
time
import
numpy
as
np
import
requests
import
soundfile
def wav2pcm(wavfile: str, pcmfile: str, data_type=np.int16):
    """Dump the sample data of a wav file into a raw pcm file.

    Args:
        wavfile (str): path of the wav file to read.
        pcmfile (str): path the raw samples are written to.
        data_type (type, optional): numpy dtype used to interpret the
            samples. Defaults to np.int16.
    """
    # The first 44 bytes are discarded before the samples are read
    # (presumably the canonical RIFF/WAVE header -- it is not validated).
    header_size = 44
    with open(wavfile, "rb") as wav_stream:
        wav_stream.read(header_size)
        samples = np.fromfile(wav_stream, dtype=data_type)
        samples.tofile(pcmfile)
# Request and response
def tts_client(args):
    """Send one synthesis request to the TTS server and save the audio.

    Args:
        args: parsed command-line namespace; the attributes read here are
            ``text``, ``spk_id``, ``speed``, ``volume``, ``sample_rate``
            and ``output``.

    Returns:
        tuple: ``(len(samples), sample_rate)`` for the audio decoded from
        the server response.
    """
    url = 'http://127.0.0.1:8090/paddlespeech/tts'
    request = {
        "text": args.text,
        "spk_id": args.spk_id,
        "speed": args.speed,
        "volume": args.volume,
        "sample_rate": args.sample_rate,
        "save_path": args.output,
    }

    response = requests.post(url, json.dumps(request))
    response_dict = response.json()
    wav_base64 = response_dict["result"]["audio"]

    # The audio arrives base64-encoded; decode it and parse it in memory.
    audio_data_byte = base64.b64decode(wav_base64)
    samples, sample_rate = soundfile.read(
        io.BytesIO(audio_data_byte), dtype='float32')

    # Write the audio in the container implied by the output file suffix.
    outfile = args.output
    if outfile.endswith(".wav"):
        soundfile.write(outfile, samples, sample_rate)
    elif outfile.endswith(".pcm"):
        # pcm output goes through an intermediate wav file that wav2pcm
        # then converts to raw samples.
        temp_wav = str(random.getrandbits(128)) + ".wav"
        try:
            soundfile.write(temp_wav, samples, sample_rate)
            wav2pcm(temp_wav, outfile, data_type=np.int16)
        finally:
            # Remove the intermediate file without spawning a shell, and
            # do so even when writing or conversion fails.
            if os.path.exists(temp_wav):
                os.remove(temp_wav)
    else:
        print("The format for saving audio only supports wav or pcm")

    return len(samples), sample_rate
if __name__ == "__main__":
    # Command-line entry point: parse the request parameters, run one
    # synthesis request and report timing statistics.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--text',
        type=str,
        default="你好,欢迎使用语音合成服务",
        help='A sentence to be synthesized')
    parser.add_argument('--spk_id', type=int, default=0, help='Speaker id')
    parser.add_argument('--speed', type=float, default=1.0, help='Audio speed')
    parser.add_argument(
        '--volume', type=float, default=1.0, help='Audio volume')
    parser.add_argument(
        '--sample_rate',
        type=int,
        default=0,
        help='Sampling rate, the default is the same as the model')
    parser.add_argument(
        '--output',
        type=str,
        default="./out.wav",
        help='Synthesized audio file')
    args = parser.parse_args()

    st = time.time()
    try:
        samples_length, sample_rate = tts_client(args)
        time_consume = time.time() - st
        duration = samples_length / sample_rate
        # RTF: processing time divided by the duration of the audio.
        rtf = time_consume / duration
        print("Synthesized audio successfully.")
        print("Inference time: %f" % (time_consume))
        print("The duration of synthesized audio: %f" % (duration))
        print("The RTF is: %f" % (rtf))
    except Exception as err:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt / SystemExit still propagate, and surface the
        # reason instead of hiding it.
        print("Failed to synthesized audio.")
        print("Error: %r" % (err, ))
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录