Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
7ebe904e
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
7ebe904e
编写于
2月 22, 2022
作者:
W
WilliamZhang06
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fixed overload , test=doc
上级
b8f16ac9
变更
4
隐藏空白更改
内联
并排
Showing 4 changed files with 3 additions and 195 deletions
+3
-195
paddlespeech/server/conf/asr/asr.yaml
paddlespeech/server/conf/asr/asr.yaml
+1
-1
paddlespeech/server/conf/asr/asr_pd.yaml
paddlespeech/server/conf/asr/asr_pd.yaml
+2
-2
paddlespeech/server/engine/asr/paddleinference/asr_engine.py
paddlespeech/server/engine/asr/paddleinference/asr_engine.py
+0
-97
paddlespeech/server/engine/asr/python/asr_engine.py
paddlespeech/server/engine/asr/python/asr_engine.py
+0
-95
未找到文件。
paddlespeech/server/conf/asr/asr.yaml
浏览文件 @
7ebe904e
...
@@ -4,4 +4,4 @@ sample_rate: 16000
...
@@ -4,4 +4,4 @@ sample_rate: 16000
cfg_path
:
# [optional]
cfg_path
:
# [optional]
ckpt_path
:
# [optional]
ckpt_path
:
# [optional]
decode_method
:
'
attention_rescoring'
decode_method
:
'
attention_rescoring'
force_yes
:
False
force_yes
:
True
paddlespeech/server/conf/asr/asr_pd.yaml
浏览文件 @
7ebe904e
...
@@ -6,13 +6,13 @@
...
@@ -6,13 +6,13 @@
# am choices=['deepspeech2offline_aishell'] TODO
# am choices=['deepspeech2offline_aishell'] TODO
##################################################################
##################################################################
model_type
:
'
deepspeech2offline_aishell'
model_type
:
'
deepspeech2offline_aishell'
am_model
:
# the pdmodel file of am static model [optional]
am_model
:
# the pdmodel file of am static model [optional]
am_params
:
# the pdiparams file of am static model [optional]
am_params
:
# the pdiparams file of am static model [optional]
lang
:
'
zh'
lang
:
'
zh'
sample_rate
:
16000
sample_rate
:
16000
cfg_path
:
cfg_path
:
decode_method
:
decode_method
:
force_yes
:
force_yes
:
True
am_predictor_conf
:
am_predictor_conf
:
use_gpu
:
True
use_gpu
:
True
...
...
paddlespeech/server/engine/asr/paddleinference/asr_engine.py
浏览文件 @
7ebe904e
...
@@ -60,108 +60,11 @@ pretrained_models = {
...
@@ -60,108 +60,11 @@ pretrained_models = {
}
}
class
ASRServerExecutor
(
ASRExecutor
):
class
ASRServerExecutor
(
ASRExecutor
):
def
__init__
(
self
):
def
__init__
(
self
):
super
().
__init__
()
super
().
__init__
()
pass
pass
def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
    """Validate the requested sample rate and probe the audio file.

    Stores ``sample_rate`` on ``self.sample_rate`` and sets
    ``self.change_format`` to record whether the file's own sample rate
    differs from the requested one.

    Args:
        audio_file: Audio file handed to ``soundfile.read``.
        sample_rate: Requested sample rate; only 8000 and 16000 are accepted.
        force_yes: Not used by this method; kept so the signature stays
            compatible with existing callers.

    Returns:
        ``False`` when the sample rate is unsupported or the file cannot be
        read, ``True`` otherwise.
    """
    self.sample_rate = sample_rate
    if self.sample_rate not in (8000, 16000):
        logger.error("please input --sr 8000 or --sr 16000")
        return False

    logger.info("checking the audio file format......")
    try:
        # Only the file's sample rate is needed here; the samples are dropped.
        _, audio_sample_rate = soundfile.read(
            audio_file, dtype="int16", always_2d=True)
    except Exception as e:
        logger.exception(e)
        logger.error(
            "can not open the audio file, please check the audio file format is 'wav'. \n \
                 you can try to use sox to change the file format.\n \
                 For example: \n \
                 sample rate: 16k \n \
                 sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \
                 sample rate: 8k \n \
                 sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \
                 ")
        # Without this return the code below would touch the unbound
        # ``audio_sample_rate`` and raise UnboundLocalError.
        return False

    logger.info("The sample rate is %d" % audio_sample_rate)
    if audio_sample_rate != self.sample_rate:
        logger.warning("The sample rate of the input file is not {}.\n \
                            The program will resample the wav file to {}.\n \
                            If the result does not meet your expectations,\n \
                            Please input the 16k 16 bit 1 channel wav file. \
                        ".format(self.sample_rate, self.sample_rate))
        self.change_format = True
    else:
        logger.info("The audio file format is right")
        self.change_format = False

    return True
def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
    """Extract features from an audio file and store them in ``self._inputs``.

    Nothing is returned; the results are written to
    ``self._inputs["audio"]`` (float32 tensor with a leading axis of size 1
    added) and ``self._inputs["audio_len"]`` (tensor holding the first
    dimension of the feature array).

    Args:
        model_type: Model name; selects the feature pipeline by substring
            ("deepspeech2online"/"deepspeech2offline", or
            "conformer"/"transformer"/"wenetspeech").
        input: Path of the audio file to process.

    Raises:
        Exception: If ``model_type`` matches none of the known families.
    """
    audio_file = input

    if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
        # Feature extraction is delegated to the collate function.
        audio, _ = self.collate_fn_test.process_utterance(
            audio_file=audio_file, transcript=" ")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)
        self._inputs["audio"] = audio
        self._inputs["audio_len"] = audio_len
        logger.info(f"audio feat shape: {audio.shape}")

    elif ("conformer" in model_type or "transformer" in model_type or
          "wenetspeech" in model_type):
        logger.info("get the preprocess conf")
        preprocess_conf = self.config.preprocess_config
        preprocess_args = {"train": False}
        preprocessing = Transformation(preprocess_conf)

        logger.info("read the audio file")
        audio, audio_sample_rate = soundfile.read(
            audio_file, dtype="int16", always_2d=True)

        if self.change_format:
            if audio.shape[1] >= 2:
                # Average the channels in floating point and cast back.
                # Passing dtype=np.int16 to mean() would accumulate the
                # channel sum in int16 and wrap around on loud samples.
                audio = audio.mean(axis=1).astype(np.int16)
            else:
                audio = audio[:, 0]
            # pcm16 -> pcm 32
            audio = self._pcm16to32(audio)
            # Keyword arguments work on both old and new librosa releases;
            # newer ones reject positional sample rates.
            audio = librosa.resample(
                audio,
                orig_sr=audio_sample_rate,
                target_sr=self.sample_rate)
            audio_sample_rate = self.sample_rate
            # pcm32 -> pcm 16
            audio = self._pcm32to16(audio)
        else:
            audio = audio[:, 0]

        logger.info(f"audio shape: {audio.shape}")
        # fbank
        audio = preprocessing(audio, **preprocess_args)

        audio_len = paddle.to_tensor(audio.shape[0])
        audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)

        self._inputs["audio"] = audio
        self._inputs["audio_len"] = audio_len
        logger.info(f"audio feat shape: {audio.shape}")

    else:
        raise Exception("wrong type")
def
_init_from_path
(
self
,
def
_init_from_path
(
self
,
model_type
:
str
=
'wenetspeech'
,
model_type
:
str
=
'wenetspeech'
,
am_model
:
Optional
[
os
.
PathLike
]
=
None
,
am_model
:
Optional
[
os
.
PathLike
]
=
None
,
...
...
paddlespeech/server/engine/asr/python/asr_engine.py
浏览文件 @
7ebe904e
...
@@ -38,101 +38,6 @@ class ASRServerExecutor(ASRExecutor):
...
@@ -38,101 +38,6 @@ class ASRServerExecutor(ASRExecutor):
super
().
__init__
()
super
().
__init__
()
pass
pass
def _check(self, audio_file: str, sample_rate: int, force_yes: bool):
    """Validate the requested sample rate and probe the audio file.

    Stores ``sample_rate`` on ``self.sample_rate`` and sets
    ``self.change_format`` to record whether the file's own sample rate
    differs from the requested one.

    Args:
        audio_file: Audio file handed to ``soundfile.read``.
        sample_rate: Requested sample rate; only 8000 and 16000 are accepted.
        force_yes: Not used by this method; kept so the signature stays
            compatible with existing callers.

    Returns:
        ``False`` when the sample rate is unsupported or the file cannot be
        read, ``True`` otherwise.
    """
    self.sample_rate = sample_rate
    if self.sample_rate not in (8000, 16000):
        logger.error("please input --sr 8000 or --sr 16000")
        return False

    logger.info("checking the audio file format......")
    try:
        # Only the file's sample rate is needed here; the samples are dropped.
        _, audio_sample_rate = soundfile.read(
            audio_file, dtype="int16", always_2d=True)
    except Exception as e:
        logger.exception(e)
        logger.error(
            "can not open the audio file, please check the audio file format is 'wav'. \n \
                 you can try to use sox to change the file format.\n \
                 For example: \n \
                 sample rate: 16k \n \
                 sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \
                 sample rate: 8k \n \
                 sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \
                 ")
        # Without this return the code below would touch the unbound
        # ``audio_sample_rate`` and raise UnboundLocalError.
        return False

    logger.info("The sample rate is %d" % audio_sample_rate)
    if audio_sample_rate != self.sample_rate:
        logger.warning("The sample rate of the input file is not {}.\n \
                            The program will resample the wav file to {}.\n \
                            If the result does not meet your expectations,\n \
                            Please input the 16k 16 bit 1 channel wav file. \
                        ".format(self.sample_rate, self.sample_rate))
        self.change_format = True
    else:
        logger.info("The audio file format is right")
        self.change_format = False

    return True
def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
    """Extract features from an audio file and store them in ``self._inputs``.

    Nothing is returned; the results are written to
    ``self._inputs["audio"]`` (float32 tensor with a leading axis of size 1
    added) and ``self._inputs["audio_len"]`` (tensor holding the first
    dimension of the feature array).

    Args:
        model_type: Model name; selects the feature pipeline by substring
            ("deepspeech2online"/"deepspeech2offline", or
            "conformer"/"transformer"/"wenetspeech").
        input: Path of the audio file to process.

    Raises:
        Exception: If ``model_type`` matches none of the known families.
    """
    audio_file = input

    if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
        # Feature extraction is delegated to the collate function.
        audio, _ = self.collate_fn_test.process_utterance(
            audio_file=audio_file, transcript=" ")
        audio_len = audio.shape[0]
        audio = paddle.to_tensor(audio, dtype='float32')
        audio_len = paddle.to_tensor(audio_len)
        audio = paddle.unsqueeze(audio, axis=0)
        self._inputs["audio"] = audio
        self._inputs["audio_len"] = audio_len
        logger.info(f"audio feat shape: {audio.shape}")

    elif ("conformer" in model_type or "transformer" in model_type or
          "wenetspeech" in model_type):
        logger.info("get the preprocess conf")
        preprocess_conf = self.config.preprocess_config
        preprocess_args = {"train": False}
        preprocessing = Transformation(preprocess_conf)

        logger.info("read the audio file")
        audio, audio_sample_rate = soundfile.read(
            audio_file, dtype="int16", always_2d=True)

        if self.change_format:
            if audio.shape[1] >= 2:
                # Average the channels in floating point and cast back.
                # Passing dtype=np.int16 to mean() would accumulate the
                # channel sum in int16 and wrap around on loud samples.
                audio = audio.mean(axis=1).astype(np.int16)
            else:
                audio = audio[:, 0]
            # pcm16 -> pcm 32
            audio = self._pcm16to32(audio)
            # Keyword arguments work on both old and new librosa releases;
            # newer ones reject positional sample rates.
            audio = librosa.resample(
                audio,
                orig_sr=audio_sample_rate,
                target_sr=self.sample_rate)
            audio_sample_rate = self.sample_rate
            # pcm32 -> pcm 16
            audio = self._pcm32to16(audio)
        else:
            audio = audio[:, 0]

        logger.info(f"audio shape: {audio.shape}")
        # fbank
        audio = preprocessing(audio, **preprocess_args)

        audio_len = paddle.to_tensor(audio.shape[0])
        audio = paddle.to_tensor(audio, dtype='float32').unsqueeze(axis=0)

        self._inputs["audio"] = audio
        self._inputs["audio_len"] = audio_len
        logger.info(f"audio feat shape: {audio.shape}")

    else:
        raise Exception("wrong type")
class
ASREngine
(
BaseEngine
):
class
ASREngine
(
BaseEngine
):
"""ASR server engine
"""ASR server engine
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录