Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
dfdf450b
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
dfdf450b
编写于
6月 07, 2022
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix #2013; and format
上级
69a6da4c
变更
44
隐藏空白更改
内联
并排
Showing
44 changed file
with
124 addition
and
146 deletion
+124
-146
examples/wenetspeech/asr1/local/extract_meta.py
examples/wenetspeech/asr1/local/extract_meta.py
+0
-1
paddlespeech/__init__.py
paddlespeech/__init__.py
+0
-4
paddlespeech/cli/base_commands.py
paddlespeech/cli/base_commands.py
+0
-1
paddlespeech/cli/cls/infer.py
paddlespeech/cli/cls/infer.py
+2
-2
paddlespeech/cli/vector/infer.py
paddlespeech/cli/vector/infer.py
+2
-2
paddlespeech/resource/model_alias.py
paddlespeech/resource/model_alias.py
+1
-2
paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
+2
-1
paddlespeech/s2t/exps/deepspeech2/model.py
paddlespeech/s2t/exps/deepspeech2/model.py
+4
-6
paddlespeech/s2t/models/ds2/__init__.py
paddlespeech/s2t/models/ds2/__init__.py
+2
-1
paddlespeech/s2t/models/ds2/deepspeech2.py
paddlespeech/s2t/models/ds2/deepspeech2.py
+8
-4
paddlespeech/s2t/models/lm/transformer.py
paddlespeech/s2t/models/lm/transformer.py
+1
-1
paddlespeech/s2t/models/u2/updater.py
paddlespeech/s2t/models/u2/updater.py
+0
-1
paddlespeech/s2t/modules/ctc.py
paddlespeech/s2t/modules/ctc.py
+1
-1
paddlespeech/s2t/utils/tensor_utils.py
paddlespeech/s2t/utils/tensor_utils.py
+2
-1
paddlespeech/server/engine/asr/online/asr_engine.py
paddlespeech/server/engine/asr/online/asr_engine.py
+5
-4
paddlespeech/server/engine/asr/online/ctc_endpoint.py
paddlespeech/server/engine/asr/online/ctc_endpoint.py
+4
-5
paddlespeech/server/engine/tts/online/python/tts_engine.py
paddlespeech/server/engine/tts/online/python/tts_engine.py
+0
-1
paddlespeech/server/ws/asr_api.py
paddlespeech/server/ws/asr_api.py
+2
-3
paddlespeech/t2s/exps/synthesize.py
paddlespeech/t2s/exps/synthesize.py
+2
-8
paddlespeech/t2s/exps/synthesize_e2e.py
paddlespeech/t2s/exps/synthesize_e2e.py
+2
-8
paddlespeech/t2s/exps/voice_cloning.py
paddlespeech/t2s/exps/voice_cloning.py
+2
-8
paddlespeech/t2s/models/vits/__init__.py
paddlespeech/t2s/models/vits/__init__.py
+1
-1
paddlespeech/t2s/models/vits/vits_updater.py
paddlespeech/t2s/models/vits/vits_updater.py
+4
-2
paddlespeech/t2s/modules/losses.py
paddlespeech/t2s/modules/losses.py
+9
-9
speechx/examples/README.md
speechx/examples/README.md
+0
-1
speechx/examples/ds2_ol/README.md
speechx/examples/ds2_ol/README.md
+1
-1
speechx/speechx/codelab/README.md
speechx/speechx/codelab/README.md
+0
-1
speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
...hx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
+2
-2
speechx/speechx/decoder/ctc_tlg_decoder.cc
speechx/speechx/decoder/ctc_tlg_decoder.cc
+1
-1
speechx/speechx/decoder/param.h
speechx/speechx/decoder/param.h
+1
-1
speechx/speechx/decoder/tlg_decoder_main.cc
speechx/speechx/decoder/tlg_decoder_main.cc
+2
-2
speechx/speechx/frontend/audio/assembler.cc
speechx/speechx/frontend/audio/assembler.cc
+15
-14
speechx/speechx/frontend/audio/assembler.h
speechx/speechx/frontend/audio/assembler.h
+3
-7
speechx/speechx/frontend/audio/audio_cache.h
speechx/speechx/frontend/audio/audio_cache.h
+1
-1
speechx/speechx/frontend/audio/fbank.cc
speechx/speechx/frontend/audio/fbank.cc
+4
-4
speechx/speechx/frontend/audio/feature_cache.cc
speechx/speechx/frontend/audio/feature_cache.cc
+2
-2
speechx/speechx/frontend/audio/feature_cache.h
speechx/speechx/frontend/audio/feature_cache.h
+1
-3
speechx/speechx/frontend/audio/feature_common.h
speechx/speechx/frontend/audio/feature_common.h
+4
-3
speechx/speechx/frontend/audio/feature_common_inl.h
speechx/speechx/frontend/audio/feature_common_inl.h
+12
-10
speechx/speechx/frontend/audio/feature_pipeline.h
speechx/speechx/frontend/audio/feature_pipeline.h
+1
-1
speechx/speechx/frontend/audio/linear_spectrogram.cc
speechx/speechx/frontend/audio/linear_spectrogram.cc
+4
-5
speechx/speechx/nnet/nnet_forward_main.cc
speechx/speechx/nnet/nnet_forward_main.cc
+8
-5
speechx/speechx/protocol/websocket/websocket_client.h
speechx/speechx/protocol/websocket/websocket_client.h
+2
-2
speechx/speechx/protocol/websocket/websocket_server.cc
speechx/speechx/protocol/websocket/websocket_server.cc
+4
-3
未找到文件。
examples/wenetspeech/asr1/local/extract_meta.py
浏览文件 @
dfdf450b
...
@@ -13,7 +13,6 @@
...
@@ -13,7 +13,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
argparse
import
argparse
import
json
import
json
import
os
import
os
...
...
paddlespeech/__init__.py
浏览文件 @
dfdf450b
...
@@ -14,7 +14,3 @@
...
@@ -14,7 +14,3 @@
import
_locale
import
_locale
_locale
.
_getdefaultlocale
=
(
lambda
*
args
:
[
'en_US'
,
'utf8'
])
_locale
.
_getdefaultlocale
=
(
lambda
*
args
:
[
'en_US'
,
'utf8'
])
paddlespeech/cli/base_commands.py
浏览文件 @
dfdf450b
...
@@ -145,4 +145,3 @@ for com, info in _commands.items():
...
@@ -145,4 +145,3 @@ for com, info in _commands.items():
name
=
'paddlespeech.{}'
.
format
(
com
),
name
=
'paddlespeech.{}'
.
format
(
com
),
description
=
info
[
0
],
description
=
info
[
0
],
cls
=
'paddlespeech.cli.{}.{}'
.
format
(
com
,
info
[
1
]))
cls
=
'paddlespeech.cli.{}.{}'
.
format
(
com
,
info
[
1
]))
\ No newline at end of file
paddlespeech/cli/cls/infer.py
浏览文件 @
dfdf450b
...
@@ -21,12 +21,12 @@ from typing import Union
...
@@ -21,12 +21,12 @@ from typing import Union
import
numpy
as
np
import
numpy
as
np
import
paddle
import
paddle
import
yaml
import
yaml
from
paddleaudio
import
load
from
paddleaudio.features
import
LogMelSpectrogram
from
..executor
import
BaseExecutor
from
..executor
import
BaseExecutor
from
..log
import
logger
from
..log
import
logger
from
..utils
import
stats_wrapper
from
..utils
import
stats_wrapper
from
paddleaudio
import
load
from
paddleaudio.features
import
LogMelSpectrogram
__all__
=
[
'CLSExecutor'
]
__all__
=
[
'CLSExecutor'
]
...
...
paddlespeech/cli/vector/infer.py
浏览文件 @
dfdf450b
...
@@ -22,13 +22,13 @@ from typing import Union
...
@@ -22,13 +22,13 @@ from typing import Union
import
paddle
import
paddle
import
soundfile
import
soundfile
from
paddleaudio.backends
import
load
as
load_audio
from
paddleaudio.compliance.librosa
import
melspectrogram
from
yacs.config
import
CfgNode
from
yacs.config
import
CfgNode
from
..executor
import
BaseExecutor
from
..executor
import
BaseExecutor
from
..log
import
logger
from
..log
import
logger
from
..utils
import
stats_wrapper
from
..utils
import
stats_wrapper
from
paddleaudio.backends
import
load
as
load_audio
from
paddleaudio.compliance.librosa
import
melspectrogram
from
paddlespeech.vector.io.batch
import
feature_normalize
from
paddlespeech.vector.io.batch
import
feature_normalize
from
paddlespeech.vector.modules.sid_model
import
SpeakerIdetification
from
paddlespeech.vector.modules.sid_model
import
SpeakerIdetification
...
...
paddlespeech/resource/model_alias.py
浏览文件 @
dfdf450b
...
@@ -22,8 +22,7 @@ model_alias = {
...
@@ -22,8 +22,7 @@ model_alias = {
# -------------- ASR --------------
# -------------- ASR --------------
# ---------------------------------
# ---------------------------------
"deepspeech2offline"
:
[
"paddlespeech.s2t.models.ds2:DeepSpeech2Model"
],
"deepspeech2offline"
:
[
"paddlespeech.s2t.models.ds2:DeepSpeech2Model"
],
"deepspeech2online"
:
"deepspeech2online"
:
[
"paddlespeech.s2t.models.ds2:DeepSpeech2Model"
],
[
"paddlespeech.s2t.models.ds2:DeepSpeech2Model"
],
"conformer"
:
[
"paddlespeech.s2t.models.u2:U2Model"
],
"conformer"
:
[
"paddlespeech.s2t.models.u2:U2Model"
],
"conformer_online"
:
[
"paddlespeech.s2t.models.u2:U2Model"
],
"conformer_online"
:
[
"paddlespeech.s2t.models.u2:U2Model"
],
"transformer"
:
[
"paddlespeech.s2t.models.u2:U2Model"
],
"transformer"
:
[
"paddlespeech.s2t.models.u2:U2Model"
],
...
...
paddlespeech/s2t/decoders/scorers/ctc_prefix_score.py
浏览文件 @
dfdf450b
...
@@ -76,7 +76,8 @@ class CTCPrefixScorePD():
...
@@ -76,7 +76,8 @@ class CTCPrefixScorePD():
last_ids
=
[
yi
[
-
1
]
for
yi
in
y
]
# last output label ids
last_ids
=
[
yi
[
-
1
]
for
yi
in
y
]
# last output label ids
n_bh
=
len
(
last_ids
)
# batch * hyps
n_bh
=
len
(
last_ids
)
# batch * hyps
n_hyps
=
n_bh
//
self
.
batch
# assuming each utterance has the same # of hyps
n_hyps
=
n_bh
//
self
.
batch
# assuming each utterance has the same # of hyps
self
.
scoring_num
=
paddle
.
shape
(
scoring_ids
)[
-
1
]
if
scoring_ids
is
not
None
else
0
self
.
scoring_num
=
paddle
.
shape
(
scoring_ids
)[
-
1
]
if
scoring_ids
is
not
None
else
0
# prepare state info
# prepare state info
if
state
is
None
:
if
state
is
None
:
r_prev
=
paddle
.
full
(
r_prev
=
paddle
.
full
(
...
...
paddlespeech/s2t/exps/deepspeech2/model.py
浏览文件 @
dfdf450b
...
@@ -22,11 +22,9 @@ import numpy as np
...
@@ -22,11 +22,9 @@ import numpy as np
import
paddle
import
paddle
from
paddle
import
distributed
as
dist
from
paddle
import
distributed
as
dist
from
paddle
import
inference
from
paddle
import
inference
from
paddle.io
import
DataLoader
from
paddlespeech.s2t.io.dataloader
import
BatchDataLoader
from
paddlespeech.s2t.frontend.featurizer.text_featurizer
import
TextFeaturizer
from
paddlespeech.s2t.frontend.featurizer.text_featurizer
import
TextFeaturizer
from
paddlespeech.s2t.io.data
set
import
ManifestDataset
from
paddlespeech.s2t.io.data
loader
import
BatchDataLoader
from
paddlespeech.s2t.models.ds2
import
DeepSpeech2InferModel
from
paddlespeech.s2t.models.ds2
import
DeepSpeech2InferModel
from
paddlespeech.s2t.models.ds2
import
DeepSpeech2Model
from
paddlespeech.s2t.models.ds2
import
DeepSpeech2Model
from
paddlespeech.s2t.training.gradclip
import
ClipGradByGlobalNormWithLog
from
paddlespeech.s2t.training.gradclip
import
ClipGradByGlobalNormWithLog
...
@@ -238,8 +236,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -238,8 +236,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
def
__init__
(
self
,
config
,
args
):
def
__init__
(
self
,
config
,
args
):
super
().
__init__
(
config
,
args
)
super
().
__init__
(
config
,
args
)
self
.
_text_featurizer
=
TextFeaturizer
(
self
.
_text_featurizer
=
TextFeaturizer
(
unit_type
=
config
.
unit_type
,
unit_type
=
config
.
unit_type
,
vocab
=
config
.
vocab_filepath
)
vocab
=
config
.
vocab_filepath
)
self
.
vocab_list
=
self
.
_text_featurizer
.
vocab_list
self
.
vocab_list
=
self
.
_text_featurizer
.
vocab_list
def
ordid2token
(
self
,
texts
,
texts_len
):
def
ordid2token
(
self
,
texts
,
texts_len
):
...
@@ -248,7 +245,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
...
@@ -248,7 +245,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
for
text
,
n
in
zip
(
texts
,
texts_len
):
for
text
,
n
in
zip
(
texts
,
texts_len
):
n
=
n
.
numpy
().
item
()
n
=
n
.
numpy
().
item
()
ids
=
text
[:
n
]
ids
=
text
[:
n
]
trans
.
append
(
self
.
_text_featurizer
.
defeaturize
(
ids
.
numpy
().
tolist
()))
trans
.
append
(
self
.
_text_featurizer
.
defeaturize
(
ids
.
numpy
().
tolist
()))
return
trans
return
trans
def
compute_metrics
(
self
,
def
compute_metrics
(
self
,
...
...
paddlespeech/s2t/models/ds2/__init__.py
浏览文件 @
dfdf450b
...
@@ -11,10 +11,11 @@
...
@@ -11,10 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
sys
from
.deepspeech2
import
DeepSpeech2InferModel
from
.deepspeech2
import
DeepSpeech2InferModel
from
.deepspeech2
import
DeepSpeech2Model
from
.deepspeech2
import
DeepSpeech2Model
from
paddlespeech.s2t.utils
import
dynamic_pip_install
from
paddlespeech.s2t.utils
import
dynamic_pip_install
import
sys
try
:
try
:
import
paddlespeech_ctcdecoders
import
paddlespeech_ctcdecoders
...
...
paddlespeech/s2t/models/ds2/deepspeech2.py
浏览文件 @
dfdf450b
...
@@ -372,11 +372,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
...
@@ -372,11 +372,15 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
def
__init__
(
self
,
*
args
,
**
kwargs
):
def
__init__
(
self
,
*
args
,
**
kwargs
):
super
().
__init__
(
*
args
,
**
kwargs
)
super
().
__init__
(
*
args
,
**
kwargs
)
def
forward
(
self
,
audio_chunk
,
audio_chunk_lens
,
chunk_state_h_box
=
None
,
def
forward
(
self
,
audio_chunk
,
audio_chunk_lens
,
chunk_state_h_box
=
None
,
chunk_state_c_box
=
None
):
chunk_state_c_box
=
None
):
if
self
.
encoder
.
rnn_direction
==
"forward"
:
if
self
.
encoder
.
rnn_direction
==
"forward"
:
eouts_chunk
,
eouts_chunk_lens
,
final_state_h_box
,
final_state_c_box
=
self
.
encoder
(
eouts_chunk
,
eouts_chunk_lens
,
final_state_h_box
,
final_state_c_box
=
self
.
encoder
(
audio_chunk
,
audio_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
)
audio_chunk
,
audio_chunk_lens
,
chunk_state_h_box
,
chunk_state_c_box
)
probs_chunk
=
self
.
decoder
.
softmax
(
eouts_chunk
)
probs_chunk
=
self
.
decoder
.
softmax
(
eouts_chunk
)
return
probs_chunk
,
eouts_chunk_lens
,
final_state_h_box
,
final_state_c_box
return
probs_chunk
,
eouts_chunk_lens
,
final_state_h_box
,
final_state_c_box
elif
self
.
encoder
.
rnn_direction
==
"bidirect"
:
elif
self
.
encoder
.
rnn_direction
==
"bidirect"
:
...
@@ -392,8 +396,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
...
@@ -392,8 +396,8 @@ class DeepSpeech2InferModel(DeepSpeech2Model):
self
,
self
,
input_spec
=
[
input_spec
=
[
paddle
.
static
.
InputSpec
(
paddle
.
static
.
InputSpec
(
shape
=
[
None
,
None
,
shape
=
[
None
,
None
,
self
.
encoder
.
feat_size
self
.
encoder
.
feat_size
],
#[B, chunk_size, feat_dim]
],
#[B, chunk_size, feat_dim]
dtype
=
'float32'
),
dtype
=
'float32'
),
paddle
.
static
.
InputSpec
(
shape
=
[
None
],
paddle
.
static
.
InputSpec
(
shape
=
[
None
],
dtype
=
'int64'
),
# audio_length, [B]
dtype
=
'int64'
),
# audio_length, [B]
...
...
paddlespeech/s2t/models/lm/transformer.py
浏览文件 @
dfdf450b
...
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
...
@@ -90,7 +90,7 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
def
_target_mask
(
self
,
ys_in_pad
):
def
_target_mask
(
self
,
ys_in_pad
):
ys_mask
=
ys_in_pad
!=
0
ys_mask
=
ys_in_pad
!=
0
m
=
subsequent_mask
(
paddle
.
shape
(
ys_mask
)[
-
1
])
)
.
unsqueeze
(
0
)
m
=
subsequent_mask
(
paddle
.
shape
(
ys_mask
)[
-
1
]).
unsqueeze
(
0
)
return
ys_mask
.
unsqueeze
(
-
2
)
&
m
return
ys_mask
.
unsqueeze
(
-
2
)
&
m
def
forward
(
self
,
x
:
paddle
.
Tensor
,
t
:
paddle
.
Tensor
def
forward
(
self
,
x
:
paddle
.
Tensor
,
t
:
paddle
.
Tensor
...
...
paddlespeech/s2t/models/u2/updater.py
浏览文件 @
dfdf450b
...
@@ -11,7 +11,6 @@
...
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
contextlib
import
nullcontext
from
contextlib
import
nullcontext
import
paddle
import
paddle
...
...
paddlespeech/s2t/modules/ctc.py
浏览文件 @
dfdf450b
...
@@ -11,6 +11,7 @@
...
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
import
sys
from
typing
import
Union
from
typing
import
Union
import
paddle
import
paddle
...
@@ -22,7 +23,6 @@ from paddlespeech.s2t.modules.align import Linear
...
@@ -22,7 +23,6 @@ from paddlespeech.s2t.modules.align import Linear
from
paddlespeech.s2t.modules.loss
import
CTCLoss
from
paddlespeech.s2t.modules.loss
import
CTCLoss
from
paddlespeech.s2t.utils
import
ctc_utils
from
paddlespeech.s2t.utils
import
ctc_utils
from
paddlespeech.s2t.utils.log
import
Log
from
paddlespeech.s2t.utils.log
import
Log
import
sys
logger
=
Log
(
__name__
).
getlog
()
logger
=
Log
(
__name__
).
getlog
()
...
...
paddlespeech/s2t/utils/tensor_utils.py
浏览文件 @
dfdf450b
...
@@ -82,7 +82,8 @@ def pad_sequence(sequences: List[paddle.Tensor],
...
@@ -82,7 +82,8 @@ def pad_sequence(sequences: List[paddle.Tensor],
max_size
=
paddle
.
shape
(
sequences
[
0
])
max_size
=
paddle
.
shape
(
sequences
[
0
])
# (TODO Hui Zhang): slice not supprot `end==start`
# (TODO Hui Zhang): slice not supprot `end==start`
# trailing_dims = max_size[1:]
# trailing_dims = max_size[1:]
trailing_dims
=
tuple
(
max_size
[
1
:].
numpy
().
tolist
())
if
sequences
[
0
].
ndim
>=
2
else
()
trailing_dims
=
tuple
(
max_size
[
1
:].
numpy
().
tolist
())
if
sequences
[
0
].
ndim
>=
2
else
()
max_len
=
max
([
s
.
shape
[
0
]
for
s
in
sequences
])
max_len
=
max
([
s
.
shape
[
0
]
for
s
in
sequences
])
if
batch_first
:
if
batch_first
:
out_dims
=
(
len
(
sequences
),
max_len
)
+
trailing_dims
out_dims
=
(
len
(
sequences
),
max_len
)
+
trailing_dims
...
...
paddlespeech/server/engine/asr/online/asr_engine.py
浏览文件 @
dfdf450b
...
@@ -55,7 +55,7 @@ class PaddleASRConnectionHanddler:
...
@@ -55,7 +55,7 @@ class PaddleASRConnectionHanddler:
self
.
config
=
asr_engine
.
config
# server config
self
.
config
=
asr_engine
.
config
# server config
self
.
model_config
=
asr_engine
.
executor
.
config
self
.
model_config
=
asr_engine
.
executor
.
config
self
.
asr_engine
=
asr_engine
self
.
asr_engine
=
asr_engine
# model_type, sample_rate and text_feature is shared for deepspeech2 and conformer
# model_type, sample_rate and text_feature is shared for deepspeech2 and conformer
self
.
model_type
=
self
.
asr_engine
.
executor
.
model_type
self
.
model_type
=
self
.
asr_engine
.
executor
.
model_type
self
.
sample_rate
=
self
.
asr_engine
.
executor
.
sample_rate
self
.
sample_rate
=
self
.
asr_engine
.
executor
.
sample_rate
...
@@ -191,7 +191,7 @@ class PaddleASRConnectionHanddler:
...
@@ -191,7 +191,7 @@ class PaddleASRConnectionHanddler:
self
.
num_frames
=
0
self
.
num_frames
=
0
## endpoint
## endpoint
self
.
endpoint_state
=
False
# True for detect endpoint
self
.
endpoint_state
=
False
# True for detect endpoint
## conformer
## conformer
self
.
model_reset
()
self
.
model_reset
()
...
@@ -503,11 +503,13 @@ class PaddleASRConnectionHanddler:
...
@@ -503,11 +503,13 @@ class PaddleASRConnectionHanddler:
# endpoint
# endpoint
if
not
is_finished
:
if
not
is_finished
:
def
contain_nonsilence
():
def
contain_nonsilence
():
return
len
(
self
.
hyps
)
>
0
and
len
(
self
.
hyps
[
0
])
>
0
return
len
(
self
.
hyps
)
>
0
and
len
(
self
.
hyps
[
0
])
>
0
decoding_something
=
contain_nonsilence
()
decoding_something
=
contain_nonsilence
()
if
self
.
endpointer
.
endpoint_detected
(
ctc_probs
.
numpy
(),
decoding_something
):
if
self
.
endpointer
.
endpoint_detected
(
ctc_probs
.
numpy
(),
decoding_something
):
self
.
endpoint_state
=
True
self
.
endpoint_state
=
True
logger
.
info
(
f
"Endpoint is detected at
{
self
.
num_frames
}
frame."
)
logger
.
info
(
f
"Endpoint is detected at
{
self
.
num_frames
}
frame."
)
...
@@ -869,7 +871,6 @@ class ASREngine(BaseEngine):
...
@@ -869,7 +871,6 @@ class ASREngine(BaseEngine):
logger
.
info
(
"Initialize ASR server engine successfully."
)
logger
.
info
(
"Initialize ASR server engine successfully."
)
return
True
return
True
def
new_handler
(
self
):
def
new_handler
(
self
):
"""New handler from model.
"""New handler from model.
...
...
paddlespeech/server/engine/asr/online/ctc_endpoint.py
浏览文件 @
dfdf450b
...
@@ -12,7 +12,7 @@
...
@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
dataclasses
import
dataclass
from
dataclasses
import
dataclass
from
typing
import
List
import
numpy
as
np
import
numpy
as
np
from
paddlespeech.cli.log
import
logger
from
paddlespeech.cli.log
import
logger
...
@@ -76,12 +76,11 @@ class OnlineCTCEndpoint:
...
@@ -76,12 +76,11 @@ class OnlineCTCEndpoint:
decoding_something
or
(
not
rule
.
must_contain_nonsilence
)
decoding_something
or
(
not
rule
.
must_contain_nonsilence
)
)
and
trailine_silence
>=
rule
.
min_trailing_silence
and
utterance_length
>=
rule
.
min_utterance_length
)
and
trailine_silence
>=
rule
.
min_trailing_silence
and
utterance_length
>=
rule
.
min_utterance_length
if
(
ans
):
if
(
ans
):
logger
.
info
(
logger
.
info
(
f
"Endpoint Rule:
{
rule_name
}
activated:
{
rule
}
"
)
f
"Endpoint Rule:
{
rule_name
}
activated:
{
rule
}
"
)
return
ans
return
ans
def
endpoint_detected
(
self
,
ctc_log_probs
:
np
.
ndarray
,
def
endpoint_detected
(
self
,
ctc_log_probs
:
np
.
ndarray
,
decoding_something
:
bool
)
->
bool
:
decoding_something
:
bool
)
->
bool
:
"""detect endpoint.
"""detect endpoint.
...
...
paddlespeech/server/engine/tts/online/python/tts_engine.py
浏览文件 @
dfdf450b
...
@@ -42,7 +42,6 @@ class TTSServerExecutor(TTSExecutor):
...
@@ -42,7 +42,6 @@ class TTSServerExecutor(TTSExecutor):
self
.
task_resource
=
CommonTaskResource
(
self
.
task_resource
=
CommonTaskResource
(
task
=
'tts'
,
model_format
=
'dynamic'
,
inference_mode
=
'online'
)
task
=
'tts'
,
model_format
=
'dynamic'
,
inference_mode
=
'online'
)
def
get_model_info
(
self
,
def
get_model_info
(
self
,
field
:
str
,
field
:
str
,
model_name
:
str
,
model_name
:
str
,
...
...
paddlespeech/server/ws/asr_api.py
浏览文件 @
dfdf450b
...
@@ -19,7 +19,6 @@ from fastapi import WebSocketDisconnect
...
@@ -19,7 +19,6 @@ from fastapi import WebSocketDisconnect
from
starlette.websockets
import
WebSocketState
as
WebSocketState
from
starlette.websockets
import
WebSocketState
as
WebSocketState
from
paddlespeech.cli.log
import
logger
from
paddlespeech.cli.log
import
logger
from
paddlespeech.server.engine.asr.online.asr_engine
import
PaddleASRConnectionHanddler
from
paddlespeech.server.engine.engine_pool
import
get_engine_pool
from
paddlespeech.server.engine.engine_pool
import
get_engine_pool
router
=
APIRouter
()
router
=
APIRouter
()
...
@@ -106,7 +105,7 @@ async def websocket_endpoint(websocket: WebSocket):
...
@@ -106,7 +105,7 @@ async def websocket_endpoint(websocket: WebSocket):
logger
.
info
(
"endpoint: detected and rescoring."
)
logger
.
info
(
"endpoint: detected and rescoring."
)
connection_handler
.
rescoring
()
connection_handler
.
rescoring
()
word_time_stamp
=
connection_handler
.
get_word_time_stamp
()
word_time_stamp
=
connection_handler
.
get_word_time_stamp
()
asr_results
=
connection_handler
.
get_result
()
asr_results
=
connection_handler
.
get_result
()
if
connection_handler
.
endpoint_state
:
if
connection_handler
.
endpoint_state
:
...
@@ -124,7 +123,7 @@ async def websocket_endpoint(websocket: WebSocket):
...
@@ -124,7 +123,7 @@ async def websocket_endpoint(websocket: WebSocket):
}
}
await
websocket
.
send_json
(
resp
)
await
websocket
.
send_json
(
resp
)
break
break
# return the current partial result
# return the current partial result
# if the engine create the vad instance, this connection will have many partial results
# if the engine create the vad instance, this connection will have many partial results
resp
=
{
'result'
:
asr_results
}
resp
=
{
'result'
:
asr_results
}
...
...
paddlespeech/t2s/exps/synthesize.py
浏览文件 @
dfdf450b
...
@@ -140,10 +140,7 @@ def parse_args():
...
@@ -140,10 +140,7 @@ def parse_args():
],
],
help
=
'Choose acoustic model type of tts task.'
)
help
=
'Choose acoustic model type of tts task.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--am_config'
,
'--am_config'
,
type
=
str
,
default
=
None
,
help
=
'Config of acoustic model.'
)
type
=
str
,
default
=
None
,
help
=
'Config of acoustic model.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--am_ckpt'
,
'--am_ckpt'
,
type
=
str
,
type
=
str
,
...
@@ -179,10 +176,7 @@ def parse_args():
...
@@ -179,10 +176,7 @@ def parse_args():
],
],
help
=
'Choose vocoder type of tts task.'
)
help
=
'Choose vocoder type of tts task.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--voc_config'
,
'--voc_config'
,
type
=
str
,
default
=
None
,
help
=
'Config of voc.'
)
type
=
str
,
default
=
None
,
help
=
'Config of voc.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--voc_ckpt'
,
type
=
str
,
default
=
None
,
help
=
'Checkpoint file of voc.'
)
'--voc_ckpt'
,
type
=
str
,
default
=
None
,
help
=
'Checkpoint file of voc.'
)
parser
.
add_argument
(
parser
.
add_argument
(
...
...
paddlespeech/t2s/exps/synthesize_e2e.py
浏览文件 @
dfdf450b
...
@@ -174,10 +174,7 @@ def parse_args():
...
@@ -174,10 +174,7 @@ def parse_args():
],
],
help
=
'Choose acoustic model type of tts task.'
)
help
=
'Choose acoustic model type of tts task.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--am_config'
,
'--am_config'
,
type
=
str
,
default
=
None
,
help
=
'Config of acoustic model.'
)
type
=
str
,
default
=
None
,
help
=
'Config of acoustic model.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--am_ckpt'
,
'--am_ckpt'
,
type
=
str
,
type
=
str
,
...
@@ -220,10 +217,7 @@ def parse_args():
...
@@ -220,10 +217,7 @@ def parse_args():
],
],
help
=
'Choose vocoder type of tts task.'
)
help
=
'Choose vocoder type of tts task.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--voc_config'
,
'--voc_config'
,
type
=
str
,
default
=
None
,
help
=
'Config of voc.'
)
type
=
str
,
default
=
None
,
help
=
'Config of voc.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--voc_ckpt'
,
type
=
str
,
default
=
None
,
help
=
'Checkpoint file of voc.'
)
'--voc_ckpt'
,
type
=
str
,
default
=
None
,
help
=
'Checkpoint file of voc.'
)
parser
.
add_argument
(
parser
.
add_argument
(
...
...
paddlespeech/t2s/exps/voice_cloning.py
浏览文件 @
dfdf450b
...
@@ -131,10 +131,7 @@ def parse_args():
...
@@ -131,10 +131,7 @@ def parse_args():
choices
=
[
'fastspeech2_aishell3'
,
'tacotron2_aishell3'
],
choices
=
[
'fastspeech2_aishell3'
,
'tacotron2_aishell3'
],
help
=
'Choose acoustic model type of tts task.'
)
help
=
'Choose acoustic model type of tts task.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--am_config'
,
'--am_config'
,
type
=
str
,
default
=
None
,
help
=
'Config of acoustic model.'
)
type
=
str
,
default
=
None
,
help
=
'Config of acoustic model.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--am_ckpt'
,
'--am_ckpt'
,
type
=
str
,
type
=
str
,
...
@@ -160,10 +157,7 @@ def parse_args():
...
@@ -160,10 +157,7 @@ def parse_args():
help
=
'Choose vocoder type of tts task.'
)
help
=
'Choose vocoder type of tts task.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--voc_config'
,
'--voc_config'
,
type
=
str
,
default
=
None
,
help
=
'Config of voc.'
)
type
=
str
,
default
=
None
,
help
=
'Config of voc.'
)
parser
.
add_argument
(
parser
.
add_argument
(
'--voc_ckpt'
,
type
=
str
,
default
=
None
,
help
=
'Checkpoint file of voc.'
)
'--voc_ckpt'
,
type
=
str
,
default
=
None
,
help
=
'Checkpoint file of voc.'
)
parser
.
add_argument
(
parser
.
add_argument
(
...
...
paddlespeech/t2s/models/vits/__init__.py
浏览文件 @
dfdf450b
...
@@ -12,4 +12,4 @@
...
@@ -12,4 +12,4 @@
# See the License for the specific language governing permissions and
# See the License for the specific language governing permissions and
# limitations under the License.
# limitations under the License.
from
.vits
import
*
from
.vits
import
*
from
.vits_updater
import
*
from
.vits_updater
import
*
\ No newline at end of file
paddlespeech/t2s/models/vits/vits_updater.py
浏览文件 @
dfdf450b
...
@@ -56,7 +56,8 @@ class VITSUpdater(StandardUpdater):
...
@@ -56,7 +56,8 @@ class VITSUpdater(StandardUpdater):
self
.
models
:
Dict
[
str
,
Layer
]
=
models
self
.
models
:
Dict
[
str
,
Layer
]
=
models
# self.model = model
# self.model = model
self
.
model
=
model
.
_layers
if
isinstance
(
model
,
paddle
.
DataParallel
)
else
model
self
.
model
=
model
.
_layers
if
isinstance
(
model
,
paddle
.
DataParallel
)
else
model
self
.
optimizers
=
optimizers
self
.
optimizers
=
optimizers
self
.
optimizer_g
:
Optimizer
=
optimizers
[
'generator'
]
self
.
optimizer_g
:
Optimizer
=
optimizers
[
'generator'
]
...
@@ -225,7 +226,8 @@ class VITSEvaluator(StandardEvaluator):
...
@@ -225,7 +226,8 @@ class VITSEvaluator(StandardEvaluator):
models
=
{
"main"
:
model
}
models
=
{
"main"
:
model
}
self
.
models
:
Dict
[
str
,
Layer
]
=
models
self
.
models
:
Dict
[
str
,
Layer
]
=
models
# self.model = model
# self.model = model
self
.
model
=
model
.
_layers
if
isinstance
(
model
,
paddle
.
DataParallel
)
else
model
self
.
model
=
model
.
_layers
if
isinstance
(
model
,
paddle
.
DataParallel
)
else
model
self
.
criterions
=
criterions
self
.
criterions
=
criterions
self
.
criterion_mel
=
criterions
[
'mel'
]
self
.
criterion_mel
=
criterions
[
'mel'
]
...
...
paddlespeech/t2s/modules/losses.py
浏览文件 @
dfdf450b
...
@@ -971,18 +971,18 @@ class FeatureMatchLoss(nn.Layer):
...
@@ -971,18 +971,18 @@ class FeatureMatchLoss(nn.Layer):
return
feat_match_loss
return
feat_match_loss
# loss for VITS
# loss for VITS
class
KLDivergenceLoss
(
nn
.
Layer
):
class
KLDivergenceLoss
(
nn
.
Layer
):
"""KL divergence loss."""
"""KL divergence loss."""
def
forward
(
def
forward
(
self
,
self
,
z_p
:
paddle
.
Tensor
,
z_p
:
paddle
.
Tensor
,
logs_q
:
paddle
.
Tensor
,
logs_q
:
paddle
.
Tensor
,
m_p
:
paddle
.
Tensor
,
m_p
:
paddle
.
Tensor
,
logs_p
:
paddle
.
Tensor
,
logs_p
:
paddle
.
Tensor
,
z_mask
:
paddle
.
Tensor
,
z_mask
:
paddle
.
Tensor
,
)
->
paddle
.
Tensor
:
)
->
paddle
.
Tensor
:
"""Calculate KL divergence loss.
"""Calculate KL divergence loss.
Args:
Args:
...
@@ -1002,8 +1002,8 @@ class KLDivergenceLoss(nn.Layer):
...
@@ -1002,8 +1002,8 @@ class KLDivergenceLoss(nn.Layer):
logs_p
=
paddle
.
cast
(
logs_p
,
'float32'
)
logs_p
=
paddle
.
cast
(
logs_p
,
'float32'
)
z_mask
=
paddle
.
cast
(
z_mask
,
'float32'
)
z_mask
=
paddle
.
cast
(
z_mask
,
'float32'
)
kl
=
logs_p
-
logs_q
-
0.5
kl
=
logs_p
-
logs_q
-
0.5
kl
+=
0.5
*
((
z_p
-
m_p
)
**
2
)
*
paddle
.
exp
(
-
2.0
*
logs_p
)
kl
+=
0.5
*
((
z_p
-
m_p
)
**
2
)
*
paddle
.
exp
(
-
2.0
*
logs_p
)
kl
=
paddle
.
sum
(
kl
*
z_mask
)
kl
=
paddle
.
sum
(
kl
*
z_mask
)
loss
=
kl
/
paddle
.
sum
(
z_mask
)
loss
=
kl
/
paddle
.
sum
(
z_mask
)
return
loss
return
loss
\ No newline at end of file
speechx/examples/README.md
浏览文件 @
dfdf450b
...
@@ -25,4 +25,3 @@ netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host
...
@@ -25,4 +25,3 @@ netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host
> Reminder: Only for developer, make sure you know what's it.
> Reminder: Only for developer, make sure you know what's it.
*
codelab - for speechx developer, using for test.
*
codelab - for speechx developer, using for test.
speechx/examples/ds2_ol/README.md
浏览文件 @
dfdf450b
...
@@ -3,4 +3,4 @@
...
@@ -3,4 +3,4 @@
## Examples
## Examples
*
`websocket`
- Streaming ASR with websocket for deepspeech2_aishell.
*
`websocket`
- Streaming ASR with websocket for deepspeech2_aishell.
*
`aishell`
- Streaming Decoding under aishell dataset, for local WER test.
*
`aishell`
- Streaming Decoding under aishell dataset, for local WER test.
\ No newline at end of file
speechx/speechx/codelab/README.md
浏览文件 @
dfdf450b
...
@@ -4,4 +4,3 @@
...
@@ -4,4 +4,3 @@
> Reminder: Only for developer.
> Reminder: Only for developer.
*
codelab - for speechx developer, using for test.
*
codelab - for speechx developer, using for test.
speechx/speechx/decoder/ctc_prefix_beam_search_decoder_main.cc
浏览文件 @
dfdf450b
...
@@ -91,8 +91,8 @@ int main(int argc, char* argv[]) {
...
@@ -91,8 +91,8 @@ int main(int argc, char* argv[]) {
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
new
ppspeech
::
Decodable
(
nnet
,
raw_data
));
new
ppspeech
::
Decodable
(
nnet
,
raw_data
));
int32
chunk_size
=
FLAGS_receptive_field_length
int32
chunk_size
=
FLAGS_receptive_field_length
+
+
(
FLAGS_nnet_decoder_chunk
-
1
)
*
FLAGS_downsampling_rate
;
(
FLAGS_nnet_decoder_chunk
-
1
)
*
FLAGS_downsampling_rate
;
int32
chunk_stride
=
FLAGS_downsampling_rate
*
FLAGS_nnet_decoder_chunk
;
int32
chunk_stride
=
FLAGS_downsampling_rate
*
FLAGS_nnet_decoder_chunk
;
int32
receptive_field_length
=
FLAGS_receptive_field_length
;
int32
receptive_field_length
=
FLAGS_receptive_field_length
;
LOG
(
INFO
)
<<
"chunk size (frame): "
<<
chunk_size
;
LOG
(
INFO
)
<<
"chunk size (frame): "
<<
chunk_size
;
...
...
speechx/speechx/decoder/ctc_tlg_decoder.cc
浏览文件 @
dfdf450b
...
@@ -64,7 +64,7 @@ std::string TLGDecoder::GetPartialResult() {
...
@@ -64,7 +64,7 @@ std::string TLGDecoder::GetPartialResult() {
std
::
string
word
=
word_symbol_table_
->
Find
(
words_id
[
idx
]);
std
::
string
word
=
word_symbol_table_
->
Find
(
words_id
[
idx
]);
words
+=
word
;
words
+=
word
;
}
}
return
words
;
return
words
;
}
}
std
::
string
TLGDecoder
::
GetFinalBestPath
()
{
std
::
string
TLGDecoder
::
GetFinalBestPath
()
{
...
...
speechx/speechx/decoder/param.h
浏览文件 @
dfdf450b
...
@@ -82,7 +82,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
...
@@ -82,7 +82,7 @@ FeaturePipelineOptions InitFeaturePipelineOptions() {
opts
.
assembler_opts
.
subsampling_rate
=
FLAGS_downsampling_rate
;
opts
.
assembler_opts
.
subsampling_rate
=
FLAGS_downsampling_rate
;
opts
.
assembler_opts
.
receptive_filed_length
=
FLAGS_receptive_field_length
;
opts
.
assembler_opts
.
receptive_filed_length
=
FLAGS_receptive_field_length
;
opts
.
assembler_opts
.
nnet_decoder_chunk
=
FLAGS_nnet_decoder_chunk
;
opts
.
assembler_opts
.
nnet_decoder_chunk
=
FLAGS_nnet_decoder_chunk
;
return
opts
;
return
opts
;
}
}
...
...
speechx/speechx/decoder/tlg_decoder_main.cc
浏览文件 @
dfdf450b
...
@@ -93,8 +93,8 @@ int main(int argc, char* argv[]) {
...
@@ -93,8 +93,8 @@ int main(int argc, char* argv[]) {
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
new
ppspeech
::
Decodable
(
nnet
,
raw_data
,
FLAGS_acoustic_scale
));
new
ppspeech
::
Decodable
(
nnet
,
raw_data
,
FLAGS_acoustic_scale
));
int32
chunk_size
=
FLAGS_receptive_field_length
int32
chunk_size
=
FLAGS_receptive_field_length
+
+
(
FLAGS_nnet_decoder_chunk
-
1
)
*
FLAGS_downsampling_rate
;
(
FLAGS_nnet_decoder_chunk
-
1
)
*
FLAGS_downsampling_rate
;
int32
chunk_stride
=
FLAGS_downsampling_rate
*
FLAGS_nnet_decoder_chunk
;
int32
chunk_stride
=
FLAGS_downsampling_rate
*
FLAGS_nnet_decoder_chunk
;
int32
receptive_field_length
=
FLAGS_receptive_field_length
;
int32
receptive_field_length
=
FLAGS_receptive_field_length
;
LOG
(
INFO
)
<<
"chunk size (frame): "
<<
chunk_size
;
LOG
(
INFO
)
<<
"chunk size (frame): "
<<
chunk_size
;
...
...
speechx/speechx/frontend/audio/assembler.cc
浏览文件 @
dfdf450b
...
@@ -24,7 +24,8 @@ using std::unique_ptr;
...
@@ -24,7 +24,8 @@ using std::unique_ptr;
Assembler
::
Assembler
(
AssemblerOptions
opts
,
Assembler
::
Assembler
(
AssemblerOptions
opts
,
unique_ptr
<
FrontendInterface
>
base_extractor
)
{
unique_ptr
<
FrontendInterface
>
base_extractor
)
{
frame_chunk_stride_
=
opts
.
subsampling_rate
*
opts
.
nnet_decoder_chunk
;
frame_chunk_stride_
=
opts
.
subsampling_rate
*
opts
.
nnet_decoder_chunk
;
frame_chunk_size_
=
(
opts
.
nnet_decoder_chunk
-
1
)
*
opts
.
subsampling_rate
+
opts
.
receptive_filed_length
;
frame_chunk_size_
=
(
opts
.
nnet_decoder_chunk
-
1
)
*
opts
.
subsampling_rate
+
opts
.
receptive_filed_length
;
receptive_filed_length_
=
opts
.
receptive_filed_length
;
receptive_filed_length_
=
opts
.
receptive_filed_length
;
base_extractor_
=
std
::
move
(
base_extractor
);
base_extractor_
=
std
::
move
(
base_extractor
);
dim_
=
base_extractor_
->
Dim
();
dim_
=
base_extractor_
->
Dim
();
...
@@ -50,8 +51,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
...
@@ -50,8 +51,8 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
Vector
<
BaseFloat
>
feature
;
Vector
<
BaseFloat
>
feature
;
result
=
base_extractor_
->
Read
(
&
feature
);
result
=
base_extractor_
->
Read
(
&
feature
);
if
(
result
==
false
||
feature
.
Dim
()
==
0
)
{
if
(
result
==
false
||
feature
.
Dim
()
==
0
)
{
if
(
IsFinished
()
==
false
)
return
false
;
if
(
IsFinished
()
==
false
)
return
false
;
break
;
break
;
}
}
feature_cache_
.
push
(
feature
);
feature_cache_
.
push
(
feature
);
}
}
...
@@ -61,22 +62,22 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
...
@@ -61,22 +62,22 @@ bool Assembler::Compute(Vector<BaseFloat>* feats) {
}
}
while
(
feature_cache_
.
size
()
<
frame_chunk_size_
)
{
while
(
feature_cache_
.
size
()
<
frame_chunk_size_
)
{
Vector
<
BaseFloat
>
feature
(
dim_
,
kaldi
::
kSetZero
);
Vector
<
BaseFloat
>
feature
(
dim_
,
kaldi
::
kSetZero
);
feature_cache_
.
push
(
feature
);
feature_cache_
.
push
(
feature
);
}
}
int32
counter
=
0
;
int32
counter
=
0
;
int32
cache_size
=
frame_chunk_size_
-
frame_chunk_stride_
;
int32
cache_size
=
frame_chunk_size_
-
frame_chunk_stride_
;
int32
elem_dim
=
base_extractor_
->
Dim
();
int32
elem_dim
=
base_extractor_
->
Dim
();
while
(
counter
<
frame_chunk_size_
)
{
while
(
counter
<
frame_chunk_size_
)
{
Vector
<
BaseFloat
>&
val
=
feature_cache_
.
front
();
Vector
<
BaseFloat
>&
val
=
feature_cache_
.
front
();
int32
start
=
counter
*
elem_dim
;
int32
start
=
counter
*
elem_dim
;
feats
->
Range
(
start
,
elem_dim
).
CopyFromVec
(
val
);
feats
->
Range
(
start
,
elem_dim
).
CopyFromVec
(
val
);
if
(
frame_chunk_size_
-
counter
<=
cache_size
)
{
if
(
frame_chunk_size_
-
counter
<=
cache_size
)
{
feature_cache_
.
push
(
val
);
feature_cache_
.
push
(
val
);
}
}
feature_cache_
.
pop
();
feature_cache_
.
pop
();
counter
++
;
counter
++
;
}
}
return
result
;
return
result
;
...
...
speechx/speechx/frontend/audio/assembler.h
浏览文件 @
dfdf450b
...
@@ -25,7 +25,7 @@ struct AssemblerOptions {
...
@@ -25,7 +25,7 @@ struct AssemblerOptions {
int32
receptive_filed_length
;
int32
receptive_filed_length
;
int32
subsampling_rate
;
int32
subsampling_rate
;
int32
nnet_decoder_chunk
;
int32
nnet_decoder_chunk
;
AssemblerOptions
()
AssemblerOptions
()
:
receptive_filed_length
(
1
),
:
receptive_filed_length
(
1
),
subsampling_rate
(
1
),
subsampling_rate
(
1
),
...
@@ -47,15 +47,11 @@ class Assembler : public FrontendInterface {
...
@@ -47,15 +47,11 @@ class Assembler : public FrontendInterface {
// feat dim
// feat dim
virtual
size_t
Dim
()
const
{
return
dim_
;
}
virtual
size_t
Dim
()
const
{
return
dim_
;
}
virtual
void
SetFinished
()
{
virtual
void
SetFinished
()
{
base_extractor_
->
SetFinished
();
}
base_extractor_
->
SetFinished
();
}
virtual
bool
IsFinished
()
const
{
return
base_extractor_
->
IsFinished
();
}
virtual
bool
IsFinished
()
const
{
return
base_extractor_
->
IsFinished
();
}
virtual
void
Reset
()
{
virtual
void
Reset
()
{
base_extractor_
->
Reset
();
}
base_extractor_
->
Reset
();
}
private:
private:
bool
Compute
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
bool
Compute
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
...
...
speechx/speechx/frontend/audio/audio_cache.h
浏览文件 @
dfdf450b
...
@@ -30,7 +30,7 @@ class AudioCache : public FrontendInterface {
...
@@ -30,7 +30,7 @@ class AudioCache : public FrontendInterface {
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
waves
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
waves
);
// the audio dim is 1, one sample, which is useless,
// the audio dim is 1, one sample, which is useless,
// so we return size_(cache samples) instead.
// so we return size_(cache samples) instead.
virtual
size_t
Dim
()
const
{
return
size_
;
}
virtual
size_t
Dim
()
const
{
return
size_
;
}
...
...
speechx/speechx/frontend/audio/fbank.cc
浏览文件 @
dfdf450b
...
@@ -29,19 +29,19 @@ using kaldi::Matrix;
...
@@ -29,19 +29,19 @@ using kaldi::Matrix;
using
std
::
vector
;
using
std
::
vector
;
FbankComputer
::
FbankComputer
(
const
Options
&
opts
)
FbankComputer
::
FbankComputer
(
const
Options
&
opts
)
:
opts_
(
opts
),
:
opts_
(
opts
),
computer_
(
opts
)
{}
computer_
(
opts
)
{}
int32
FbankComputer
::
Dim
()
const
{
int32
FbankComputer
::
Dim
()
const
{
return
opts_
.
mel_opts
.
num_bins
+
(
opts_
.
use_energy
?
1
:
0
);
return
opts_
.
mel_opts
.
num_bins
+
(
opts_
.
use_energy
?
1
:
0
);
}
}
bool
FbankComputer
::
NeedRawLogEnergy
()
{
bool
FbankComputer
::
NeedRawLogEnergy
()
{
return
opts_
.
use_energy
&&
opts_
.
raw_energy
;
return
opts_
.
use_energy
&&
opts_
.
raw_energy
;
}
}
// Compute feat
// Compute feat
bool
FbankComputer
::
Compute
(
Vector
<
BaseFloat
>*
window
,
Vector
<
BaseFloat
>*
feat
)
{
bool
FbankComputer
::
Compute
(
Vector
<
BaseFloat
>*
window
,
Vector
<
BaseFloat
>*
feat
)
{
RealFft
(
window
,
true
);
RealFft
(
window
,
true
);
kaldi
::
ComputePowerSpectrum
(
window
);
kaldi
::
ComputePowerSpectrum
(
window
);
const
kaldi
::
MelBanks
&
mel_bank
=
*
(
computer_
.
GetMelBanks
(
1.0
));
const
kaldi
::
MelBanks
&
mel_bank
=
*
(
computer_
.
GetMelBanks
(
1.0
));
...
...
speechx/speechx/frontend/audio/feature_cache.cc
浏览文件 @
dfdf450b
...
@@ -72,9 +72,9 @@ bool FeatureCache::Compute() {
...
@@ -72,9 +72,9 @@ bool FeatureCache::Compute() {
bool
result
=
base_extractor_
->
Read
(
&
feature
);
bool
result
=
base_extractor_
->
Read
(
&
feature
);
if
(
result
==
false
||
feature
.
Dim
()
==
0
)
return
false
;
if
(
result
==
false
||
feature
.
Dim
()
==
0
)
return
false
;
int32
num_chunk
=
feature
.
Dim
()
/
dim_
;
int32
num_chunk
=
feature
.
Dim
()
/
dim_
;
for
(
int
chunk_idx
=
0
;
chunk_idx
<
num_chunk
;
++
chunk_idx
)
{
for
(
int
chunk_idx
=
0
;
chunk_idx
<
num_chunk
;
++
chunk_idx
)
{
int32
start
=
chunk_idx
*
dim_
;
int32
start
=
chunk_idx
*
dim_
;
Vector
<
BaseFloat
>
feature_chunk
(
dim_
);
Vector
<
BaseFloat
>
feature_chunk
(
dim_
);
SubVector
<
BaseFloat
>
tmp
(
feature
.
Data
()
+
start
,
dim_
);
SubVector
<
BaseFloat
>
tmp
(
feature
.
Data
()
+
start
,
dim_
);
feature_chunk
.
CopyFromVec
(
tmp
);
feature_chunk
.
CopyFromVec
(
tmp
);
...
...
speechx/speechx/frontend/audio/feature_cache.h
浏览文件 @
dfdf450b
...
@@ -22,9 +22,7 @@ namespace ppspeech {
...
@@ -22,9 +22,7 @@ namespace ppspeech {
struct
FeatureCacheOptions
{
struct
FeatureCacheOptions
{
int32
max_size
;
int32
max_size
;
int32
timeout
;
// ms
int32
timeout
;
// ms
FeatureCacheOptions
()
FeatureCacheOptions
()
:
max_size
(
kint16max
),
timeout
(
1
)
{}
:
max_size
(
kint16max
),
timeout
(
1
)
{}
};
};
class
FeatureCache
:
public
FrontendInterface
{
class
FeatureCache
:
public
FrontendInterface
{
...
...
speechx/speechx/frontend/audio/feature_common.h
浏览文件 @
dfdf450b
...
@@ -23,11 +23,11 @@ template <class F>
...
@@ -23,11 +23,11 @@ template <class F>
class
StreamingFeatureTpl
:
public
FrontendInterface
{
class
StreamingFeatureTpl
:
public
FrontendInterface
{
public:
public:
typedef
typename
F
::
Options
Options
;
typedef
typename
F
::
Options
Options
;
StreamingFeatureTpl
(
const
Options
&
opts
,
StreamingFeatureTpl
(
const
Options
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
);
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
);
virtual
void
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
waves
);
virtual
void
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
waves
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
virtual
bool
Read
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
// the dim_ is the dim of single frame feature
// the dim_ is the dim of single frame feature
virtual
size_t
Dim
()
const
{
return
computer_
.
Dim
();
}
virtual
size_t
Dim
()
const
{
return
computer_
.
Dim
();
}
...
@@ -39,8 +39,9 @@ class StreamingFeatureTpl : public FrontendInterface {
...
@@ -39,8 +39,9 @@ class StreamingFeatureTpl : public FrontendInterface {
base_extractor_
->
Reset
();
base_extractor_
->
Reset
();
remained_wav_
.
Resize
(
0
);
remained_wav_
.
Resize
(
0
);
}
}
private:
private:
bool
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
bool
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
);
Options
opts_
;
Options
opts_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
std
::
unique_ptr
<
FrontendInterface
>
base_extractor_
;
...
...
speechx/speechx/frontend/audio/feature_common_inl.h
浏览文件 @
dfdf450b
...
@@ -16,16 +16,15 @@
...
@@ -16,16 +16,15 @@
namespace
ppspeech
{
namespace
ppspeech
{
template
<
class
F
>
template
<
class
F
>
StreamingFeatureTpl
<
F
>::
StreamingFeatureTpl
(
const
Options
&
opts
,
StreamingFeatureTpl
<
F
>::
StreamingFeatureTpl
(
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
)
:
const
Options
&
opts
,
std
::
unique_ptr
<
FrontendInterface
>
base_extractor
)
opts_
(
opts
),
:
opts_
(
opts
),
computer_
(
opts
),
window_function_
(
opts
.
frame_opts
)
{
computer_
(
opts
),
window_function_
(
opts
.
frame_opts
)
{
base_extractor_
=
std
::
move
(
base_extractor
);
base_extractor_
=
std
::
move
(
base_extractor
);
}
}
template
<
class
F
>
template
<
class
F
>
void
StreamingFeatureTpl
<
F
>::
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
waves
)
{
void
StreamingFeatureTpl
<
F
>::
Accept
(
const
kaldi
::
VectorBase
<
kaldi
::
BaseFloat
>&
waves
)
{
base_extractor_
->
Accept
(
waves
);
base_extractor_
->
Accept
(
waves
);
}
}
...
@@ -58,8 +57,9 @@ bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
...
@@ -58,8 +57,9 @@ bool StreamingFeatureTpl<F>::Read(kaldi::Vector<kaldi::BaseFloat>* feats) {
// Compute feat
// Compute feat
template
<
class
F
>
template
<
class
F
>
bool
StreamingFeatureTpl
<
F
>::
Compute
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
bool
StreamingFeatureTpl
<
F
>::
Compute
(
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
)
{
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
waves
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
feats
)
{
const
kaldi
::
FrameExtractionOptions
&
frame_opts
=
const
kaldi
::
FrameExtractionOptions
&
frame_opts
=
computer_
.
GetFrameOptions
();
computer_
.
GetFrameOptions
();
int32
num_samples
=
waves
.
Dim
();
int32
num_samples
=
waves
.
Dim
();
...
@@ -84,9 +84,11 @@ bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& wave
...
@@ -84,9 +84,11 @@ bool StreamingFeatureTpl<F>::Compute(const kaldi::Vector<kaldi::BaseFloat>& wave
&
window
,
&
window
,
need_raw_log_energy
?
&
raw_log_energy
:
NULL
);
need_raw_log_energy
?
&
raw_log_energy
:
NULL
);
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
kaldi
::
kUndefined
);
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
this_feature
(
computer_
.
Dim
(),
kaldi
::
kUndefined
);
computer_
.
Compute
(
&
window
,
&
this_feature
);
computer_
.
Compute
(
&
window
,
&
this_feature
);
kaldi
::
SubVector
<
kaldi
::
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
kaldi
::
SubVector
<
kaldi
::
BaseFloat
>
output_row
(
feats
->
Data
()
+
frame
*
Dim
(),
Dim
());
output_row
.
CopyFromVec
(
this_feature
);
output_row
.
CopyFromVec
(
this_feature
);
}
}
return
true
;
return
true
;
...
...
speechx/speechx/frontend/audio/feature_pipeline.h
浏览文件 @
dfdf450b
...
@@ -16,6 +16,7 @@
...
@@ -16,6 +16,7 @@
#pragma once
#pragma once
#include "frontend/audio/assembler.h"
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/audio_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/fbank.h"
#include "frontend/audio/fbank.h"
...
@@ -23,7 +24,6 @@
...
@@ -23,7 +24,6 @@
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/frontend_itf.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/linear_spectrogram.h"
#include "frontend/audio/normalizer.h"
#include "frontend/audio/normalizer.h"
#include "frontend/audio/assembler.h"
namespace
ppspeech
{
namespace
ppspeech
{
...
...
speechx/speechx/frontend/audio/linear_spectrogram.cc
浏览文件 @
dfdf450b
...
@@ -28,22 +28,21 @@ using kaldi::VectorBase;
...
@@ -28,22 +28,21 @@ using kaldi::VectorBase;
using
kaldi
::
Matrix
;
using
kaldi
::
Matrix
;
using
std
::
vector
;
using
std
::
vector
;
LinearSpectrogramComputer
::
LinearSpectrogramComputer
(
LinearSpectrogramComputer
::
LinearSpectrogramComputer
(
const
Options
&
opts
)
const
Options
&
opts
)
:
opts_
(
opts
)
{
:
opts_
(
opts
)
{
kaldi
::
FeatureWindowFunction
feature_window_function
(
opts
.
frame_opts
);
kaldi
::
FeatureWindowFunction
feature_window_function
(
opts
.
frame_opts
);
int32
window_size
=
opts
.
frame_opts
.
WindowSize
();
int32
window_size
=
opts
.
frame_opts
.
WindowSize
();
frame_length_
=
window_size
;
frame_length_
=
window_size
;
dim_
=
window_size
/
2
+
1
;
dim_
=
window_size
/
2
+
1
;
BaseFloat
hanning_window_energy
=
kaldi
::
VecVec
(
feature_window_function
.
window
,
BaseFloat
hanning_window_energy
=
kaldi
::
VecVec
(
feature_window_function
.
window
);
feature_window_function
.
window
,
feature_window_function
.
window
);
int32
sample_rate
=
opts
.
frame_opts
.
samp_freq
;
int32
sample_rate
=
opts
.
frame_opts
.
samp_freq
;
scale_
=
2.0
/
(
hanning_window_energy
*
sample_rate
);
scale_
=
2.0
/
(
hanning_window_energy
*
sample_rate
);
}
}
// Compute spectrogram feat
// Compute spectrogram feat
bool
LinearSpectrogramComputer
::
Compute
(
Vector
<
BaseFloat
>*
window
,
bool
LinearSpectrogramComputer
::
Compute
(
Vector
<
BaseFloat
>*
window
,
Vector
<
BaseFloat
>*
feat
)
{
Vector
<
BaseFloat
>*
feat
)
{
window
->
Resize
(
frame_length_
,
kaldi
::
kCopyData
);
window
->
Resize
(
frame_length_
,
kaldi
::
kCopyData
);
RealFft
(
window
,
true
);
RealFft
(
window
,
true
);
kaldi
::
ComputePowerSpectrum
(
window
);
kaldi
::
ComputePowerSpectrum
(
window
);
...
...
speechx/speechx/nnet/nnet_forward_main.cc
浏览文件 @
dfdf450b
...
@@ -14,8 +14,8 @@
...
@@ -14,8 +14,8 @@
#include "base/flags.h"
#include "base/flags.h"
#include "base/log.h"
#include "base/log.h"
#include "frontend/audio/data_cache.h"
#include "frontend/audio/assembler.h"
#include "frontend/audio/assembler.h"
#include "frontend/audio/data_cache.h"
#include "kaldi/util/table-types.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/decodable.h"
#include "nnet/paddle_nnet.h"
#include "nnet/paddle_nnet.h"
...
@@ -75,8 +75,8 @@ int main(int argc, char* argv[]) {
...
@@ -75,8 +75,8 @@ int main(int argc, char* argv[]) {
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
std
::
shared_ptr
<
ppspeech
::
Decodable
>
decodable
(
new
ppspeech
::
Decodable
(
nnet
,
raw_data
,
FLAGS_acoustic_scale
));
new
ppspeech
::
Decodable
(
nnet
,
raw_data
,
FLAGS_acoustic_scale
));
int32
chunk_size
=
FLAGS_receptive_field_length
int32
chunk_size
=
FLAGS_receptive_field_length
+
+
(
FLAGS_nnet_decoder_chunk
-
1
)
*
FLAGS_downsampling_rate
;
(
FLAGS_nnet_decoder_chunk
-
1
)
*
FLAGS_downsampling_rate
;
int32
chunk_stride
=
FLAGS_downsampling_rate
*
FLAGS_nnet_decoder_chunk
;
int32
chunk_stride
=
FLAGS_downsampling_rate
*
FLAGS_nnet_decoder_chunk
;
int32
receptive_field_length
=
FLAGS_receptive_field_length
;
int32
receptive_field_length
=
FLAGS_receptive_field_length
;
LOG
(
INFO
)
<<
"chunk size (frame): "
<<
chunk_size
;
LOG
(
INFO
)
<<
"chunk size (frame): "
<<
chunk_size
;
...
@@ -130,7 +130,9 @@ int main(int argc, char* argv[]) {
...
@@ -130,7 +130,9 @@ int main(int argc, char* argv[]) {
vector
<
kaldi
::
BaseFloat
>
prob
;
vector
<
kaldi
::
BaseFloat
>
prob
;
while
(
decodable
->
FrameLikelihood
(
frame_idx
,
&
prob
))
{
while
(
decodable
->
FrameLikelihood
(
frame_idx
,
&
prob
))
{
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
vec_tmp
(
prob
.
size
());
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
vec_tmp
(
prob
.
size
());
std
::
memcpy
(
vec_tmp
.
Data
(),
prob
.
data
(),
sizeof
(
kaldi
::
BaseFloat
)
*
prob
.
size
());
std
::
memcpy
(
vec_tmp
.
Data
(),
prob
.
data
(),
sizeof
(
kaldi
::
BaseFloat
)
*
prob
.
size
());
prob_vec
.
push_back
(
vec_tmp
);
prob_vec
.
push_back
(
vec_tmp
);
frame_idx
++
;
frame_idx
++
;
}
}
...
@@ -142,7 +144,8 @@ int main(int argc, char* argv[]) {
...
@@ -142,7 +144,8 @@ int main(int argc, char* argv[]) {
KALDI_LOG
<<
" the nnet prob of "
<<
utt
<<
" is empty"
;
KALDI_LOG
<<
" the nnet prob of "
<<
utt
<<
" is empty"
;
continue
;
continue
;
}
}
kaldi
::
Matrix
<
kaldi
::
BaseFloat
>
result
(
prob_vec
.
size
(),
prob_vec
[
0
].
Dim
());
kaldi
::
Matrix
<
kaldi
::
BaseFloat
>
result
(
prob_vec
.
size
(),
prob_vec
[
0
].
Dim
());
for
(
int32
row_idx
=
0
;
row_idx
<
prob_vec
.
size
();
++
row_idx
)
{
for
(
int32
row_idx
=
0
;
row_idx
<
prob_vec
.
size
();
++
row_idx
)
{
for
(
int32
col_idx
=
0
;
col_idx
<
prob_vec
[
0
].
Dim
();
++
col_idx
)
{
for
(
int32
col_idx
=
0
;
col_idx
<
prob_vec
[
0
].
Dim
();
++
col_idx
)
{
result
(
row_idx
,
col_idx
)
=
prob_vec
[
row_idx
](
col_idx
);
result
(
row_idx
,
col_idx
)
=
prob_vec
[
row_idx
](
col_idx
);
...
...
speechx/speechx/protocol/websocket/websocket_client.h
浏览文件 @
dfdf450b
...
@@ -40,8 +40,8 @@ class WebSocketClient {
...
@@ -40,8 +40,8 @@ class WebSocketClient {
void
SendEndSignal
();
void
SendEndSignal
();
void
SendDataEnd
();
void
SendDataEnd
();
bool
Done
()
const
{
return
done_
;
}
bool
Done
()
const
{
return
done_
;
}
std
::
string
GetResult
()
const
{
return
result_
;
}
std
::
string
GetResult
()
const
{
return
result_
;
}
std
::
string
GetPartialResult
()
const
{
return
partial_result_
;}
std
::
string
GetPartialResult
()
const
{
return
partial_result_
;
}
private:
private:
void
Connect
();
void
Connect
();
...
...
speechx/speechx/protocol/websocket/websocket_server.cc
浏览文件 @
dfdf450b
...
@@ -76,9 +76,10 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
...
@@ -76,9 +76,10 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
recognizer_
->
Accept
(
pcm_data
);
recognizer_
->
Accept
(
pcm_data
);
std
::
string
partial_result
=
recognizer_
->
GetPartialResult
();
std
::
string
partial_result
=
recognizer_
->
GetPartialResult
();
json
::
value
rv
=
{
json
::
value
rv
=
{{
"status"
,
"ok"
},
{
"status"
,
"ok"
},
{
"type"
,
"partial_result"
},
{
"result"
,
partial_result
}};
{
"type"
,
"partial_result"
},
{
"result"
,
partial_result
}};
ws_
.
text
(
true
);
ws_
.
text
(
true
);
ws_
.
write
(
asio
::
buffer
(
json
::
serialize
(
rv
)));
ws_
.
write
(
asio
::
buffer
(
json
::
serialize
(
rv
)));
}
}
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录