Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
cd1ced4e
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 2 年 前同步成功
通知
210
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
cd1ced4e
编写于
10月 11, 2022
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add nnetout struct
上级
290c23b9
变更
9
隐藏空白更改
内联
并排
Showing
9 changed file
with
59 addition
and
43 deletion
+59
-43
speechx/examples/ds2_ol/aishell/run.sh
speechx/examples/ds2_ol/aishell/run.sh
+2
-2
speechx/speechx/nnet/CMakeLists.txt
speechx/speechx/nnet/CMakeLists.txt
+0
-1
speechx/speechx/nnet/decodable.cc
speechx/speechx/nnet/decodable.cc
+13
-12
speechx/speechx/nnet/decodable.h
speechx/speechx/nnet/decodable.h
+1
-1
speechx/speechx/nnet/ds2_nnet.cc
speechx/speechx/nnet/ds2_nnet.cc
+9
-6
speechx/speechx/nnet/ds2_nnet.h
speechx/speechx/nnet/ds2_nnet.h
+2
-3
speechx/speechx/nnet/nnet_itf.h
speechx/speechx/nnet/nnet_itf.h
+14
-3
speechx/speechx/nnet/u2_nnet.cc
speechx/speechx/nnet/u2_nnet.cc
+10
-8
speechx/speechx/nnet/u2_nnet.h
speechx/speechx/nnet/u2_nnet.h
+8
-7
未找到文件。
speechx/examples/ds2_ol/aishell/run.sh
浏览文件 @
cd1ced4e
#!/bin/bash
#!/bin/bash
set
+
x
set
-
x
set
-e
set
-e
.
path.sh
.
path.sh
...
@@ -11,7 +11,7 @@ stop_stage=100
...
@@ -11,7 +11,7 @@ stop_stage=100
.
utils/parse_options.sh
.
utils/parse_options.sh
# 1. compile
# 1. compile
if
[
!
-d
${
SPEECHX_
EXAMPLES
}
]
;
then
if
[
!
-d
${
SPEECHX_
BUILD
}
]
;
then
pushd
${
SPEECHX_ROOT
}
pushd
${
SPEECHX_ROOT
}
bash build.sh
bash build.sh
popd
popd
...
...
speechx/speechx/nnet/CMakeLists.txt
浏览文件 @
cd1ced4e
...
@@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings)
...
@@ -14,7 +14,6 @@ target_link_libraries(nnet absl::strings)
if
(
USING_U2
)
if
(
USING_U2
)
target_compile_options
(
nnet PUBLIC
${
PADDLE_COMPILE_FLAGS
}
)
target_compile_options
(
nnet PUBLIC
${
PADDLE_COMPILE_FLAGS
}
)
target_include_directories
(
nnet PUBLIC
${
pybind11_INCLUDE_DIRS
}
${
PROJECT_SOURCE_DIR
}
)
target_include_directories
(
nnet PUBLIC
${
pybind11_INCLUDE_DIRS
}
${
PROJECT_SOURCE_DIR
}
)
# target_link_libraries(nnet ${PYTHON_LIBRARIES} ${PADDLE_LINK_FLAGS})
endif
()
endif
()
...
...
speechx/speechx/nnet/decodable.cc
浏览文件 @
cd1ced4e
...
@@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
...
@@ -32,7 +32,7 @@ Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
// for debug
// for debug
void
Decodable
::
Acceptlikelihood
(
const
Matrix
<
BaseFloat
>&
likelihood
)
{
void
Decodable
::
Acceptlikelihood
(
const
Matrix
<
BaseFloat
>&
likelihood
)
{
nnet_cache_
=
likelihood
;
nnet_
out_
cache_
=
likelihood
;
frames_ready_
+=
likelihood
.
NumRows
();
frames_ready_
+=
likelihood
.
NumRows
();
}
}
...
@@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; }
...
@@ -56,13 +56,13 @@ int32 Decodable::NumIndices() const { return 0; }
int32
Decodable
::
TokenId2NnetId
(
int32
token_id
)
{
return
token_id
-
1
;
}
int32
Decodable
::
TokenId2NnetId
(
int32
token_id
)
{
return
token_id
-
1
;
}
BaseFloat
Decodable
::
LogLikelihood
(
int32
frame
,
int32
index
)
{
BaseFloat
Decodable
::
LogLikelihood
(
int32
frame
,
int32
index
)
{
CHECK_LE
(
index
,
nnet_cache_
.
NumCols
());
CHECK_LE
(
index
,
nnet_
out_
cache_
.
NumCols
());
CHECK_LE
(
frame
,
frames_ready_
);
CHECK_LE
(
frame
,
frames_ready_
);
int32
frame_idx
=
frame
-
frame_offset_
;
int32
frame_idx
=
frame
-
frame_offset_
;
// the nnet output is prob ranther than log prob
// the nnet output is prob ranther than log prob
// the index - 1, because the ilabel
// the index - 1, because the ilabel
return
acoustic_scale_
*
return
acoustic_scale_
*
std
::
log
(
nnet_cache_
(
frame_idx
,
TokenId2NnetId
(
index
))
+
std
::
log
(
nnet_
out_
cache_
(
frame_idx
,
TokenId2NnetId
(
index
))
+
std
::
numeric_limits
<
float
>::
min
());
std
::
numeric_limits
<
float
>::
min
());
}
}
...
@@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() {
...
@@ -82,17 +82,18 @@ bool Decodable::AdvanceChunk() {
}
}
// forward feats
// forward feats
int32
vocab_dim
=
0
;
NnetOut
out
;
Vector
<
BaseFloat
>
probs
;
nnet_
->
FeedForward
(
features
,
frontend_
->
Dim
(),
&
out
);
nnet_
->
FeedForward
(
features
,
frontend_
->
Dim
(),
&
probs
,
&
vocab_dim
);
int32
&
vocab_dim
=
out
.
vocab_dim
;
Vector
<
BaseFloat
>&
probs
=
out
.
logprobs
;
// cache nnet outupts
// cache nnet outupts
nnet_cache_
.
Resize
(
probs
.
Dim
()
/
vocab_dim
,
vocab_dim
);
nnet_
out_
cache_
.
Resize
(
probs
.
Dim
()
/
vocab_dim
,
vocab_dim
);
nnet_cache_
.
CopyRowsFromVec
(
probs
);
nnet_
out_
cache_
.
CopyRowsFromVec
(
probs
);
// update state
// update state
frame_offset_
=
frames_ready_
;
frame_offset_
=
frames_ready_
;
frames_ready_
+=
nnet_cache_
.
NumRows
();
frames_ready_
+=
nnet_
out_
cache_
.
NumRows
();
return
true
;
return
true
;
}
}
...
@@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
...
@@ -102,12 +103,12 @@ bool Decodable::FrameLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
return
false
;
return
false
;
}
}
int
vocab_size
=
nnet_cache_
.
NumCols
();
int
vocab_size
=
nnet_
out_
cache_
.
NumCols
();
likelihood
->
resize
(
vocab_size
);
likelihood
->
resize
(
vocab_size
);
for
(
int32
idx
=
0
;
idx
<
vocab_size
;
++
idx
)
{
for
(
int32
idx
=
0
;
idx
<
vocab_size
;
++
idx
)
{
(
*
likelihood
)[
idx
]
=
(
*
likelihood
)[
idx
]
=
nnet_cache_
(
frame
-
frame_offset_
,
idx
)
*
acoustic_scale_
;
nnet_
out_
cache_
(
frame
-
frame_offset_
,
idx
)
*
acoustic_scale_
;
}
}
return
true
;
return
true
;
}
}
...
@@ -117,7 +118,7 @@ void Decodable::Reset() {
...
@@ -117,7 +118,7 @@ void Decodable::Reset() {
if
(
nnet_
!=
nullptr
)
nnet_
->
Reset
();
if
(
nnet_
!=
nullptr
)
nnet_
->
Reset
();
frame_offset_
=
0
;
frame_offset_
=
0
;
frames_ready_
=
0
;
frames_ready_
=
0
;
nnet_cache_
.
Resize
(
0
,
0
);
nnet_
out_
cache_
.
Resize
(
0
,
0
);
}
}
}
// namespace ppspeech
}
// namespace ppspeech
\ No newline at end of file
speechx/speechx/nnet/decodable.h
浏览文件 @
cd1ced4e
...
@@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface {
...
@@ -62,7 +62,7 @@ class Decodable : public kaldi::DecodableInterface {
std
::
shared_ptr
<
NnetInterface
>
nnet_
;
std
::
shared_ptr
<
NnetInterface
>
nnet_
;
// nnet outputs' cache
// nnet outputs' cache
kaldi
::
Matrix
<
kaldi
::
BaseFloat
>
nnet_cache_
;
kaldi
::
Matrix
<
kaldi
::
BaseFloat
>
nnet_
out_
cache_
;
// the frame is nnet prob frame rather than audio feature frame
// the frame is nnet prob frame rather than audio feature frame
// nnet frame subsample the feature frame
// nnet frame subsample the feature frame
...
...
speechx/speechx/nnet/ds2_nnet.cc
浏览文件 @
cd1ced4e
...
@@ -143,9 +143,8 @@ shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) {
...
@@ -143,9 +143,8 @@ shared_ptr<Tensor<BaseFloat>> PaddleNnet::GetCacheEncoder(const string& name) {
}
}
void
PaddleNnet
::
FeedForward
(
const
Vector
<
BaseFloat
>&
features
,
void
PaddleNnet
::
FeedForward
(
const
Vector
<
BaseFloat
>&
features
,
int32
feature_dim
,
const
int32
&
feature_dim
,
Vector
<
BaseFloat
>*
inferences
,
NnetOut
*
out
)
{
int32
*
inference_dim
)
{
paddle_infer
::
Predictor
*
predictor
=
GetPredictor
();
paddle_infer
::
Predictor
*
predictor
=
GetPredictor
();
int
feat_row
=
features
.
Dim
()
/
feature_dim
;
int
feat_row
=
features
.
Dim
()
/
feature_dim
;
...
@@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
...
@@ -203,9 +202,13 @@ void PaddleNnet::FeedForward(const Vector<BaseFloat>& features,
std
::
vector
<
int
>
output_shape
=
output_tensor
->
shape
();
std
::
vector
<
int
>
output_shape
=
output_tensor
->
shape
();
int32
row
=
output_shape
[
1
];
int32
row
=
output_shape
[
1
];
int32
col
=
output_shape
[
2
];
int32
col
=
output_shape
[
2
];
inferences
->
Resize
(
row
*
col
);
*
inference_dim
=
col
;
output_tensor
->
CopyToCpu
(
inferences
->
Data
());
// inferences->Resize(row * col);
// *inference_dim = col;
out
->
logprobs
.
Resize
(
row
*
col
);
out
->
vocab_dim
=
col
;
output_tensor
->
CopyToCpu
(
out
->
logprobs
.
Data
());
ReleasePredictor
(
predictor
);
ReleasePredictor
(
predictor
);
}
}
...
...
speechx/speechx/nnet/ds2_nnet.h
浏览文件 @
cd1ced4e
...
@@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface {
...
@@ -97,9 +97,8 @@ class PaddleNnet : public NnetInterface {
PaddleNnet
(
const
ModelOptions
&
opts
);
PaddleNnet
(
const
ModelOptions
&
opts
);
virtual
void
FeedForward
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
features
,
virtual
void
FeedForward
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
features
,
int32
feature_dim
,
const
int32
&
feature_dim
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
inferences
,
NnetOut
*
out
);
int32
*
inference_dim
);
void
Dim
();
void
Dim
();
virtual
void
Reset
();
virtual
void
Reset
();
...
...
speechx/speechx/nnet/nnet_itf.h
浏览文件 @
cd1ced4e
...
@@ -21,12 +21,23 @@
...
@@ -21,12 +21,23 @@
namespace
ppspeech
{
namespace
ppspeech
{
struct
NnetOut
{
// nnet out, maybe logprob or prob
kaldi
::
Vector
<
kaldi
::
BaseFloat
>
logprobs
;
int32
vocab_dim
;
// nnet state. Only using in Attention model.
std
::
vector
<
std
::
vector
<
kaldi
::
BaseFloat
>>
encoder_outs
;
NnetOut
()
:
logprobs
({}),
vocab_dim
(
-
1
),
encoder_outs
({})
{}
};
class
NnetInterface
{
class
NnetInterface
{
public:
public:
virtual
void
FeedForward
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
features
,
virtual
void
FeedForward
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
features
,
int32
feature_dim
,
const
int32
&
feature_dim
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
inferences
,
NnetOut
*
out
)
=
0
;
int32
*
inference_dim
)
=
0
;
virtual
void
Reset
()
=
0
;
virtual
void
Reset
()
=
0
;
virtual
~
NnetInterface
()
{}
virtual
~
NnetInterface
()
{}
};
};
...
...
speechx/speechx/nnet/u2_nnet.cc
浏览文件 @
cd1ced4e
...
@@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
...
@@ -64,7 +64,7 @@ void U2NnetBase::CacheFeature(const std::vector<kaldi::BaseFloat>& chunk_feats,
void
U2NnetBase
::
ForwardEncoderChunk
(
void
U2NnetBase
::
ForwardEncoderChunk
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
int32
feat_dim
,
const
int32
&
feat_dim
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
int32
*
vocab_dim
)
{
int32
*
vocab_dim
)
{
ctc_probs
->
clear
();
ctc_probs
->
clear
();
...
@@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
...
@@ -221,16 +221,17 @@ void U2Nnet::FeedEncoderOuts(paddle::Tensor& encoder_out) {
void
U2Nnet
::
FeedForward
(
const
kaldi
::
Vector
<
BaseFloat
>&
features
,
void
U2Nnet
::
FeedForward
(
const
kaldi
::
Vector
<
BaseFloat
>&
features
,
int32
feature_dim
,
const
int32
&
feature_dim
,
kaldi
::
Vector
<
BaseFloat
>*
inferences
,
NnetOut
*
out
)
{
int32
*
inference_dim
)
{
std
::
vector
<
kaldi
::
BaseFloat
>
chunk_feats
(
features
.
Data
(),
std
::
vector
<
kaldi
::
BaseFloat
>
chunk_feats
(
features
.
Data
(),
features
.
Data
()
+
features
.
Dim
());
features
.
Data
()
+
features
.
Dim
());
std
::
vector
<
kaldi
::
BaseFloat
>
ctc_probs
;
std
::
vector
<
kaldi
::
BaseFloat
>
ctc_probs
;
ForwardEncoderChunkImpl
(
ForwardEncoderChunkImpl
(
chunk_feats
,
feature_dim
,
&
ctc_probs
,
inference_dim
);
chunk_feats
,
feature_dim
,
&
ctc_probs
,
&
out
->
vocab_dim
);
inferences
->
Resize
(
ctc_probs
.
size
(),
kaldi
::
kSetZero
);
std
::
memcpy
(
inferences
->
Data
(),
out
->
logprobs
.
Resize
(
ctc_probs
.
size
(),
kaldi
::
kSetZero
);
std
::
memcpy
(
out
->
logprobs
.
Data
(),
ctc_probs
.
data
(),
ctc_probs
.
data
(),
ctc_probs
.
size
()
*
sizeof
(
kaldi
::
BaseFloat
));
ctc_probs
.
size
()
*
sizeof
(
kaldi
::
BaseFloat
));
}
}
...
@@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
...
@@ -238,9 +239,10 @@ void U2Nnet::FeedForward(const kaldi::Vector<BaseFloat>& features,
void
U2Nnet
::
ForwardEncoderChunkImpl
(
void
U2Nnet
::
ForwardEncoderChunkImpl
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
int32
feat_dim
,
const
int32
&
feat_dim
,
std
::
vector
<
kaldi
::
BaseFloat
>*
out_prob
,
std
::
vector
<
kaldi
::
BaseFloat
>*
out_prob
,
int32
*
vocab_dim
)
{
int32
*
vocab_dim
)
{
#ifdef USE_PROFILING
#ifdef USE_PROFILING
RecordEvent
event
(
RecordEvent
event
(
"ForwardEncoderChunkImpl"
,
TracerEventType
::
UserDefined
,
1
);
"ForwardEncoderChunkImpl"
,
TracerEventType
::
UserDefined
,
1
);
...
...
speechx/speechx/nnet/u2_nnet.h
浏览文件 @
cd1ced4e
...
@@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface {
...
@@ -61,7 +61,7 @@ class U2NnetBase : public NnetInterface {
virtual
void
ForwardEncoderChunk
(
virtual
void
ForwardEncoderChunk
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
int32
feat_dim
,
const
int32
&
feat_dim
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
int32
*
vocab_dim
);
int32
*
vocab_dim
);
...
@@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface {
...
@@ -72,7 +72,7 @@ class U2NnetBase : public NnetInterface {
protected:
protected:
virtual
void
ForwardEncoderChunkImpl
(
virtual
void
ForwardEncoderChunkImpl
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
int32
feat_dim
,
const
int32
&
feat_dim
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
int32
*
vocab_dim
)
=
0
;
int32
*
vocab_dim
)
=
0
;
...
@@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface {
...
@@ -93,7 +93,7 @@ class U2NnetBase : public NnetInterface {
// case. Otherwise, none streaming case
// case. Otherwise, none streaming case
int
num_left_chunks_
{
-
1
};
// -1 means all left chunks
int
num_left_chunks_
{
-
1
};
// -1 means all left chunks
// asr decoder state
// asr decoder state
, not used in nnet
int
offset_
{
0
};
// current offset in encoder output time stamp. Used by
int
offset_
{
0
};
// current offset in encoder output time stamp. Used by
// position embedding.
// position embedding.
std
::
vector
<
std
::
vector
<
float
>>
cached_feats_
{};
// features cache
std
::
vector
<
std
::
vector
<
float
>>
cached_feats_
{};
// features cache
...
@@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase {
...
@@ -106,9 +106,8 @@ class U2Nnet : public U2NnetBase {
U2Nnet
(
const
U2Nnet
&
other
);
U2Nnet
(
const
U2Nnet
&
other
);
void
FeedForward
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
features
,
void
FeedForward
(
const
kaldi
::
Vector
<
kaldi
::
BaseFloat
>&
features
,
int32
feature_dim
,
const
int32
&
feature_dim
,
kaldi
::
Vector
<
kaldi
::
BaseFloat
>*
inferences
,
NnetOut
*
out
)
override
;
int32
*
inference_dim
)
override
;
void
Reset
()
override
;
void
Reset
()
override
;
...
@@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase {
...
@@ -123,7 +122,7 @@ class U2Nnet : public U2NnetBase {
void
ForwardEncoderChunkImpl
(
void
ForwardEncoderChunkImpl
(
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
const
std
::
vector
<
kaldi
::
BaseFloat
>&
chunk_feats
,
int32
feat_dim
,
const
int32
&
feat_dim
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
std
::
vector
<
kaldi
::
BaseFloat
>*
ctc_probs
,
int32
*
vocab_dim
)
override
;
int32
*
vocab_dim
)
override
;
...
@@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase {
...
@@ -138,6 +137,8 @@ class U2Nnet : public U2NnetBase {
// debug
// debug
void
FeedEncoderOuts
(
paddle
::
Tensor
&
encoder_out
);
void
FeedEncoderOuts
(
paddle
::
Tensor
&
encoder_out
);
const
std
::
vector
<
paddle
::
Tensor
>&
EncoderOuts
()
const
{
return
encoder_outs_
;
}
private:
private:
U2ModelOptions
opts_
;
U2ModelOptions
opts_
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录