Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
models
提交
e6740af4
M
models
项目概览
PaddlePaddle
/
models
大约 1 年 前同步成功
通知
222
Star
6828
Fork
2962
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
602
列表
看板
标记
里程碑
合并请求
255
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
models
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
602
Issue
602
列表
看板
标记
里程碑
合并请求
255
合并请求
255
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e6740af4
编写于
9月 17, 2017
作者:
Y
Yibing Liu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
adjust scorer's init & add logging for scorer & separate long functions
上级
15728d04
变更
23
隐藏空白更改
内联
并排
Showing
23 changed file
with
310 addition
and
239 deletion
+310
-239
deep_speech_2/README.md
deep_speech_2/README.md
+0
-1
deep_speech_2/decoders/decoders_deprecated.py
deep_speech_2/decoders/decoders_deprecated.py
+3
-3
deep_speech_2/decoders/scorer_deprecated.py
deep_speech_2/decoders/scorer_deprecated.py
+0
-0
deep_speech_2/decoders/swig/ctc_beam_search_decoder.cpp
deep_speech_2/decoders/swig/ctc_beam_search_decoder.cpp
+25
-139
deep_speech_2/decoders/swig/ctc_beam_search_decoder.h
deep_speech_2/decoders/swig/ctc_beam_search_decoder.h
+8
-21
deep_speech_2/decoders/swig/ctc_greedy_decoder.cpp
deep_speech_2/decoders/swig/ctc_greedy_decoder.cpp
+45
-0
deep_speech_2/decoders/swig/ctc_greedy_decoder.h
deep_speech_2/decoders/swig/ctc_greedy_decoder.h
+20
-0
deep_speech_2/decoders/swig/decoder_utils.cpp
deep_speech_2/decoders/swig/decoder_utils.cpp
+65
-0
deep_speech_2/decoders/swig/decoder_utils.h
deep_speech_2/decoders/swig/decoder_utils.h
+26
-13
deep_speech_2/decoders/swig/decoders.i
deep_speech_2/decoders/swig/decoders.i
+4
-2
deep_speech_2/decoders/swig/path_trie.h
deep_speech_2/decoders/swig/path_trie.h
+4
-5
deep_speech_2/decoders/swig/scorer.cpp
deep_speech_2/decoders/swig/scorer.cpp
+30
-12
deep_speech_2/decoders/swig/scorer.h
deep_speech_2/decoders/swig/scorer.h
+23
-12
deep_speech_2/decoders/swig/setup.py
deep_speech_2/decoders/swig/setup.py
+11
-2
deep_speech_2/decoders/swig/setup.sh
deep_speech_2/decoders/swig/setup.sh
+1
-1
deep_speech_2/decoders/swig_wrapper.py
deep_speech_2/decoders/swig_wrapper.py
+11
-11
deep_speech_2/examples/tiny/run_infer.sh
deep_speech_2/examples/tiny/run_infer.sh
+3
-3
deep_speech_2/examples/tiny/run_infer_golden.sh
deep_speech_2/examples/tiny/run_infer_golden.sh
+3
-3
deep_speech_2/examples/tiny/run_test.sh
deep_speech_2/examples/tiny/run_test.sh
+3
-3
deep_speech_2/examples/tiny/run_test_golden.sh
deep_speech_2/examples/tiny/run_test_golden.sh
+3
-3
deep_speech_2/infer.py
deep_speech_2/infer.py
+1
-0
deep_speech_2/model_utils/model.py
deep_speech_2/model_utils/model.py
+20
-5
deep_speech_2/test.py
deep_speech_2/test.py
+1
-0
未找到文件。
deep_speech_2/README.md
浏览文件 @
e6740af4
...
...
@@ -176,7 +176,6 @@ Data augmentation has often been a highly effective technique to boost the deep
Six optional augmentation components are provided to be selected, configured and inserted into the processing pipeline.
### Inference
-
Volume Perturbation
-
Speed Perturbation
-
Shifting Perturbation
...
...
deep_speech_2/decoders/decoder_deprecated.py
→
deep_speech_2/decoders/decoder
s
_deprecated.py
浏览文件 @
e6740af4
...
...
@@ -119,7 +119,7 @@ def ctc_beam_search_decoder(probs_seq,
cutoff_len
+=
1
if
cum_prob
>=
cutoff_prob
:
break
cutoff_len
=
min
(
cutoff_
top_
n
,
cutoff_top_n
)
cutoff_len
=
min
(
cutoff_
le
n
,
cutoff_top_n
)
prob_idx
=
prob_idx
[
0
:
cutoff_len
]
for
l
in
prefix_set_prev
:
...
...
@@ -228,8 +228,8 @@ def ctc_beam_search_decoder_batch(probs_split,
pool
=
multiprocessing
.
Pool
(
processes
=
num_processes
)
results
=
[]
for
i
,
probs_list
in
enumerate
(
probs_split
):
args
=
(
probs_list
,
beam_size
,
vocabulary
,
blank_id
,
cutoff_prob
,
cutoff_top_n
,
None
,
nproc
)
args
=
(
probs_list
,
beam_size
,
vocabulary
,
cutoff_prob
,
cutoff_top_n
,
None
,
nproc
)
results
.
append
(
pool
.
apply_async
(
ctc_beam_search_decoder
,
args
))
pool
.
close
()
...
...
deep_speech_2/decoders/
lm_
scorer_deprecated.py
→
deep_speech_2/decoders/scorer_deprecated.py
浏览文件 @
e6740af4
文件已移动
deep_speech_2/decoders/swig/ctc_
decoders
.cpp
→
deep_speech_2/decoders/swig/ctc_
beam_search_decoder
.cpp
浏览文件 @
e6740af4
#include "ctc_
decoders
.h"
#include "ctc_
beam_search_decoder
.h"
#include <algorithm>
#include <cmath>
...
...
@@ -9,59 +9,19 @@
#include "ThreadPool.h"
#include "fst/fstlib.h"
#include "fst/log.h"
#include "decoder_utils.h"
#include "path_trie.h"
std
::
string
ctc_greedy_decoder
(
const
std
::
vector
<
std
::
vector
<
double
>>
&
probs_seq
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
)
{
// dimension check
size_t
num_time_steps
=
probs_seq
.
size
();
for
(
size_t
i
=
0
;
i
<
num_time_steps
;
++
i
)
{
VALID_CHECK_EQ
(
probs_seq
[
i
].
size
(),
vocabulary
.
size
()
+
1
,
"The shape of probs_seq does not match with "
"the shape of the vocabulary"
);
}
size_t
blank_id
=
vocabulary
.
size
();
std
::
vector
<
size_t
>
max_idx_vec
;
for
(
size_t
i
=
0
;
i
<
num_time_steps
;
++
i
)
{
double
max_prob
=
0.0
;
size_t
max_idx
=
0
;
for
(
size_t
j
=
0
;
j
<
probs_seq
[
i
].
size
();
j
++
)
{
if
(
max_prob
<
probs_seq
[
i
][
j
])
{
max_idx
=
j
;
max_prob
=
probs_seq
[
i
][
j
];
}
}
max_idx_vec
.
push_back
(
max_idx
);
}
std
::
vector
<
size_t
>
idx_vec
;
for
(
size_t
i
=
0
;
i
<
max_idx_vec
.
size
();
++
i
)
{
if
((
i
==
0
)
||
((
i
>
0
)
&&
max_idx_vec
[
i
]
!=
max_idx_vec
[
i
-
1
]))
{
idx_vec
.
push_back
(
max_idx_vec
[
i
]);
}
}
std
::
string
best_path_result
;
for
(
size_t
i
=
0
;
i
<
idx_vec
.
size
();
++
i
)
{
if
(
idx_vec
[
i
]
!=
blank_id
)
{
best_path_result
+=
vocabulary
[
idx_vec
[
i
]];
}
}
return
best_path_result
;
}
using
FSTMATCH
=
fst
::
SortedMatcher
<
fst
::
StdVectorFst
>
;
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>
ctc_beam_search_decoder
(
const
std
::
vector
<
std
::
vector
<
double
>>
&
probs_seq
,
const
size_t
beam_size
,
size_t
beam_size
,
std
::
vector
<
std
::
string
>
vocabulary
,
const
double
cutoff_prob
,
const
size_t
cutoff_top_n
,
double
cutoff_prob
,
size_t
cutoff_top_n
,
Scorer
*
ext_scorer
)
{
// dimension check
size_t
num_time_steps
=
probs_seq
.
size
();
...
...
@@ -80,7 +40,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
std
::
find
(
vocabulary
.
begin
(),
vocabulary
.
end
(),
" "
);
int
space_id
=
it
-
vocabulary
.
begin
();
// if no space in vocabulary
if
(
space_id
>=
vocabulary
.
size
())
{
if
(
(
size_t
)
space_id
>=
vocabulary
.
size
())
{
space_id
=
-
2
;
}
...
...
@@ -90,30 +50,17 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
std
::
vector
<
PathTrie
*>
prefixes
;
prefixes
.
push_back
(
&
root
);
if
(
ext_scorer
!=
nullptr
)
{
if
(
ext_scorer
->
is_char_map_empty
())
{
ext_scorer
->
set_char_map
(
vocabulary
);
}
if
(
!
ext_scorer
->
is_character_based
())
{
if
(
ext_scorer
->
dictionary
==
nullptr
)
{
// fill dictionary for fst with space
ext_scorer
->
fill_dictionary
(
true
);
}
auto
fst_dict
=
static_cast
<
fst
::
StdVectorFst
*>
(
ext_scorer
->
dictionary
);
fst
::
StdVectorFst
*
dict_ptr
=
fst_dict
->
Copy
(
true
);
root
.
set_dictionary
(
dict_ptr
);
auto
matcher
=
std
::
make_shared
<
FSTMATCH
>
(
*
dict_ptr
,
fst
::
MATCH_INPUT
);
root
.
set_matcher
(
matcher
);
}
if
(
ext_scorer
!=
nullptr
&&
!
ext_scorer
->
is_character_based
())
{
auto
fst_dict
=
static_cast
<
fst
::
StdVectorFst
*>
(
ext_scorer
->
dictionary
);
fst
::
StdVectorFst
*
dict_ptr
=
fst_dict
->
Copy
(
true
);
root
.
set_dictionary
(
dict_ptr
);
auto
matcher
=
std
::
make_shared
<
FSTMATCH
>
(
*
dict_ptr
,
fst
::
MATCH_INPUT
);
root
.
set_matcher
(
matcher
);
}
// prefix search over time
for
(
size_t
time_step
=
0
;
time_step
<
num_time_steps
;
time_step
++
)
{
std
::
vector
<
double
>
prob
=
probs_seq
[
time_step
];
std
::
vector
<
std
::
pair
<
int
,
double
>>
prob_idx
;
for
(
size_t
i
=
0
;
i
<
prob
.
size
();
++
i
)
{
prob_idx
.
push_back
(
std
::
pair
<
int
,
double
>
(
i
,
prob
[
i
]));
}
for
(
size_t
time_step
=
0
;
time_step
<
num_time_steps
;
++
time_step
)
{
auto
&
prob
=
probs_seq
[
time_step
];
float
min_cutoff
=
-
NUM_FLT_INF
;
bool
full_beam
=
false
;
...
...
@@ -121,43 +68,20 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
size_t
num_prefixes
=
std
::
min
(
prefixes
.
size
(),
beam_size
);
std
::
sort
(
prefixes
.
begin
(),
prefixes
.
begin
()
+
num_prefixes
,
prefix_compare
);
min_cutoff
=
prefixes
[
num_prefixes
-
1
]
->
score
+
log
(
prob
[
blank_id
])
-
std
::
max
(
0.0
,
ext_scorer
->
beta
);
min_cutoff
=
prefixes
[
num_prefixes
-
1
]
->
score
+
std
::
log
(
prob
[
blank_id
])
-
std
::
max
(
0.0
,
ext_scorer
->
beta
);
full_beam
=
(
num_prefixes
==
beam_size
);
}
// pruning of vacobulary
size_t
cutoff_len
=
prob
.
size
();
if
(
cutoff_prob
<
1.0
||
cutoff_top_n
<
cutoff_len
)
{
std
::
sort
(
prob_idx
.
begin
(),
prob_idx
.
end
(),
pair_comp_second_rev
<
int
,
double
>
);
if
(
cutoff_prob
<
1.0
)
{
double
cum_prob
=
0.0
;
cutoff_len
=
0
;
for
(
size_t
i
=
0
;
i
<
prob_idx
.
size
();
++
i
)
{
cum_prob
+=
prob_idx
[
i
].
second
;
cutoff_len
+=
1
;
if
(
cum_prob
>=
cutoff_prob
)
break
;
}
}
cutoff_len
=
std
::
min
(
cutoff_len
,
cutoff_top_n
);
prob_idx
=
std
::
vector
<
std
::
pair
<
int
,
double
>>
(
prob_idx
.
begin
(),
prob_idx
.
begin
()
+
cutoff_len
);
}
std
::
vector
<
std
::
pair
<
size_t
,
float
>>
log_prob_idx
;
for
(
size_t
i
=
0
;
i
<
cutoff_len
;
++
i
)
{
log_prob_idx
.
push_back
(
std
::
pair
<
int
,
float
>
(
prob_idx
[
i
].
first
,
log
(
prob_idx
[
i
].
second
+
NUM_FLT_MIN
)));
}
std
::
vector
<
std
::
pair
<
size_t
,
float
>>
log_prob_idx
=
get_pruned_log_probs
(
prob
,
cutoff_prob
,
cutoff_top_n
);
// loop over chars
for
(
size_t
index
=
0
;
index
<
log_prob_idx
.
size
();
index
++
)
{
auto
c
=
log_prob_idx
[
index
].
first
;
float
log_prob_c
=
log_prob_idx
[
index
].
second
;
auto
log_prob_c
=
log_prob_idx
[
index
].
second
;
for
(
size_t
i
=
0
;
i
<
prefixes
.
size
()
&&
i
<
beam_size
;
++
i
)
{
auto
prefix
=
prefixes
[
i
];
if
(
full_beam
&&
log_prob_c
+
prefix
->
score
<
min_cutoff
)
{
break
;
}
...
...
@@ -189,7 +113,6 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
if
(
ext_scorer
!=
nullptr
&&
(
c
==
space_id
||
ext_scorer
->
is_character_based
()))
{
PathTrie
*
prefix_toscore
=
nullptr
;
// skip scoring the space
if
(
ext_scorer
->
is_character_based
())
{
prefix_toscore
=
prefix_new
;
...
...
@@ -201,7 +124,6 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
std
::
vector
<
std
::
string
>
ngram
;
ngram
=
ext_scorer
->
make_ngram
(
prefix_toscore
);
score
=
ext_scorer
->
get_log_cond_prob
(
ngram
)
*
ext_scorer
->
alpha
;
log_p
+=
score
;
log_p
+=
ext_scorer
->
beta
;
}
...
...
@@ -221,57 +143,33 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
prefixes
.
begin
()
+
beam_size
,
prefixes
.
end
(),
prefix_compare
);
for
(
size_t
i
=
beam_size
;
i
<
prefixes
.
size
();
++
i
)
{
prefixes
[
i
]
->
remove
();
}
}
}
// end of loop over time
// compute aproximate ctc score as the return score
// compute aproximate ctc score as the return score, without affecting the
// return order of decoding result. To delete when decoder gets stable.
for
(
size_t
i
=
0
;
i
<
beam_size
&&
i
<
prefixes
.
size
();
++
i
)
{
double
approx_ctc
=
prefixes
[
i
]
->
score
;
if
(
ext_scorer
!=
nullptr
)
{
std
::
vector
<
int
>
output
;
prefixes
[
i
]
->
get_path_vec
(
output
);
size_t
prefix_length
=
output
.
size
();
auto
prefix_length
=
output
.
size
();
auto
words
=
ext_scorer
->
split_labels
(
output
);
// remove word insert
approx_ctc
=
approx_ctc
-
prefix_length
*
ext_scorer
->
beta
;
// remove language model weight:
approx_ctc
-=
(
ext_scorer
->
get_sent_log_prob
(
words
))
*
ext_scorer
->
alpha
;
}
prefixes
[
i
]
->
approx_ctc
=
approx_ctc
;
}
// allow for the post processing
std
::
vector
<
PathTrie
*>
space_prefixes
;
if
(
space_prefixes
.
empty
())
{
for
(
size_t
i
=
0
;
i
<
beam_size
&&
i
<
prefixes
.
size
();
++
i
)
{
space_prefixes
.
push_back
(
prefixes
[
i
]);
}
}
std
::
sort
(
space_prefixes
.
begin
(),
space_prefixes
.
end
(),
prefix_compare
);
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>
output_vecs
;
for
(
size_t
i
=
0
;
i
<
beam_size
&&
i
<
space_prefixes
.
size
();
++
i
)
{
std
::
vector
<
int
>
output
;
space_prefixes
[
i
]
->
get_path_vec
(
output
);
// convert index to string
std
::
string
output_str
;
for
(
size_t
j
=
0
;
j
<
output
.
size
();
j
++
)
{
output_str
+=
vocabulary
[
output
[
j
]];
}
std
::
pair
<
double
,
std
::
string
>
output_pair
(
-
space_prefixes
[
i
]
->
approx_ctc
,
output_str
);
output_vecs
.
emplace_back
(
output_pair
);
}
return
output_vecs
;
return
get_beam_search_result
(
prefixes
,
vocabulary
,
beam_size
);
}
std
::
vector
<
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>>
ctc_beam_search_decoder_batch
(
const
std
::
vector
<
std
::
vector
<
std
::
vector
<
double
>>>
&
probs_split
,
...
...
@@ -287,18 +185,6 @@ ctc_beam_search_decoder_batch(
// number of samples
size_t
batch_size
=
probs_split
.
size
();
// scorer filling up
if
(
ext_scorer
!=
nullptr
)
{
if
(
ext_scorer
->
is_char_map_empty
())
{
ext_scorer
->
set_char_map
(
vocabulary
);
}
if
(
!
ext_scorer
->
is_character_based
()
&&
ext_scorer
->
dictionary
==
nullptr
)
{
// init dictionary
ext_scorer
->
fill_dictionary
(
true
);
}
}
// enqueue the tasks of decoding
std
::
vector
<
std
::
future
<
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>>>
res
;
for
(
size_t
i
=
0
;
i
<
batch_size
;
++
i
)
{
...
...
deep_speech_2/decoders/swig/ctc_
decoders
.h
→
deep_speech_2/decoders/swig/ctc_
beam_search_decoder
.h
浏览文件 @
e6740af4
...
...
@@ -7,19 +7,6 @@
#include "scorer.h"
/* CTC Best Path Decoder
*
* Parameters:
* probs_seq: 2-D vector that each element is a vector of probabilities
* over vocabulary of one time step.
* vocabulary: A vector of vocabulary.
* Return:
* The decoding result in string
*/
std
::
string
ctc_greedy_decoder
(
const
std
::
vector
<
std
::
vector
<
double
>>
&
probs_seq
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
);
/* CTC Beam Search Decoder
* Parameters:
...
...
@@ -38,11 +25,11 @@ std::string ctc_greedy_decoder(
*/
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>
ctc_beam_search_decoder
(
const
std
::
vector
<
std
::
vector
<
double
>>
&
probs_seq
,
const
size_t
beam_size
,
size_t
beam_size
,
std
::
vector
<
std
::
string
>
vocabulary
,
const
double
cutoff_prob
=
1
.
0
,
const
size_t
cutoff_top_n
=
40
,
Scorer
*
ext_scorer
=
NULL
);
double
cutoff_prob
=
1
.
0
,
size_t
cutoff_top_n
=
40
,
Scorer
*
ext_scorer
=
nullptr
);
/* CTC Beam Search Decoder for batch data
...
...
@@ -65,11 +52,11 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
std
::
vector
<
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>>
ctc_beam_search_decoder_batch
(
const
std
::
vector
<
std
::
vector
<
std
::
vector
<
double
>>>
&
probs_split
,
const
size_t
beam_size
,
size_t
beam_size
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
,
const
size_t
num_processes
,
size_t
num_processes
,
double
cutoff_prob
=
1
.
0
,
const
size_t
cutoff_top_n
=
40
,
Scorer
*
ext_scorer
=
NULL
);
size_t
cutoff_top_n
=
40
,
Scorer
*
ext_scorer
=
nullptr
);
#endif // CTC_BEAM_SEARCH_DECODER_H_
deep_speech_2/decoders/swig/ctc_greedy_decoder.cpp
0 → 100644
浏览文件 @
e6740af4
#include "ctc_greedy_decoder.h"
#include "decoder_utils.h"
std
::
string
ctc_greedy_decoder
(
const
std
::
vector
<
std
::
vector
<
double
>>
&
probs_seq
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
)
{
// dimension check
size_t
num_time_steps
=
probs_seq
.
size
();
for
(
size_t
i
=
0
;
i
<
num_time_steps
;
++
i
)
{
VALID_CHECK_EQ
(
probs_seq
[
i
].
size
(),
vocabulary
.
size
()
+
1
,
"The shape of probs_seq does not match with "
"the shape of the vocabulary"
);
}
size_t
blank_id
=
vocabulary
.
size
();
std
::
vector
<
size_t
>
max_idx_vec
(
num_time_steps
,
0
);
std
::
vector
<
size_t
>
idx_vec
;
for
(
size_t
i
=
0
;
i
<
num_time_steps
;
++
i
)
{
double
max_prob
=
0.0
;
size_t
max_idx
=
0
;
const
std
::
vector
<
double
>
&
probs_step
=
probs_seq
[
i
];
for
(
size_t
j
=
0
;
j
<
probs_step
.
size
();
++
j
)
{
if
(
max_prob
<
probs_step
[
j
])
{
max_idx
=
j
;
max_prob
=
probs_step
[
j
];
}
}
// id with maximum probability in current step
max_idx_vec
[
i
]
=
max_idx
;
// deduplicate
if
((
i
==
0
)
||
((
i
>
0
)
&&
max_idx_vec
[
i
]
!=
max_idx_vec
[
i
-
1
]))
{
idx_vec
.
push_back
(
max_idx_vec
[
i
]);
}
}
std
::
string
best_path_result
;
for
(
size_t
i
=
0
;
i
<
idx_vec
.
size
();
++
i
)
{
if
(
idx_vec
[
i
]
!=
blank_id
)
{
best_path_result
+=
vocabulary
[
idx_vec
[
i
]];
}
}
return
best_path_result
;
}
deep_speech_2/decoders/swig/ctc_greedy_decoder.h
0 → 100644
浏览文件 @
e6740af4
#ifndef CTC_GREEDY_DECODER_H
#define CTC_GREEDY_DECODER_H
#include <string>
#include <vector>
/* CTC Greedy (Best Path) Decoder
*
* Parameters:
* probs_seq: 2-D vector that each element is a vector of probabilities
* over vocabulary of one time step.
* vocabulary: A vector of vocabulary.
* Return:
* The decoding result in string
*/
std
::
string
ctc_greedy_decoder
(
const
std
::
vector
<
std
::
vector
<
double
>>
&
probs_seq
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
);
#endif // CTC_GREEDY_DECODER_H
deep_speech_2/decoders/swig/decoder_utils.cpp
浏览文件 @
e6740af4
...
...
@@ -4,6 +4,71 @@
#include <cmath>
#include <limits>
std
::
vector
<
std
::
pair
<
size_t
,
float
>>
get_pruned_log_probs
(
const
std
::
vector
<
double
>
&
prob_step
,
double
cutoff_prob
,
size_t
cutoff_top_n
)
{
std
::
vector
<
std
::
pair
<
int
,
double
>>
prob_idx
;
for
(
size_t
i
=
0
;
i
<
prob_step
.
size
();
++
i
)
{
prob_idx
.
push_back
(
std
::
pair
<
int
,
double
>
(
i
,
prob_step
[
i
]));
}
// pruning of vacobulary
size_t
cutoff_len
=
prob_step
.
size
();
if
(
cutoff_prob
<
1.0
||
cutoff_top_n
<
cutoff_len
)
{
std
::
sort
(
prob_idx
.
begin
(),
prob_idx
.
end
(),
pair_comp_second_rev
<
int
,
double
>
);
if
(
cutoff_prob
<
1.0
)
{
double
cum_prob
=
0.0
;
cutoff_len
=
0
;
for
(
size_t
i
=
0
;
i
<
prob_idx
.
size
();
++
i
)
{
cum_prob
+=
prob_idx
[
i
].
second
;
cutoff_len
+=
1
;
if
(
cum_prob
>=
cutoff_prob
)
break
;
}
}
cutoff_len
=
std
::
min
(
cutoff_len
,
cutoff_top_n
);
prob_idx
=
std
::
vector
<
std
::
pair
<
int
,
double
>>
(
prob_idx
.
begin
(),
prob_idx
.
begin
()
+
cutoff_len
);
}
std
::
vector
<
std
::
pair
<
size_t
,
float
>>
log_prob_idx
;
for
(
size_t
i
=
0
;
i
<
cutoff_len
;
++
i
)
{
log_prob_idx
.
push_back
(
std
::
pair
<
int
,
float
>
(
prob_idx
[
i
].
first
,
log
(
prob_idx
[
i
].
second
+
NUM_FLT_MIN
)));
}
return
log_prob_idx
;
}
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>
get_beam_search_result
(
const
std
::
vector
<
PathTrie
*>
&
prefixes
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
,
size_t
beam_size
)
{
// allow for the post processing
std
::
vector
<
PathTrie
*>
space_prefixes
;
if
(
space_prefixes
.
empty
())
{
for
(
size_t
i
=
0
;
i
<
beam_size
&&
i
<
prefixes
.
size
();
++
i
)
{
space_prefixes
.
push_back
(
prefixes
[
i
]);
}
}
std
::
sort
(
space_prefixes
.
begin
(),
space_prefixes
.
end
(),
prefix_compare
);
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>
output_vecs
;
for
(
size_t
i
=
0
;
i
<
beam_size
&&
i
<
space_prefixes
.
size
();
++
i
)
{
std
::
vector
<
int
>
output
;
space_prefixes
[
i
]
->
get_path_vec
(
output
);
// convert index to string
std
::
string
output_str
;
for
(
size_t
j
=
0
;
j
<
output
.
size
();
j
++
)
{
output_str
+=
vocabulary
[
output
[
j
]];
}
std
::
pair
<
double
,
std
::
string
>
output_pair
(
-
space_prefixes
[
i
]
->
approx_ctc
,
output_str
);
output_vecs
.
emplace_back
(
output_pair
);
}
return
output_vecs
;
}
size_t
get_utf8_str_len
(
const
std
::
string
&
str
)
{
size_t
str_len
=
0
;
for
(
char
c
:
str
)
{
...
...
deep_speech_2/decoders/swig/decoder_utils.h
浏览文件 @
e6740af4
...
...
@@ -3,25 +3,26 @@
#include <utility>
#include "path_trie.h"
#include "fst/log.h"
const
float
NUM_FLT_INF
=
std
::
numeric_limits
<
float
>::
max
();
const
float
NUM_FLT_MIN
=
std
::
numeric_limits
<
float
>::
min
();
//
check if __A == _B
#define VALID_CHECK_EQ(__A, __B, __ERR) \
if ((__A) != (__B)) { \
std::ostringstream str; \
st
r << (__A) << " != " << (__B) << ", "; \
throw std::runtime_error(str.str() + __ERR); \
//
inline function for validation check
inline
void
check
(
bool
x
,
const
char
*
expr
,
const
char
*
file
,
int
line
,
const
char
*
err
)
{
if
(
!
x
)
{
st
d
::
cout
<<
"["
<<
file
<<
":"
<<
line
<<
"] "
;
LOG
(
FATAL
)
<<
"
\"
"
<<
expr
<<
"
\"
check failed. "
<<
err
;
}
}
#define VALID_CHECK(x, info) \
check(static_cast<bool>(x), #x, __FILE__, __LINE__, info)
#define VALID_CHECK_EQ(x, y, info) VALID_CHECK((x) == (y), info)
#define VALID_CHECK_GT(x, y, info) VALID_CHECK((x) > (y), info)
#define VALID_CHECK_LT(x, y, info) VALID_CHECK((x) < (y), info)
// check if __A > __B
#define VALID_CHECK_GT(__A, __B, __ERR) \
if ((__A) <= (__B)) { \
std::ostringstream str; \
str << (__A) << " <= " << (__B) << ", "; \
throw std::runtime_error(str.str() + __ERR); \
}
// Function template for comparing two pairs
template
<
typename
T1
,
typename
T2
>
...
...
@@ -47,6 +48,18 @@ T log_sum_exp(const T &x, const T &y) {
return
std
::
log
(
std
::
exp
(
x
-
xmax
)
+
std
::
exp
(
y
-
xmax
))
+
xmax
;
}
// Get pruned probability vector for each time step's beam search
std
::
vector
<
std
::
pair
<
size_t
,
float
>>
get_pruned_log_probs
(
const
std
::
vector
<
double
>
&
prob_step
,
double
cutoff_prob
,
size_t
cutoff_top_n
);
// Get beam search result from prefixes in trie tree
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>
get_beam_search_result
(
const
std
::
vector
<
PathTrie
*>
&
prefixes
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
,
size_t
beam_size
);
// Functor for prefix comparsion
bool
prefix_compare
(
const
PathTrie
*
x
,
const
PathTrie
*
y
);
...
...
deep_speech_2/decoders/swig/decoders.i
浏览文件 @
e6740af4
%
module
swig_decoders
%
{
#
include
"scorer.h"
#
include
"ctc_decoders.h"
#
include
"ctc_greedy_decoder.h"
#
include
"ctc_beam_search_decoder.h"
#
include
"decoder_utils.h"
%
}
...
...
@@ -28,4 +29,5 @@ namespace std {
%
template
(
DoubleStringPairCompFirstRev
)
pair_comp_first_rev
<
double
,
std
::
string
>
;
%
include
"scorer.h"
%
include
"ctc_decoders.h"
%
include
"ctc_greedy_decoder.h"
%
include
"ctc_beam_search_decoder.h"
deep_speech_2/decoders/swig/path_trie.h
浏览文件 @
e6740af4
#ifndef PATH_TRIE_H
#define PATH_TRIE_H
#pragma once
#include <fst/fstlib.h>
#include <algorithm>
#include <limits>
#include <memory>
#include <utility>
#include <vector>
using
FSTMATCH
=
fst
::
SortedMatcher
<
fst
::
StdVectorFst
>
;
#include "fst/fstlib.h"
/* Trie tree for prefix storing and manipulating, with a dictionary in
* finite-state transducer for spelling correction.
...
...
@@ -35,7 +34,7 @@ public:
// set dictionary for FST
void
set_dictionary
(
fst
::
StdVectorFst
*
dictionary
);
void
set_matcher
(
std
::
shared_ptr
<
FSTMATCH
>
matcher
);
void
set_matcher
(
std
::
shared_ptr
<
fst
::
SortedMatcher
<
fst
::
StdVectorFst
>>
);
bool
is_empty
()
{
return
_ROOT
==
character
;
}
...
...
@@ -62,7 +61,7 @@ private:
fst
::
StdVectorFst
*
_dictionary
;
fst
::
StdVectorFst
::
StateId
_dictionary_state
;
// true if finding ars in FST
std
::
shared_ptr
<
FSTMATCH
>
_matcher
;
std
::
shared_ptr
<
fst
::
SortedMatcher
<
fst
::
StdVectorFst
>
>
_matcher
;
};
#endif // PATH_TRIE_H
deep_speech_2/decoders/swig/scorer.cpp
浏览文件 @
e6740af4
...
...
@@ -13,29 +13,47 @@
using
namespace
lm
::
ngram
;
Scorer
::
Scorer
(
double
alpha
,
double
beta
,
const
std
::
string
&
lm_path
)
{
Scorer
::
Scorer
(
double
alpha
,
double
beta
,
const
std
::
string
&
lm_path
,
const
std
::
vector
<
std
::
string
>&
vocab_list
)
{
this
->
alpha
=
alpha
;
this
->
beta
=
beta
;
_is_character_based
=
true
;
_language_model
=
nullptr
;
dictionary
=
nullptr
;
_max_order
=
0
;
_dict_size
=
0
;
_SPACE_ID
=
-
1
;
// load language model
load_LM
(
lm_path
.
c_str
()
);
setup
(
lm_path
,
vocab_list
);
}
Scorer
::~
Scorer
()
{
if
(
_language_model
!=
nullptr
)
if
(
_language_model
!=
nullptr
)
{
delete
static_cast
<
lm
::
base
::
Model
*>
(
_language_model
);
if
(
dictionary
!=
nullptr
)
delete
static_cast
<
fst
::
StdVectorFst
*>
(
dictionary
);
}
if
(
dictionary
!=
nullptr
)
{
delete
static_cast
<
fst
::
StdVectorFst
*>
(
dictionary
);
}
}
void
Scorer
::
load_LM
(
const
char
*
filename
)
{
if
(
access
(
filename
,
F_OK
)
!=
0
)
{
std
::
cerr
<<
"Invalid language model file !!!"
<<
std
::
endl
;
exit
(
1
);
void
Scorer
::
setup
(
const
std
::
string
&
lm_path
,
const
std
::
vector
<
std
::
string
>&
vocab_list
)
{
// load language model
load_lm
(
lm_path
);
// set char map for scorer
set_char_map
(
vocab_list
);
// fill the dictionary for FST
if
(
!
is_character_based
())
{
fill_dictionary
(
true
);
}
}
void
Scorer
::
load_lm
(
const
std
::
string
&
lm_path
)
{
const
char
*
filename
=
lm_path
.
c_str
();
VALID_CHECK_EQ
(
access
(
filename
,
F_OK
),
0
,
"Invalid language model path"
);
RetriveStrEnumerateVocab
enumerate
;
lm
::
ngram
::
Config
config
;
config
.
enumerate_vocab
=
&
enumerate
;
...
...
@@ -180,14 +198,14 @@ void Scorer::fill_dictionary(bool add_space) {
}
// For each unigram convert to ints and put in trie
int
vocab
_size
=
0
;
int
dict
_size
=
0
;
for
(
const
auto
&
word
:
_vocabulary
)
{
bool
added
=
add_word_to_dictionary
(
word
,
char_map
,
add_space
,
_SPACE_ID
,
&
dictionary
);
vocab
_size
+=
added
?
1
:
0
;
dict
_size
+=
added
?
1
:
0
;
}
std
::
cerr
<<
"Vocab Size "
<<
vocab_size
<<
std
::
endl
;
_dict_size
=
dict_size
;
/* Simplify FST
...
...
deep_speech_2/decoders/swig/scorer.h
浏览文件 @
e6740af4
...
...
@@ -40,31 +40,32 @@ public:
*/
class
Scorer
{
public:
Scorer
(
double
alpha
,
double
beta
,
const
std
::
string
&
lm_path
);
Scorer
(
double
alpha
,
double
beta
,
const
std
::
string
&
lm_path
,
const
std
::
vector
<
std
::
string
>
&
vocabulary
);
~
Scorer
();
double
get_log_cond_prob
(
const
std
::
vector
<
std
::
string
>
&
words
);
double
get_sent_log_prob
(
const
std
::
vector
<
std
::
string
>
&
words
);
size_t
get_max_order
()
{
return
_max_order
;
}
size_t
get_max_order
()
const
{
return
_max_order
;
}
bool
is_char_map_empty
()
{
return
_char_map
.
size
()
==
0
;
}
size_t
get_dict_size
()
const
{
return
_dict_size
;
}
bool
is_character_based
()
{
return
_is_character_based
;
}
bool
is_char_map_empty
()
const
{
return
_char_map
.
size
()
==
0
;
}
bool
is_character_based
()
const
{
return
_is_character_based
;
}
// reset params alpha & beta
void
reset_params
(
float
alpha
,
float
beta
);
// make ngram
// make ngram
for a given prefix
std
::
vector
<
std
::
string
>
make_ngram
(
PathTrie
*
prefix
);
// fill dictionary for fst
void
fill_dictionary
(
bool
add_space
);
// set char map
void
set_char_map
(
const
std
::
vector
<
std
::
string
>
&
char_list
);
// trransform the labels in index to the vector of words (word based lm) or
// the vector of characters (character based lm)
std
::
vector
<
std
::
string
>
split_labels
(
const
std
::
vector
<
int
>
&
labels
);
// expose to decoder
...
...
@@ -75,7 +76,16 @@ public:
void
*
dictionary
;
protected:
void
load_LM
(
const
char
*
filename
);
void
setup
(
const
std
::
string
&
lm_path
,
const
std
::
vector
<
std
::
string
>
&
vocab_list
);
void
load_lm
(
const
std
::
string
&
lm_path
);
// fill dictionary for fst
void
fill_dictionary
(
bool
add_space
);
// set char map
void
set_char_map
(
const
std
::
vector
<
std
::
string
>
&
char_list
);
double
get_log_prob
(
const
std
::
vector
<
std
::
string
>
&
words
);
...
...
@@ -85,6 +95,7 @@ private:
void
*
_language_model
;
bool
_is_character_based
;
size_t
_max_order
;
size_t
_dict_size
;
int
_SPACE_ID
;
std
::
vector
<
std
::
string
>
_char_list
;
...
...
deep_speech_2/decoders/swig/setup.py
浏览文件 @
e6740af4
...
...
@@ -70,8 +70,11 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES
+=
glob
.
glob
(
'openfst-1.6.3/src/lib/*.cc'
)
# FILES + glob.glob('glog/src/*.cc')
FILES
=
[
fn
for
fn
in
FILES
if
not
(
fn
.
endswith
(
'main.cc'
)
or
fn
.
endswith
(
'test.cc'
))
fn
for
fn
in
FILES
if
not
(
fn
.
endswith
(
'main.cc'
)
or
fn
.
endswith
(
'test.cc'
)
or
fn
.
endswith
(
'unittest.cc'
))
]
LIBS
=
[
'stdc++'
]
...
...
@@ -99,7 +102,13 @@ decoders_module = [
name
=
'_swig_decoders'
,
sources
=
FILES
+
glob
.
glob
(
'*.cxx'
)
+
glob
.
glob
(
'*.cpp'
),
language
=
'c++'
,
include_dirs
=
[
'.'
,
'kenlm'
,
'openfst-1.6.3/src/include'
,
'ThreadPool'
],
include_dirs
=
[
'.'
,
'kenlm'
,
'openfst-1.6.3/src/include'
,
'ThreadPool'
,
#'glog/src'
],
libraries
=
LIBS
,
extra_compile_args
=
ARGS
)
]
...
...
deep_speech_2/decoders/swig/setup.sh
浏览文件 @
e6740af4
#!/
bin/
bash
#!/
usr/bin/env
bash
if
[
!
-d
kenlm
]
;
then
git clone https://github.com/luotao1/kenlm.git
...
...
deep_speech_2/decoders/swig_wrapper.py
浏览文件 @
e6740af4
...
...
@@ -13,14 +13,14 @@ class Scorer(swig_decoders.Scorer):
language model when alpha = 0.
:type alpha: float
:param beta: Parameter associated with word count. Don't use word
count when beta = 0.
count when beta = 0.
:type beta: float
:model_path: Path to load language model.
:type model_path: basestring
"""
def
__init__
(
self
,
alpha
,
beta
,
model_path
):
swig_decoders
.
Scorer
.
__init__
(
self
,
alpha
,
beta
,
model_path
)
def
__init__
(
self
,
alpha
,
beta
,
model_path
,
vocabulary
):
swig_decoders
.
Scorer
.
__init__
(
self
,
alpha
,
beta
,
model_path
,
vocabulary
)
def
ctc_greedy_decoder
(
probs_seq
,
vocabulary
):
...
...
@@ -58,12 +58,12 @@ def ctc_beam_search_decoder(probs_seq,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param ext_scoring_func: External scoring function for
partially decoded sentence, e.g. word count
or language model.
partially decoded sentence, e.g. word count
or language model.
:type external_scoring_func: callable
:return: List of tuples of log probability and sentence as decoding
results, in descending order of the probability.
...
...
@@ -96,14 +96,14 @@ def ctc_beam_search_decoder_batch(probs_split,
default 1.0, no pruning.
:type cutoff_prob: float
:param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
characters with highest probs in vocabulary will be
used in beam search, default 40.
characters with highest probs in vocabulary will be
used in beam search, default 40.
:type cutoff_top_n: int
:param num_processes: Number of parallel processes.
:type num_processes: int
:param ext_scoring_func: External scoring function for
partially decoded sentence, e.g. word count
or language model.
partially decoded sentence, e.g. word count
or language model.
:type external_scoring_function: callable
:return: List of tuples of log probability and sentence as decoding
results, in descending order of the probability.
...
...
deep_speech_2/examples/tiny/run_infer.sh
浏览文件 @
e6740af4
...
...
@@ -21,9 +21,9 @@ python -u infer.py \
--num_conv_layers
=
2
\
--num_rnn_layers
=
3
\
--rnn_layer_size
=
2048
\
--alpha
=
0.36
\
--beta
=
0.
2
5
\
--cutoff_prob
=
0.99
\
--alpha
=
2.15
\
--beta
=
0.
3
5
\
--cutoff_prob
=
1.0
\
--use_gru
=
False
\
--use_gpu
=
True
\
--share_rnn_weights
=
True
\
...
...
deep_speech_2/examples/tiny/run_infer_golden.sh
浏览文件 @
e6740af4
...
...
@@ -30,9 +30,9 @@ python -u infer.py \
--num_conv_layers
=
2
\
--num_rnn_layers
=
3
\
--rnn_layer_size
=
2048
\
--alpha
=
0.36
\
--beta
=
0.
2
5
\
--cutoff_prob
=
0.99
\
--alpha
=
2.15
\
--beta
=
0.
3
5
\
--cutoff_prob
=
1.0
\
--use_gru
=
False
\
--use_gpu
=
True
\
--share_rnn_weights
=
True
\
...
...
deep_speech_2/examples/tiny/run_test.sh
浏览文件 @
e6740af4
...
...
@@ -22,9 +22,9 @@ python -u test.py \
--num_conv_layers
=
2
\
--num_rnn_layers
=
3
\
--rnn_layer_size
=
2048
\
--alpha
=
0.36
\
--beta
=
0.
2
5
\
--cutoff_prob
=
0.99
\
--alpha
=
2.15
\
--beta
=
0.
3
5
\
--cutoff_prob
=
1.0
\
--use_gru
=
False
\
--use_gpu
=
True
\
--share_rnn_weights
=
True
\
...
...
deep_speech_2/examples/tiny/run_test_golden.sh
浏览文件 @
e6740af4
...
...
@@ -31,9 +31,9 @@ python -u test.py \
--num_conv_layers
=
2
\
--num_rnn_layers
=
3
\
--rnn_layer_size
=
2048
\
--alpha
=
0.36
\
--beta
=
0.
2
5
\
--cutoff_prob
=
0.99
\
--alpha
=
2.15
\
--beta
=
0.
3
5
\
--cutoff_prob
=
1.0
\
--use_gru
=
False
\
--use_gpu
=
True
\
--share_rnn_weights
=
True
\
...
...
deep_speech_2/infer.py
浏览文件 @
e6740af4
...
...
@@ -112,6 +112,7 @@ def infer():
print
(
"Current error rate [%s] = %f"
%
(
args
.
error_rate_type
,
error_rate_func
(
target
,
result
)))
ds2_model
.
logger
.
info
(
"finish inference"
)
def
main
():
print_arguments
(
args
)
...
...
deep_speech_2/model_utils/model.py
浏览文件 @
e6740af4
...
...
@@ -6,6 +6,7 @@ from __future__ import print_function
import
sys
import
os
import
time
import
logging
import
gzip
import
paddle.v2
as
paddle
from
decoders.swig_wrapper
import
Scorer
...
...
@@ -13,6 +14,9 @@ from decoders.swig_wrapper import ctc_greedy_decoder
from
decoders.swig_wrapper
import
ctc_beam_search_decoder_batch
from
model_utils.network
import
deep_speech_v2_network
logging
.
basicConfig
(
format
=
'[%(levelname)s %(asctime)s %(filename)s:%(lineno)d] %(message)s'
)
class
DeepSpeech2Model
(
object
):
"""DeepSpeech2Model class.
...
...
@@ -43,6 +47,8 @@ class DeepSpeech2Model(object):
self
.
_inferer
=
None
self
.
_loss_inferer
=
None
self
.
_ext_scorer
=
None
self
.
logger
=
logging
.
getLogger
(
""
)
self
.
logger
.
setLevel
(
level
=
logging
.
INFO
)
def
train
(
self
,
train_batch_reader
,
...
...
@@ -204,16 +210,25 @@ class DeepSpeech2Model(object):
elif
decoding_method
==
"ctc_beam_search"
:
# initialize external scorer
if
self
.
_ext_scorer
==
None
:
self
.
_ext_scorer
=
Scorer
(
beam_alpha
,
beam_beta
,
language_model_path
)
self
.
_loaded_lm_path
=
language_model_path
self
.
_ext_scorer
.
set_char_map
(
vocab_list
)
if
(
not
self
.
_ext_scorer
.
is_character_based
()):
self
.
_ext_scorer
.
fill_dictionary
(
True
)
self
.
logger
.
info
(
"begin to initialize the external scorer "
"for decoding"
)
self
.
_ext_scorer
=
Scorer
(
beam_alpha
,
beam_beta
,
language_model_path
,
vocab_list
)
lm_char_based
=
self
.
_ext_scorer
.
is_character_based
()
lm_max_order
=
self
.
_ext_scorer
.
get_max_order
()
lm_dict_size
=
self
.
_ext_scorer
.
get_dict_size
()
self
.
logger
.
info
(
"language model: "
"is_character_based = %d,"
%
lm_char_based
+
" max_order = %d,"
%
lm_max_order
+
" dict_size = %d"
%
lm_dict_size
)
self
.
logger
.
info
(
"end initializing scorer. Start decoding ..."
)
else
:
self
.
_ext_scorer
.
reset_params
(
beam_alpha
,
beam_beta
)
assert
self
.
_loaded_lm_path
==
language_model_path
# beam search decode
num_processes
=
min
(
num_processes
,
len
(
probs_split
))
beam_search_results
=
ctc_beam_search_decoder_batch
(
probs_split
=
probs_split
,
vocabulary
=
vocab_list
,
...
...
deep_speech_2/test.py
浏览文件 @
e6740af4
...
...
@@ -115,6 +115,7 @@ def evaluate():
print
(
"Final error rate [%s] (%d/%d) = %f"
%
(
args
.
error_rate_type
,
num_ins
,
num_ins
,
error_sum
/
num_ins
))
ds2_model
.
logger
.
info
(
"finish evaluation"
)
def
main
():
print_arguments
(
args
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录