Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
2da198b7
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
未验证
提交
2da198b7
编写于
9月 23, 2021
作者:
H
Hui Zhang
提交者:
GitHub
9月 23, 2021
浏览文件
操作
浏览文件
下载
差异文件
Merge pull request #854 from PaddlePaddle/space
space as <space>
上级
4d1ce107
30563981
变更
5
显示空白变更内容
内联
并排
Showing
5 changed files
with
235 additions
and
20 deletions
+235
-20
deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
+2
-6
deepspeech/frontend/featurizer/text_featurizer.py
deepspeech/frontend/featurizer/text_featurizer.py
+15
-12
deepspeech/frontend/utility.py
deepspeech/frontend/utility.py
+2
-1
utils/build_vocab.py
utils/build_vocab.py
+4
-1
utils/split_scp.pl
utils/split_scp.pl
+212
-0
未找到文件。
deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
浏览文件 @
2da198b7
...
...
@@ -28,6 +28,7 @@
#include "path_trie.h"
using
FSTMATCH
=
fst
::
SortedMatcher
<
fst
::
StdVectorFst
>
;
// Vocabulary token that stands for the blank character " ". The decoder looks
// this token up in the vocabulary to obtain the space id.
constexpr const char* kSPACE = "<space>";
std
::
vector
<
std
::
pair
<
double
,
std
::
string
>>
ctc_beam_search_decoder
(
const
std
::
vector
<
std
::
vector
<
double
>>
&
probs_seq
,
...
...
@@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
"The shape of probs_seq does not match with "
"the shape of the vocabulary"
);
}
// assign blank id
// size_t blank_id = vocabulary.size();
// size_t blank_id = 0;
// assign space id
auto
it
=
std
::
find
(
vocabulary
.
begin
(),
vocabulary
.
end
(),
" "
);
auto
it
=
std
::
find
(
vocabulary
.
begin
(),
vocabulary
.
end
(),
kSPACE
);
int
space_id
=
it
-
vocabulary
.
begin
();
// if no space in vocabulary
if
((
size_t
)
space_id
>=
vocabulary
.
size
())
{
...
...
deepspeech/frontend/featurizer/text_featurizer.py
浏览文件 @
2da198b7
...
...
@@ -16,6 +16,7 @@ import sentencepiece as spm
from
..utility
import
EOS
from
..utility
import
load_dict
from
..utility
import
SPACE
from
..utility
import
UNK
__all__
=
[
"TextFeaturizer"
]
...
...
@@ -53,9 +54,9 @@ class TextFeaturizer():
self
.
sp
=
spm
.
SentencePieceProcessor
()
self
.
sp
.
Load
(
spm_model
)
def
tokenize
(
self
,
text
):
def
tokenize
(
self
,
text
,
replace_space
=
True
):
if
self
.
unit_type
==
'char'
:
tokens
=
self
.
char_tokenize
(
text
)
tokens
=
self
.
char_tokenize
(
text
,
replace_space
)
elif
self
.
unit_type
==
'word'
:
tokens
=
self
.
word_tokenize
(
text
)
else
:
# spm
...
...
@@ -105,16 +106,20 @@ class TextFeaturizer():
text
=
self
.
detokenize
(
tokens
)
return
text
def char_tokenize(self, text, replace_space=True):
    """Character tokenizer.

    Args:
        text (str): text string.
        replace_space (bool): False only used by build_vocab.py.

    Returns:
        List[str]: tokens.
    """
    text = text.strip()
    if replace_space:
        # Substitute per character. Replacing " " inside the string first and
        # calling list() afterwards would explode the multi-character SPACE
        # token into its individual characters.
        return [SPACE if char == " " else char for char in text]
    return list(text)
def
char_detokenize
(
self
,
tokens
):
"""Character detokenizer.
...
...
@@ -125,6 +130,7 @@ class TextFeaturizer():
Returns:
str: text string.
"""
tokens
=
tokens
.
replace
(
SPACE
,
" "
)
return
""
.
join
(
tokens
)
def
word_tokenize
(
self
,
text
):
...
...
@@ -191,17 +197,14 @@ class TextFeaturizer():
"""Load vocabulary from file."""
vocab_list
=
load_dict
(
vocab_filepath
,
maskctc
)
assert
vocab_list
is
not
None
assert
SPACE
in
vocab_list
id2token
=
dict
(
[(
idx
,
token
)
for
(
idx
,
token
)
in
enumerate
(
vocab_list
)])
token2id
=
dict
(
[(
token
,
idx
)
for
(
idx
,
token
)
in
enumerate
(
vocab_list
)])
if
UNK
in
vocab_list
:
unk_id
=
vocab_list
.
index
(
UNK
)
else
:
unk_id
=
-
1
if
EOS
in
vocab_list
:
eos_id
=
vocab_list
.
index
(
EOS
)
else
:
eos_id
=
-
1
unk_id
=
vocab_list
.
index
(
UNK
)
if
UNK
in
vocab_list
else
-
1
eos_id
=
vocab_list
.
index
(
EOS
)
if
EOS
in
vocab_list
else
-
1
return
token2id
,
id2token
,
vocab_list
,
unk_id
,
eos_id
deepspeech/frontend/utility.py
浏览文件 @
2da198b7
...
...
@@ -28,7 +28,7 @@ logger = Log(__name__).getlog()
__all__
=
[
"load_dict"
,
"load_cmvn"
,
"read_manifest"
,
"rms_to_db"
,
"rms_to_dbfs"
,
"max_dbfs"
,
"mean_dbfs"
,
"gain_db_to_ratio"
,
"normalize_audio"
,
"SOS"
,
"EOS"
,
"UNK"
,
"BLANK"
,
"MASKCTC"
"EOS"
,
"UNK"
,
"BLANK"
,
"MASKCTC"
,
"SPACE"
]
IGNORE_ID
=
-
1
...
...
@@ -38,6 +38,7 @@ EOS = SOS
UNK
=
"<unk>"
BLANK
=
"<blank>"
MASKCTC
=
"<mask>"
SPACE
=
"<space>"
def
load_dict
(
dict_path
:
Optional
[
Text
],
maskctc
=
False
)
->
Optional
[
List
[
Text
]]:
...
...
utils/build_vocab.py
浏览文件 @
2da198b7
...
...
@@ -25,6 +25,7 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
from
deepspeech.frontend.utility
import
BLANK
from
deepspeech.frontend.utility
import
read_manifest
from
deepspeech.frontend.utility
import
SOS
from
deepspeech.frontend.utility
import
SPACE
from
deepspeech.frontend.utility
import
UNK
from
deepspeech.utils.utility
import
add_arguments
from
deepspeech.utils.utility
import
print_arguments
...
...
@@ -60,7 +61,7 @@ args = parser.parse_args()
def count_manifest(counter, text_feature, manifest_path):
    """Accumulate token counts for every entry of a manifest.

    Args:
        counter: object whose ``update`` method is called with the token list
            of each manifest entry.
        text_feature: object providing ``tokenize(text, replace_space=...)``.
        manifest_path: path handed to ``read_manifest``.
    """
    manifest_jsons = read_manifest(manifest_path)
    for line_json in manifest_jsons:
        # Tokenize once, keeping raw blanks (replace_space=False); the earlier
        # call without the flag was superseded and its result was discarded.
        line = text_feature.tokenize(line_json['text'], replace_space=False)
        counter.update(line)
def
dump_text_manifest
(
fileobj
,
manifest_path
,
key
=
'text'
):
...
...
@@ -109,6 +110,8 @@ def main():
for
token
,
count
in
count_sorted
:
if
count
<
args
.
count_threshold
:
break
# replace space by `<space>`
token
=
SPACE
if
token
==
' '
else
token
tokens
.
append
(
token
)
tokens
=
sorted
(
tokens
)
...
...
utils/split_scp.pl
浏览文件 @
2da198b7
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation

# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text file and
# will split it up with an approximately equal number of lines in
# each output.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can.  If you use
# the utt2spk option it will make sure these chunks coincide with
# speaker boundaries.  In this case, if there are more chunks
# than speakers (and in some other circumstances), some of the
# resulting chunks will be empty and it
# will print a warning.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...

# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]

$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";

# Up to two leading options are consumed: "-j num-jobs job-id" and
# "--utt2spk=<file>".  The @ARGV > 0 guards avoid reading an undefined
# $ARGV[0], which would print "uninitialized value" warnings ahead of the
# usage message when too few arguments are given.
for ($x = 1; $x <= 2; $x++) {
  if (@ARGV > 0 && $ARGV[0] eq "-j") {
    shift @ARGV;
    $num_jobs = shift @ARGV;
    $job_id = shift @ARGV;
    if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
      die "Invalid num-jobs and job-id: $num_jobs and $job_id";
    }
  }
  if (@ARGV > 0 && $ARGV[0] =~ "--utt2spk=(.+)") {
    $utt2spk_file = $1;
    shift;
  }
}

if (($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
  die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...\n" .
      " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
      " ... where 0 <= job-id < num-jobs.";
}

$inscp = shift @ARGV;
if ($num_jobs == 0) { # without -j option
  @OUTPUTS = @ARGV;
} else {
  # With -j only the requested job's part is kept: it goes to the given
  # output file, or to "-" when none was given; all other parts are sent
  # to /dev/null.
  for ($j = 0; $j < $num_jobs; $j++) {
    if ($j == $job_id) {
      if (@ARGV > 0) {
        push @OUTPUTS, $ARGV[0];
      } else {
        push @OUTPUTS, "-";
      }
    } else {
      push @OUTPUTS, "/dev/null";
    }
  }
}

if ($utt2spk_file ne "") { # We have the --utt2spk option...
  open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
  while (<U>) {
    @A = split;
    @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
    ($u, $s) = @A;
    $utt2spk{$u} = $s;
  }
  close(U);

  # Group the input lines by speaker, remembering the order in which
  # speakers are first seen.
  open(I, "<$inscp") || die "Opening input scp file $inscp";
  @spkrs = ();
  while (<I>) {
    @A = split;
    if (@A == 0) {
      die "Empty or space-only line in scp file $inscp";
    }
    $u = $A[0];
    $s = $utt2spk{$u};
    if (!defined $s) {
      die "No such utterance $u in utt2spk file $utt2spk_file";
    }
    if (!defined $spk_count{$s}) {
      push @spkrs, $s;
      $spk_count{$s} = 0;
      $spk_data{$s} = "";
    }
    $spk_count{$s}++;
    $spk_data{$s} = $spk_data{$s} . $_;
  }
  close(I);

  # Now split as equally as possible ..
  # First allocate spks to files by allocating an approximately
  # equal number of speakers.
  $numspks = @spkrs;   # number of speakers.
  $numscps = @OUTPUTS; # number of output files.
  for ($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scparray[$scpidx] = []; # [] is array reference.
    # Start every count at 0 so that outputs which receive no speaker do not
    # feed undef into the balancing arithmetic below.
    $scpcount[$scpidx] = 0;
  }
  for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
    $scpidx = int(($spkidx*$numscps) / $numspks);
    $spk = $spkrs[$spkidx];
    push @{$scparray[$scpidx]}, $spk;
    $scpcount[$scpidx] += $spk_count{$spk};
  }

  # Now will try to reassign beginning + ending speakers
  # to different scp's and see if it gets more balanced.
  # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
  # We can show that if considering changing just 2 scp's, we minimize
  # this by minimizing the squared difference in sizes.  This is
  # equivalent to minimizing the absolute difference in sizes.  This
  # shows this method is bound to converge.

  $changed = 1;
  while ($changed) {
    $changed = 0;
    for ($scpidx = 0; $scpidx < $numscps; $scpidx++) {
      # First try to reassign ending spk of this scp.
      if ($scpidx < $numscps-1) {
        $sz = @{$scparray[$scpidx]};
        if ($sz > 0) {
          $spk = $scparray[$scpidx]->[$sz-1];
          $count = $spk_count{$spk};
          $nutt1 = $scpcount[$scpidx];
          $nutt2 = $scpcount[$scpidx+1];
          if (abs(($nutt2+$count) - ($nutt1-$count)) < abs($nutt2 - $nutt1)) {
            # Would decrease size-diff by reassigning spk...
            $scpcount[$scpidx+1] += $count;
            $scpcount[$scpidx] -= $count;
            pop @{$scparray[$scpidx]};
            unshift @{$scparray[$scpidx+1]}, $spk;
            $changed = 1;
          }
        }
      }
      # Then try to move the first speaker of this scp to the previous one.
      if ($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
        $spk = $scparray[$scpidx]->[0];
        $count = $spk_count{$spk};
        $nutt1 = $scpcount[$scpidx-1];
        $nutt2 = $scpcount[$scpidx];
        if (abs(($nutt2-$count) - ($nutt1+$count)) < abs($nutt2 - $nutt1)) {
          # Would decrease size-diff by reassigning spk...
          $scpcount[$scpidx-1] += $count;
          $scpcount[$scpidx] -= $count;
          shift @{$scparray[$scpidx]};
          push @{$scparray[$scpidx-1]}, $spk;
          $changed = 1;
        }
      }
    }
  }

  # Now print out the files...
  for ($scpidx = 0; $scpidx < $numscps; $scpidx++) {
    $scpfn = $OUTPUTS[$scpidx];
    # Two-argument open is kept on purpose: the "-" entry pushed in -j mode
    # relies on ">-" opening standard output.
    open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
    $count = 0;
    if (@{$scparray[$scpidx]} == 0) {
      print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
    } else {
      foreach $spk (@{$scparray[$scpidx]}) {
        print F $spk_data{$spk};
        $count += $spk_count{$spk};
      }
      if ($count != $scpcount[$scpidx]) {
        die "Count mismatch [code error]";
      }
    }
    # Check close so that a failed write is reported, as the branch without
    # --utt2spk already does.
    close(F) || die "Closing scp file $scpfn";
  }
} else {
  # This block is the "normal" case where there is no --utt2spk
  # option and we just break into equal size chunks.

  open(I, "<$inscp") || die "Opening input scp file $inscp";

  $numscps = @OUTPUTS; # size of array.
  @F = ();
  while (<I>) {
    push @F, $_;
  }
  close(I);
  $numlines = @F;
  if ($numlines == 0) {
    print STDERR "split_scp.pl: warning: empty input scp file $inscp\n";
  }
  $linesperscp = int(($numlines + ($numscps-1)) / $numscps); # the +$(numscps-1) forces rounding up.
  # [just doing int() rounds down].
  for ($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
    $scpfile = $OUTPUTS[$scpidx];
    open(O, ">$scpfile") || die "Opening output scp file $scpfile";
    for ($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) {
      print O $F[$n];
    }
    close(O) || die "Closing scp file $scpfile";
  }
}
\ No newline at end of file
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录