PaddlePaddle / DeepSpeech
Commit ed793b30
Authored on Apr 08, 2021 by Hui Zhang

refactor build vocab

Parent: af453e02
Showing 4 changed files with 59 additions and 134 deletions (+59 -134):

deepspeech/frontend/featurizer/text_featurizer.py  (+11 -12)
examples/tiny/s0/local/data.sh  (+6 -6)
utils/build_vocab.py  (+14 -35)
utils/format_data.py  (+28 -81)
deepspeech/frontend/featurizer/text_featurizer.py

@@ -14,7 +14,6 @@
 """Contains the text featurizer class."""
 import os
-import codecs
 
 import sentencepiece as spm
 
 from deepspeech.frontend.utility import UNK

@@ -42,7 +41,7 @@ class TextFeaturizer(object):
         if unit_type == 'spm':
             spm_model = spm_model_prefix + '.model'
             self.sp = spm.SentencePieceProcessor()
-            self.sp.Load(self.spm_model)
+            self.sp.Load(spm_model)
 
     def featurize(self, text):
         """Convert text string to a list of token indices in char-level.Note

@@ -51,14 +50,14 @@ class TextFeaturizer(object):
         :param text: Text to process.
         :type text: str
         :return: List of char-level token indices.
-        :rtype: list
+        :rtype: List[int]
         """
-        if unit_type == 'char':
-            tokens = self._char_tokenize(text)
-        elif unit_type == 'word':
-            tokens = self._word_tokenize(text)
+        if self.unit_type == 'char':
+            tokens = self.char_tokenize(text)
+        elif self.unit_type == 'word':
+            tokens = self.word_tokenize(text)
         else:
-            tokens = self._spm_tokenize(text)
+            tokens = self.spm_tokenize(text)
         ids = []
         for token in tokens:

@@ -84,15 +83,15 @@ class TextFeaturizer(object):
         """
         return self._vocab_list
 
-    def _char_tokenize(self, text):
+    def char_tokenize(self, text):
         """Character tokenizer."""
         return list(text.strip())
 
-    def _word_tokenize(self, text):
+    def word_tokenize(self, text):
         """Word tokenizer, spearte by <space>."""
         return text.strip().split()
 
-    def _spm_tokenize(self, text):
+    def spm_tokenize(self, text):
         """spm tokenize.
 
         Args:

@@ -127,7 +126,7 @@ class TextFeaturizer(object):
     def _load_vocabulary_from_file(self, vocab_filepath):
         """Load vocabulary from file."""
         vocab_lines = []
-        with codecs.open(vocab_filepath, 'r', 'utf-8') as file:
+        with open(vocab_filepath, 'r', encoding='utf-8') as file:
             vocab_lines.extend(file.readlines())
         vocab_list = [line[:-1] for line in vocab_lines]
         vocab_dict = dict(
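The net effect for callers: the tokenizer helpers lose their leading underscore and become part of the featurizer's public surface, and featurize now dispatches on self.unit_type. A minimal usage sketch (the vocab path is hypothetical; for 'char' and 'word' units the spm model prefix can be left as None, as the new format_data.py does):

    from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

    # Character-level featurizer; no SentencePiece model is needed for 'char'.
    featurizer = TextFeaturizer('char', 'data/vocab.txt', None)  # hypothetical vocab path
    tokens = featurizer.char_tokenize('hello world')  # ['h', 'e', 'l', 'l', 'o', ' ', ...]
    ids = featurizer.featurize('hello world')         # indices into data/vocab.txt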
examples/tiny/s0/local/data.sh

@@ -23,10 +23,10 @@ bpemode=unigram
 bpeprefix="data/bpe_${bpemode}_${nbpe}"
 
 # build vocabulary
 python3 ${MAIN_ROOT}/utils/build_vocab.py \
---unit_type "bpe" \
+--unit_type "spm" \
 --count_threshold=${nbpe} \
---bpe_mode ${bpemode} \
---bpe_model_prefix ${bpeprefix} \
+--spm_mode ${bpemode} \
+--spm_model_prefix ${bpeprefix} \
 --vocab_path="data/vocab.txt" \
 --manifest_paths="data/manifest.tiny.raw"

@@ -53,8 +53,8 @@ fi
 python3 ${MAIN_ROOT}/utils/format_data.py \
 --feat_type "raw" \
 --cmvn_path "data/mean_std.npz" \
---unit_type "bpe" \
---bpe_model_prefix ${bpeprefix} \
+--unit_type "spm" \
+--spm_model_prefix ${bpeprefix} \
 --vocab_path="data/vocab.txt" \
 --manifest_path="data/manifest.tiny.raw" \
 --output_path="data/manifest.tiny"
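For context, the renamed flags feed straight into the SentencePiece training call inside build_vocab.py (see the next diff). A standalone sketch of that mapping, using the tiny recipe's typical values and a hypothetical corpus path:

    import sentencepiece as spm

    # --spm_mode -> model_type, --spm_model_prefix -> model_prefix,
    # --count_threshold -> vocab_size, mirroring the Train() call in build_vocab.py.
    spm.SentencePieceTrainer.Train(
        input='data/corpus.txt',              # hypothetical: one transcript per line
        vocab_size=200,                       # ${nbpe}
        model_type='unigram',                 # ${bpemode}
        model_prefix='data/bpe_unigram_200',  # ${bpeprefix}
        character_coverage=0.9995)
    # Writes data/bpe_unigram_200.model and data/bpe_unigram_200.vocab.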
utils/build_vocab.py

@@ -29,12 +29,13 @@ from deepspeech.frontend.utility import BLANK
 from deepspeech.frontend.utility import SOS
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)
 # yapf: disable
-add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
-add_arg('count_threshold', int, 0, "Truncation threshold for char/word/bpe counts.")
+add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
+add_arg('count_threshold', int, 0, "Truncation threshold for char/word/spm counts.")
 add_arg('vocab_path', str,
         'examples/librispeech/data/vocab.txt',
         "Filepath to write the vocabulary.")

@@ -45,10 +46,10 @@ add_arg('manifest_paths', str,
         nargs='+',
         required=True)
 # bpe
-add_arg('bpe_mode', str, 'unigram',
-        "bpe model type, e.g. unigram, bpe, char, word. only need when `unit_type` is bpe")
-add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)",
-        "bpe model prefix, only need when `unit_type` is bpe")
+add_arg('spm_mode', str, 'unigram',
+        "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
+add_arg('spm_model_prefix', str, "spm_model_%(spm_mode)_%(count_threshold)",
+        "spm model prefix, only need when `unit_type` is spm")
 # yapf: disable
 args = parser.parse_args()

@@ -56,7 +57,7 @@ args = parser.parse_args()
 def count_manifest(counter, manifest_path):
     manifest_jsons = read_manifest(manifest_path)
     for line_json in manifest_jsons:
-        if args.unit_type == 'character':
+        if args.unit_type == 'char':
            for char in line_json['text']:
                 counter.update(char)
         elif args.unit_type == 'word':

@@ -75,7 +76,7 @@ def main():
     fout.write(BLANK + "\n")  # 0 will be used for "blank" in CTC
     fout.write(UNK + '\n')  # <unk> must be 1
-    if args.unit_type != 'bpe':
+    if args.unit_type != 'spm':
         counter = Counter()
         for manifest_path in args.manifest_paths:
             count_manifest(counter, manifest_path)

@@ -98,41 +99,21 @@ def main():
         spm.SentencePieceTrainer.Train(
             input=fp.name,
             vocab_size=args.count_threshold,
-            model_type=args.bpe_mode,
-            model_prefix=args.bpe_model_prefix,
+            model_type=args.spm_mode,
+            model_prefix=args.spm_model_prefix,
             input_sentence_size=100000000,
             character_coverage=0.9995)
         os.unlink(fp.name)
 
         # encode
-        sp = spm.SentencePieceProcessor()
-        sp.Load(args.bpe_model_prefix + '.model')
-        stats = {"num_empty": 0, "num_filtered": 0}
-
-        def valid(line):
-            return True
-
-        def encode(l):
-            return sp.EncodeAsPieces(l)
-
-        def encode_line(line):
-            line = line.strip()
-            if len(line) > 0:
-                line = encode(line)
-                if valid(line):
-                    return line
-                else:
-                    stats["num_filtered"] += 1
-            else:
-                stats["num_empty"] += 1
-            return None
-
+        text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
         vocabs = set()
         for manifest_path in args.manifest_paths:
             manifest_jsons = read_manifest(manifest_path)
             for line_json in manifest_jsons:
                 line = line_json['text']
-                enc_line = encode_line(line)
+                enc_line = text_feature.spm_tokenize(line)
                 for code in enc_line:
                     vocabs.add(code)
                 #print(" ".join(enc_line))

@@ -140,9 +121,7 @@ def main():
         for unit in vocabs_sorted:
             fout.write(unit + "\n")
 
-        print(f"bpe vocab size: {len(vocabs_sorted)}")
-        print(f"skip {stats['num_empty']} empty lines")
-        print(f"filter {stats['num_filtered']} invalid lines")
+        print(f"spm vocab size: {len(vocabs_sorted)}")
 
     fout.write(SOS + "\n")  # <sos/eos>
     fout.close()
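The refactor drops the local valid/encode/encode_line helpers (and their empty/filtered-line counters) in favor of a single TextFeaturizer, so vocabulary collection reduces to a tokenize-and-collect loop. A self-contained sketch of that loop, assuming a trained spm model and a vocab file already exist at the hypothetical paths, with two literal strings standing in for manifest text:

    from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

    text_feature = TextFeaturizer('spm', 'data/vocab.txt', 'data/bpe_unigram_200')
    vocabs = set()
    for line in ['the quick brown fox', 'jumps over the lazy dog']:
        for piece in text_feature.spm_tokenize(line):
            vocabs.add(piece)
    for unit in sorted(vocabs):
        print(unit)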
utils/format_data.py

@@ -27,6 +27,7 @@ from deepspeech.frontend.utility import SOS
 from deepspeech.frontend.utility import load_cmvn
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 
 parser = argparse.ArgumentParser(description=__doc__)
 add_arg = functools.partial(add_arguments, argparser=parser)

@@ -35,7 +36,7 @@ add_arg('feat_type', str, "raw", "speech feature type, e.g. raw(wav, flac), kald
 add_arg('cmvn_path', str,
         'examples/librispeech/data/mean_std.npz',
         "Filepath of cmvn.")
-add_arg('unit_type', str, "character", "Unit type, e.g. character, word, bpe")
+add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
 add_arg('vocab_path', str,
         'examples/librispeech/data/vocab.txt',
         "Filepath of the vocabulary.")

@@ -46,7 +47,8 @@ add_arg('manifest_paths', str,
         nargs='+',
         required=True)
 # bpe
-add_arg('bpe_model_prefix', str, "bpe_model_%(bpe_mode)_%(count_threshold)",
-        "bpe model prefix, only need when `unit_type` is bpe")
+add_arg('spm_model_prefix', str, None,
+        "spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
+add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
 # yapf: disable
 args = parser.parse_args()

@@ -54,83 +56,28 @@ args = parser.parse_args()
 def main():
     print_arguments(args)
 
-    fout = open(args.output_path, 'w', encoding='utf-8')
-
     # get feat dim
     mean, std = load_cmvn(args.cmvn_path, filetype='npz')
     feat_dim = mean.shape[0]
     print(f"Feature dim: {feat_dim}")
 
-    # read vocab
-    vocab = dict()
-    with open(args.vocab_path, 'r', encoding='utf-8') as fin:
-        for line in fin:
-            token = line.strip()
-            vocab[token] = len(vocab)
-    vocab_size = len(vocab)
+    text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
+    vocab_size = text_feature.vocab_size
     print(f"Vocab size: {vocab_size}")
 
-    if args.unit_type != 'bpe':
-        for manifest_path in args.manifest_paths:
-            manifest_jsons = read_manifest(manifest_path)
-            for line_json in manifest_jsons:
-                tokens = []
-                tokenids = []
-                if args.unit_type == 'character':
-                    for char in line_json['text']:
-                        tokens.append(char)
-                        tokenids.append(vocab[char])
-                elif args.unit_type == 'word':
-                    for word in line_json['text'].split():
-                        tokens.append(word)
-                        tokenids.append(vocab[word])
-                line_json['token'] = tokens
-                line_json['token_id'] = tokenids
-                line_json['token_shape'] = (len(tokenids), vocab_size)
-                feat_shape = line_json['feat_shape']
-                assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
-                if args.feat_type == 'raw':
-                    feat_shape.append(feat_dim)
-                else:  # kaldi
-                    raise NotImplemented('no support kaldi feat now!')
-                fout.write(json.dumps(line_json) + '\n')
-    else:
-        import sentencepiece as spm
-
-        # encode
-        sp = spm.SentencePieceProcessor()
-        sp.Load(args.bpe_model_prefix + '.model')
-
-        def valid(line):
-            return True
-
-        def encode(l):
-            return sp.EncodeAsPieces(l)
-
-        def encode_line(line):
-            line = line.strip()
-            if len(line) > 0:
-                line = encode(line)
-                if valid(line):
-                    return line
-                else:
-                    stats["num_filtered"] += 1
-            else:
-                stats["num_empty"] += 1
-            return None
-
-        for manifest_path in args.manifest_paths:
-            manifest_jsons = read_manifest(manifest_path)
-            for line_json in manifest_jsons:
-                line = line_json['text']
-                tokens = []
-                tokenids = []
-                enc_line = encode_line(line)
-                for code in enc_line:
-                    tokens.append(code)
-                    tokenids.append(vocab[code])
-                    #print(code, vocab[code])
+    fout = open(args.output_path, 'w', encoding='utf-8')
+    for manifest_path in args.manifest_paths:
+        manifest_jsons = read_manifest(manifest_path)
+        for line_json in manifest_jsons:
+            line = line_json['text']
+            if args.unit_type == 'char':
+                tokens = text_feature.char_tokenize(line)
+            elif args.unit_type == 'word':
+                tokens = text_feature.word_tokenize(line)
+            else:  #spm
+                tokens = text_feature.spm_tokenize(line)
+            tokenids = text_feature.featurize(line)
+            line_json['token'] = tokens
+            line_json['token_id'] = tokenids
+            line_json['token_shape'] = (len(tokenids), vocab_size)
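With the featurizer handling all three unit types, formatting a manifest line reduces to tokenize, featurize, and annotate. A sketch of that path with a made-up manifest entry (the vocab path is hypothetical, and 'feat_shape' mimics a raw-feature record):

    import json
    from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer

    text_feature = TextFeaturizer('char', 'data/vocab.txt', None)  # hypothetical vocab path
    line_json = {'text': 'hello world', 'feat_shape': [120]}       # made-up example

    tokens = text_feature.char_tokenize(line_json['text'])
    tokenids = text_feature.featurize(line_json['text'])
    line_json['token'] = tokens
    line_json['token_id'] = tokenids
    line_json['token_shape'] = (len(tokenids), text_feature.vocab_size)
    print(json.dumps(line_json))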