Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
553aa359
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
1 年多 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
553aa359
编写于
4月 09, 2021
作者:
H
Hui Zhang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refactor data prepare
上级
b7674866
变更
3
显示空白变更内容
内联
并排
Showing
3 changed files
with
27 additions
and
48 deletions
+27
-48
examples/tiny/s0/local/data.sh
examples/tiny/s0/local/data.sh
+1
-1
utils/build_vocab.py
utils/build_vocab.py
+21
-40
utils/format_data.py
utils/format_data.py
+5
-7
未找到文件。
examples/tiny/s0/local/data.sh
浏览文件 @
553aa359
...
...
@@ -24,7 +24,7 @@ bpeprefix="data/bpe_${bpemode}_${nbpe}"
# build vocabulary
python3
${
MAIN_ROOT
}
/utils/build_vocab.py
\
--unit_type
"spm"
\
--
count_threshold
=
${
nbpe
}
\
--
vocab_size
=
${
nbpe
}
\
--spm_mode
${
bpemode
}
\
--spm_model_prefix
${
bpeprefix
}
\
--vocab_path
=
"data/vocab.txt"
\
...
...
utils/build_vocab.py
浏览文件 @
553aa359
...
...
@@ -35,7 +35,8 @@ parser = argparse.ArgumentParser(description=__doc__)
add_arg
=
functools
.
partial
(
add_arguments
,
argparser
=
parser
)
# yapf: disable
add_arg
(
'unit_type'
,
str
,
"char"
,
"Unit type, e.g. char, word, spm"
)
add_arg
(
'count_threshold'
,
int
,
0
,
"Truncation threshold for char/word/spm counts."
)
add_arg
(
'count_threshold'
,
int
,
0
,
"Truncation threshold for char/word counts.Default 0, no truncate."
)
add_arg
(
'vocab_path'
,
str
,
'examples/librispeech/data/vocab.txt'
,
"Filepath to write the vocabulary."
)
...
...
@@ -46,6 +47,7 @@ add_arg('manifest_paths', str,
nargs
=
'+'
,
required
=
True
)
# bpe
add_arg
(
'vocab_size'
,
int
,
0
,
"Vocab size for spm."
)
add_arg
(
'spm_mode'
,
str
,
'unigram'
,
"spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm"
)
add_arg
(
'spm_model_prefix'
,
str
,
"spm_model_%(spm_mode)_%(count_threshold)"
,
...
...
@@ -72,18 +74,7 @@ def main():
fout
.
write
(
BLANK
+
"
\n
"
)
# 0 will be used for "blank" in CTC
fout
.
write
(
UNK
+
'
\n
'
)
# <unk> must be 1
if
args
.
unit_type
!=
'spm'
:
text_feature
=
TextFeaturizer
(
args
.
unit_type
,
args
.
vocab_path
)
counter
=
Counter
()
for
manifest_path
in
args
.
manifest_paths
:
count_manifest
(
counter
,
text_feature
,
manifest_path
)
count_sorted
=
sorted
(
counter
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
for
char
,
count
in
count_sorted
:
if
count
<
args
.
count_threshold
:
break
fout
.
write
(
char
+
'
\n
'
)
else
:
if
args
.
unit_type
==
'spm'
:
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
...
...
@@ -96,7 +87,7 @@ def main():
# train
spm
.
SentencePieceTrainer
.
Train
(
input
=
fp
.
name
,
vocab_size
=
args
.
count_threshold
,
vocab_size
=
args
.
vocab_size
,
model_type
=
args
.
spm_mode
,
model_prefix
=
args
.
spm_model_prefix
,
input_sentence_size
=
100000000
,
...
...
@@ -105,30 +96,20 @@ def main():
# encode
text_feature
=
TextFeaturizer
(
args
.
unit_type
,
args
.
vocab_path
,
args
.
spm_model_prefix
)
# vocabs = set()
# for manifest_path in args.manifest_paths:
# manifest_jsons = read_manifest(manifest_path)
# for line_json in manifest_jsons:
# line = line_json['text']
# enc_line = text_feature.spm_tokenize(line)
# for code in enc_line:
# vocabs.add(code)
# #print(" ".join(enc_line))
# vocabs_sorted = sorted(vocabs)
# for unit in vocabs_sorted:
# fout.write(unit + "\n")
counter
=
Counter
()
for
manifest_path
in
args
.
manifest_paths
:
count_manifest
(
counter
,
text_feature
,
manifest_path
)
count_sorted
=
sorted
(
counter
.
items
(),
key
=
lambda
x
:
x
[
1
],
reverse
=
True
)
tokens
=
[]
for
token
,
count
in
count_sorted
:
fout
.
write
(
token
+
'
\n
'
)
if
count
<
args
.
count_threshold
:
break
tokens
.
append
(
token
)
print
(
f
"spm vocab size:
{
len
(
count_sorted
)
}
"
)
tokens
=
sorted
(
tokens
)
for
token
in
tokens
:
fout
.
write
(
token
+
'
\n
'
)
fout
.
write
(
SOS
+
"
\n
"
)
# <sos/eos>
fout
.
close
()
...
...
utils/format_data.py
浏览文件 @
553aa359
...
...
@@ -67,16 +67,12 @@ def main():
vocab_size
=
text_feature
.
vocab_size
print
(
f
"Vocab size:
{
vocab_size
}
"
)
count
=
0
for
manifest_path
in
args
.
manifest_paths
:
manifest_jsons
=
read_manifest
(
manifest_path
)
for
line_json
in
manifest_jsons
:
line
=
line_json
[
'text'
]
if
args
.
unit_type
==
'char'
:
tokens
=
text_feature
.
char_tokenize
(
line
)
elif
args
.
unit_type
==
'word'
:
tokens
=
text_feature
.
word_tokenize
(
line
)
else
:
#spm
tokens
=
text_feature
.
spm_tokenize
(
line
)
tokens
=
text_feature
.
tokenize
(
line
)
tokenids
=
text_feature
.
featurize
(
line
)
line_json
[
'token'
]
=
tokens
line_json
[
'token_id'
]
=
tokenids
...
...
@@ -88,7 +84,9 @@ def main():
else
:
# kaldi
raise
NotImplemented
(
'no support kaldi feat now!'
)
fout
.
write
(
json
.
dumps
(
line_json
)
+
'
\n
'
)
count
+=
1
print
(
f
"Examples number:
{
count
}
"
)
fout
.
close
()
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录