Commit dc479df9 (unverified)
Authored Dec 01, 2021 by Hui Zhang
Committed by GitHub, Dec 01, 2021
Merge pull request #1064 from LittleChenCc/develop
[ST] update data process
Parents: 48238921, 72a8c933
Showing 4 changed files with 20 additions and 6 deletions (+20 −6)
examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml   +0 −0
examples/ted_en_zh/st0/local/data.sh                    +2 −1
examples/ted_en_zh/st0/run.sh                           +1 −1
utils/build_vocab.py                                    +17 −4
examples/ted_en_zh/st0/conf/transformer_joint_noam.yaml → examples/ted_en_zh/st0/conf/transformer_mtl_noam.yaml
File moved.
examples/ted_en_zh/st0/local/data.sh

@@ -76,8 +76,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
         --spm_vocab_size=${nbpe} \
         --spm_mode ${bpemode} \
         --spm_model_prefix ${bpeprefix} \
+        --spm_character_coverage 1. \
         --vocab_path="${dict_dir}/vocab.txt" \
-        --text_keys 'text' \
+        --text_keys 'text' 'text1' \
         --manifest_paths="data/manifest.train.raw"
 
     if [ $? -ne 0 ]; then
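Note: in the ted_en_zh ST recipe each manifest entry carries two transcript fields, 'text' and 'text1' (one holds the English transcript, the other the Chinese translation; which is which depends on the recipe's data preparation). Passing both names to --text_keys builds a single vocabulary over both languages, and --spm_character_coverage 1. keeps every character seen in the training text. The sketch below is illustrative only; the utterance id, path, and transcript contents are made up, not taken from the dataset.

import json

# Hypothetical manifest entry; id, path and transcripts are placeholders.
entry = {
    "utt": "ted_0001_0001",
    "feat": "data/wav/ted_0001_0001.wav",
    "text": "machine translation is hard",   # e.g. the English transcript
    "text1": "机器翻译很难",                  # e.g. the Chinese translation
}

# build_vocab.py now consumes both fields, mirroring: --text_keys 'text' 'text1'
for key in ("text", "text1"):
    print(key, "->", json.dumps(entry[key], ensure_ascii=False))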
examples/ted_en_zh/st0/run.sh

@@ -5,7 +5,7 @@ source path.sh
 gpus=0,1,2,3
 stage=0
 stop_stage=100
-conf_path=conf/transformer_joint_noam.yaml
+conf_path=conf/transformer_mtl_noam.yaml
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
 source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
utils/build_vocab.py
@@ -55,6 +55,8 @@ add_arg('text_keys', str,
 add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
 add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
 add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
+add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
 # yapf: disable
 args = parser.parse_args()
@@ -66,8 +68,14 @@ def count_manifest(counter, text_feature, manifest_path):
             manifest_jsons.append(json_data)
     for line_json in manifest_jsons:
-        line = text_feature.tokenize(line_json['text'], replace_space=False)
-        counter.update(line)
+        if isinstance(line_json['text'], str):
+            line = text_feature.tokenize(line_json['text'], replace_space=False)
+            counter.update(line)
+        else:
+            assert isinstance(line_json['text'], list)
+            for text in line_json['text']:
+                line = text_feature.tokenize(text, replace_space=False)
+                counter.update(line)
 
 
 def dump_text_manifest(fileobj, manifest_path, key='text'):
     manifest_jsons = []
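For reference, a standalone sketch of the control flow added to count_manifest(): the 'text' field may now be either a plain string (one text key) or a list of strings (several text keys), and every piece is tokenized into the same counter. The WhitespaceTokenizer below is a stand-in for the repo's text featurizer, not the real class.

from collections import Counter


class WhitespaceTokenizer:
    """Stand-in for the repo's text featurizer; only used in this sketch."""

    def tokenize(self, text, replace_space=False):
        # Crude whitespace split; the real featurizer handles spm/char/word units.
        return text.split()


def count_texts(counter, text_feature, manifest_jsons):
    # Mirrors the str-or-list branch added to count_manifest() above.
    for line_json in manifest_jsons:
        if isinstance(line_json['text'], str):
            counter.update(text_feature.tokenize(line_json['text'], replace_space=False))
        else:
            assert isinstance(line_json['text'], list)
            for text in line_json['text']:
                counter.update(text_feature.tokenize(text, replace_space=False))


counter = Counter()
count_texts(counter, WhitespaceTokenizer(), [
    {"text": "hello world"},               # single text key, plain string
    {"text": ["hello world", "你好 世界"]},  # several text keys, list of strings
])
print(counter.most_common())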
@@ -76,7 +84,12 @@ def dump_text_manifest(fileobj, manifest_path, key='text'):
             manifest_jsons.append(json_data)
     for line_json in manifest_jsons:
-        fileobj.write(line_json[key] + "\n")
+        if isinstance(line_json[key], str):
+            fileobj.write(line_json[key] + "\n")
+        else:
+            assert isinstance(line_json[key], list)
+            for line in line_json[key]:
+                fileobj.write(line + "\n")
 
 
 def main():
     print_arguments(args, globals())
@@ -104,7 +117,7 @@ def main():
             model_type=args.spm_mode,
             model_prefix=args.spm_model_prefix,
             input_sentence_size=100000000,
-            character_coverage=0.9995)
+            character_coverage=args.spm_character_coverage)
         os.unlink(fp.name)
 
     # encode
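The previously hard-coded character_coverage=0.9995 now comes from the new --spm_character_coverage flag, which data.sh sets to 1. for this bilingual recipe. Below is a minimal sketch of how the value reaches SentencePiece training; it assumes a corpus.txt with one sentence per line already exists (build_vocab.py itself dumps the manifest text into a temporary file for this step), and the model prefix and vocab size are placeholders, not the recipe's values.

import sentencepiece as spm

spm.SentencePieceTrainer.Train(
    input="corpus.txt",               # assumed text dump, one sentence per line
    model_prefix="bpe_unigram_8000",  # placeholder for --spm_model_prefix
    vocab_size=8000,                  # placeholder for --spm_vocab_size
    model_type="unigram",             # --spm_mode
    input_sentence_size=100000000,
    character_coverage=1.0)           # --spm_character_coverage (1. in data.sh)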