Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
00017301
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
206
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
00017301
编写于
7月 03, 2021
作者:
小湉湉
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
restructure thchs30/a0
上级
c0ee57d4
变更
9
展开全部
隐藏空白更改
内联
并排
Showing
9 changed file
with
188 addition
and
8165 deletion
+188
-8165
examples/thchs30/README.md
examples/thchs30/README.md
+5
-0
examples/thchs30/a0/data/dict/syllable.lexicon
examples/thchs30/a0/data/dict/syllable.lexicon
+0
-0
examples/thchs30/a0/local/data.sh
examples/thchs30/a0/local/data.sh
+22
-1
examples/thchs30/a0/local/gen_word2phone.py
examples/thchs30/a0/local/gen_word2phone.py
+5
-7
examples/thchs30/a0/local/reorganize_thchs30.py
examples/thchs30/a0/local/reorganize_thchs30.py
+83
-0
examples/thchs30/a0/local/thchs30_cn2phone
examples/thchs30/a0/local/thchs30_cn2phone
+0
-8139
examples/thchs30/a0/path.sh
examples/thchs30/a0/path.sh
+2
-3
examples/thchs30/a0/run.sh
examples/thchs30/a0/run.sh
+8
-15
utils/dump_manifest.py
utils/dump_manifest.py
+63
-0
未找到文件。
examples/thchs30/README.md
浏览文件 @
00017301
This is an example of running MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
To get started, `cd a0` and run `run.sh`.
MFA 对齐所使用的字典
MFA 字典的格式可以参考: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
phone.lexicon 直接使用的是 THCHS-30/data_thchs30/lm_phone/lexicon.txt
word.lexicon 是一个带概率的字典, 生成规则请参考 local/gen_word2phone.py
examples/thchs30/a0/
local/thchs30_pinyin2phone
→
examples/thchs30/a0/
data/dict/syllable.lexicon
浏览文件 @
00017301
文件已移动
examples/thchs30/a0/local/data.sh
浏览文件 @
00017301
...
...
@@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir
-p
data
TARGET_DIR
=
${
MAIN_ROOT
}
/examples/dataset
mkdir
-p
${
TARGET_DIR
}
LEXICON_NAME
=
$1
# download data, generate manifests
if
[
${
stage
}
-le
-1
]
&&
[
${
stop_stage
}
-ge
-1
]
;
then
# download data, generate manifests
python3
${
TARGET_DIR
}
/thchs30/thchs30.py
\
--manifest_prefix
=
"data/manifest"
\
--target_dir
=
"
${
TARGET_DIR
}
/thchs30"
...
...
@@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
# dump manifest to data/
python3
${
MAIN_ROOT
}
/utils/dump_manifest.py
--manifest-path
=
data/manifest.train
--output-dir
=
data
# copy files to data/dict to gen word.lexicon
cp
${
TARGET_DIR
}
/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp
${
TARGET_DIR
}
/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp
${
TARGET_DIR
}
/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
# gen word.lexicon
python
local
/gen_word2phone.py
--root-dir
=
data/dict
--output-dir
=
data/dict
# reorganize dataset for MFA
if
[
!
-d
$EXP_DIR
/thchs30_corpus
]
;
then
echo
"reorganizing thchs30 corpus..."
python
local
/reorganize_thchs30.py
--root-dir
=
data
--output-dir
=
data/thchs30_corpus
--script-type
=
$LEXICON_NAME
echo
"reorganization done."
fi
echo
"THCHS-30 data preparation done."
exit
0
examples/thchs30/a0/local/gen_
cn
2phone.py
→
examples/thchs30/a0/local/gen_
word
2phone.py
浏览文件 @
00017301
...
...
@@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir
=
Path
(
root_dir
).
expanduser
()
output_dir
=
Path
(
output_dir
).
expanduser
()
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
file1
=
root_dir
/
"
data_thchs30/lm_word/lexicon.txt
"
file2
=
root_dir
/
"
resource/dict/lexicon.txt
"
write_file
=
output_dir
/
"
thchs30_cn2phone
"
file1
=
root_dir
/
"
lm_word_lexicon_1
"
file2
=
root_dir
/
"
lm_word_lexicon_2
"
write_file
=
output_dir
/
"
word.lexicon
"
with
open
(
file1
,
"r"
)
as
f1
:
for
line
in
f1
:
...
...
@@ -87,10 +87,8 @@ if __name__ == "__main__":
parser
=
argparse
.
ArgumentParser
(
description
=
"Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
parser
.
add_argument
(
"--root-dir"
,
type
=
str
,
help
=
"path to thchs30 dataset."
)
parser
.
add_argument
(
"--output-dir"
,
type
=
str
,
help
=
"path to save outputs(audio and transcriptions)"
)
"--root-dir"
,
type
=
str
,
help
=
"dir to thchs30 lm_word_lexicons"
)
parser
.
add_argument
(
"--output-dir"
,
type
=
str
,
help
=
"path to save outputs"
)
args
=
parser
.
parse_args
()
gen_lexicon
(
args
.
root_dir
,
args
.
output_dir
)
examples/thchs30/a0/local/re
c
organize_thchs30.py
→
examples/thchs30/a0/local/reorganize_thchs30.py
浏览文件 @
00017301
...
...
@@ -23,64 +23,36 @@ import os
from
pathlib
import
Path
from
typing
import
Union
from
deepspeech.frontend.utility
import
read_manifest
def
link_wav
(
root_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
]):
manifest_path
=
root_dir
/
"manifest.train"
manifest_jsons
=
read_manifest
(
manifest_path
)
for
line_json
in
manifest_jsons
:
wav_path
=
line_json
[
'feat'
]
wav_name
=
wav_path
.
split
(
"/"
)[
-
1
]
new_wav_path
=
output_dir
/
wav_name
os
.
symlink
(
wav_path
,
new_wav_path
)
def
link_lexicon
(
root_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
],
script_type
=
'phone'
):
manifest_path
=
root_dir
/
"manifest.train"
manifest_jsons
=
read_manifest
(
manifest_path
)
line_json
=
manifest_jsons
[
0
]
wav_path
=
line_json
[
'feat'
]
if
script_type
==
'phone'
:
# find lexicon.txt in THCHS-30
grader_father
=
os
.
path
.
abspath
(
os
.
path
.
dirname
(
wav_path
)
+
os
.
path
.
sep
+
".."
)
grader_father
=
Path
(
grader_father
).
expanduser
()
lexicon_name
=
"lexicon.txt"
lexicon_father_dir
=
"lm_phone"
lexicon_path
=
grader_father
/
lexicon_father_dir
/
lexicon_name
elif
script_type
==
'syllable'
:
# find thchs30_pinyin2phone in dir of this py file
py_dir_path
=
os
.
path
.
split
(
os
.
path
.
realpath
(
__file__
))[
0
]
py_dir_path
=
Path
(
py_dir_path
).
expanduser
()
lexicon_path
=
py_dir_path
/
"thchs30_pinyin2phone"
else
:
# script_type == 'text'
# find thchs30_cn2phone in dir of this py file
py_dir_path
=
os
.
path
.
split
(
os
.
path
.
realpath
(
__file__
))[
0
]
py_dir_path
=
Path
(
py_dir_path
).
expanduser
()
lexicon_path
=
py_dir_path
/
"thchs30_cn2phone"
new_lexicon_name
=
script_type
+
".lexicon"
new_lexicon_path
=
os
.
path
.
dirname
(
output_dir
)
+
"/"
+
new_lexicon_name
os
.
symlink
(
lexicon_path
,
new_lexicon_path
)
wav_scp_path
=
root_dir
/
'wav.scp'
with
open
(
wav_scp_path
,
'r'
)
as
rf
:
for
line
in
rf
:
utt
,
feat
=
line
.
strip
().
split
()
wav_path
=
feat
wav_name
=
wav_path
.
split
(
"/"
)[
-
1
]
new_wav_path
=
output_dir
/
wav_name
os
.
symlink
(
wav_path
,
new_wav_path
)
def
dump_lab
(
root_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
],
script_type
=
'phone'
):
# script_type can in {'text', 'syllable', 'phone'}
manifest_path
=
root_dir
/
"manifest.train"
manifest_jsons
=
read_manifest
(
manifest_path
)
for
line_json
in
manifest_jsons
:
utt_id
=
line_json
[
'utt'
]
transcript_name
=
utt_id
+
".lab"
transcript_path
=
output_dir
/
transcript_name
with
open
(
transcript_path
,
'wt'
)
as
wf
:
wf
.
write
(
line_json
[
script_type
]
+
"
\n
"
)
def write_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    """Write one ``<utt_id>.lab`` transcript file per line of ``text.<script_type>``.

    Each non-blank input line is expected to look like
    ``"<utt_id> <token> <token> ..."``; the tokens are written, separated by
    single spaces, to ``output_dir / "<utt_id>.lab"``.

    Args:
        root_dir: Directory containing the ``text.<script_type>`` file.
        output_dir: Existing directory that receives the ``.lab`` files.
        script_type: One of ``'word'``, ``'syllable'`` or ``'phone'``. For
            ``'word'`` the transcript is split into individual characters.
    """
    # Both arguments are annotated as ``Union[str, Path]``; normalise them so
    # the ``/`` operator below also works when plain strings are passed.
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)

    text_path = root_dir / ('text.' + script_type)
    with open(text_path, 'r') as rf:
        for line in rf:
            fields = line.split()
            if not fields:
                # Blank line: there is no utterance id to name a file after.
                continue
            utt_id, tokens = fields[0], fields[1:]
            if script_type == 'word':
                # Put exactly one space between Chinese characters. Joining
                # the tokens first avoids emitting runs of spaces at the
                # original word boundaries.
                context = ' '.join(''.join(tokens))
            else:
                context = ' '.join(tokens)
            transcript_path = output_dir / (utt_id + '.lab')
            with open(transcript_path, 'wt') as wf:
                wf.write(context + "\n")
def
reorganize_thchs30
(
root_dir
:
Union
[
str
,
Path
],
...
...
@@ -90,8 +62,7 @@ def reorganize_thchs30(root_dir: Union[str, Path],
output_dir
=
Path
(
output_dir
).
expanduser
()
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
link_wav
(
root_dir
,
output_dir
)
dump_lab
(
root_dir
,
output_dir
,
script_type
)
link_lexicon
(
root_dir
,
output_dir
,
script_type
)
write_lab
(
root_dir
,
output_dir
,
script_type
)
if
__name__
==
"__main__"
:
...
...
@@ -107,6 +78,6 @@ if __name__ == "__main__":
"--script-type"
,
type
=
str
,
default
=
"phone"
,
help
=
"type of lab (
text
'/'syllable'/'phone')"
)
help
=
"type of lab (
'word
'/'syllable'/'phone')"
)
args
=
parser
.
parse_args
()
reorganize_thchs30
(
args
.
root_dir
,
args
.
output_dir
,
args
.
script_type
)
examples/thchs30/a0/local/thchs30_cn2phone
已删除
100644 → 0
浏览文件 @
c0ee57d4
此差异已折叠。
点击以展开。
examples/thchs30/a0/path.sh
浏览文件 @
00017301
...
...
@@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export
LD_LIBRARY_PATH
=
${
LD_LIBRARY_PATH
}
:/usr/local/lib/
MODEL
=
deepspeech2
export
BIN_DIR
=
${
MAIN_ROOT
}
/deepspeech/exps/
${
MODEL
}
/bin
# MFA is in tools
export
PATH
=
${
MAIN_ROOT
}
/tools/montreal-forced-aligner/bin:
$PATH
\ No newline at end of file
examples/thchs30/a0/run.sh
浏览文件 @
00017301
...
...
@@ -4,33 +4,26 @@ source path.sh
stage
=
0
stop_stage
=
100
EXP_DIR
=
exp
# LEXICON_NAME in {'phone', 'syllable', '
text
'}
# LEXICON_NAME in {'phone', 'syllable', '
word
'}
LEXICON_NAME
=
'phone'
# get machine's cpu core number
NUM_JOBS
=
`
grep
'processor'
/proc/cpuinfo |
sort
-u
|
wc
-l
`
NUM_JOBS
=
$((
NUM_JOBS/2
))
# set MFA num_jobs as half of machine's cpu core number
NUM_JOBS
=
$((
`
nproc
`
/
2
))
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
# download dataset、unzip and generate manifest
# gen lexicon relink gen dump
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# prepare data
bash ./local/data.sh
||
exit
-1
bash ./local/data.sh
$LEXICON_NAME
||
exit
-1
fi
# reorganize dataset for MFA
if
[
!
-d
$EXP_DIR
/thchs30_corpus
]
;
then
echo
"reorganizing thchs30 corpus..."
python
local
/recorganize_thchs30.py
--root-dir
=
./data
--output-dir
=
$EXP_DIR
/thchs30_corpus
--script-type
=
$LEXICON_NAME
echo
"reorganization done."
fi
# MFA is in tools
export
PATH
=
"
${
MAIN_ROOT
}
/tools/montreal-forced-aligner/bin"
# run MFA
if
[
!
-d
"
$EXP_DIR
/thchs30_alignment"
]
;
then
echo
"Start MFA training..."
mfa_train_and_align
$EXP_DIR
/thchs30_corpus
"
$EXP_DIR
/
$LEXICON_NAME
.lexicon"
$EXP_DIR
/thchs30_alignment
-o
$EXP_DIR
/thchs30_model
--clean
--verbose
--temp_directory
exp/.mfa_train_and_align
--num_jobs
$NUM_JOBS
mfa_train_and_align
data/thchs30_corpus
"data
/
$LEXICON_NAME
.lexicon"
$EXP_DIR
/thchs30_alignment
-o
$EXP_DIR
/thchs30_model
--clean
--verbose
--temp_directory
exp/.mfa_train_and_align
--num_jobs
$NUM_JOBS
echo
"training done!
\n
results:
$EXP_DIR
/thchs30_alignment
\n
model:
$EXP_DIR
/thchs30_model
\n
"
fi
mfa_train_and_align data/thchs30_corpus data/dict/
$LEXICON_NAME
.lexicon
$EXP_DIR
/thchs30_alignment
-o
$EXP_DIR
/thchs30_model
--clean
--verbose
--temp_directory
exp/.mfa_train_and_align
--num_jobs
$NUM_JOBS
...
...
utils/dump_manifest.py
0 → 100644
浏览文件 @
00017301
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import
argparse
from
pathlib
import
Path
from
typing
import
Union
from
deepspeech.frontend.utility
import
read_manifest
# Maps a manifest key to the name of the file its values are dumped into.
filename = {
    'text': 'text.word',
    'syllable': 'text.syllable',
    'phone': 'text.phone',
    'feat': 'wav.scp',
}

# Manifest keys that get dumped. Derived from ``filename`` so that every
# whitelisted key is guaranteed to have an output file name.
key_whitelist = set(filename)
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    """Split a manifest into one text file per whitelisted field.

    For every manifest entry and every key of that entry found in
    ``key_whitelist``, the line ``"<utt> <value>"`` is appended to the file
    ``output_dir / filename[key]`` (e.g. ``wav.scp`` for ``'feat'``).
    An empty manifest produces no files.

    Args:
        manifest_path: Path of the manifest, loaded with ``read_manifest``.
        output_dir: Directory receiving the dumped files; created if missing.
    """
    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    # Same convention as the other data-preparation scripts: make sure the
    # destination exists instead of failing on the first open().
    output_dir.mkdir(parents=True, exist_ok=True)
    manifest_jsons = read_manifest(manifest_path)

    file_map = {}
    try:
        for line_json in manifest_jsons:
            for k in line_json:
                if k not in key_whitelist:
                    continue
                # Open lazily so a key that first appears in a later entry
                # still gets its file rather than raising KeyError.
                if k not in file_map:
                    file_map[k] = open(output_dir / filename[k], 'w')
                file_map[k].write(line_json['utt'] + ' ' + line_json[k] + '\n')
    finally:
        # Close every handle even if reading or writing failed part-way.
        for fobj in file_map.values():
            fobj.close()
if __name__ == "__main__":
    # Command-line entry point: dump the given manifest into --output-dir.
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    # Both options are needed by dump_manifest(); marking them required makes
    # argparse print a usage error instead of failing later on Path(None).
    parser.add_argument(
        "--manifest-path", type=str, required=True, help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="directory to save the dumped files (wav.scp, text.word, ...)")
    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录