Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
DeepSpeech
提交
00017301
D
DeepSpeech
项目概览
PaddlePaddle
/
DeepSpeech
大约 1 年 前同步成功
通知
207
Star
8425
Fork
1598
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
245
列表
看板
标记
里程碑
合并请求
3
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
D
DeepSpeech
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
245
Issue
245
列表
看板
标记
里程碑
合并请求
3
合并请求
3
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
00017301
编写于
7月 03, 2021
作者:
小湉湉
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
restructure thchs30/a0
上级
c0ee57d4
变更
9
展开全部
隐藏空白更改
内联
并排
Showing
9 changed file
with
188 addition
and
8165 deletion
+188
-8165
examples/thchs30/README.md
examples/thchs30/README.md
+5
-0
examples/thchs30/a0/data/dict/syllable.lexicon
examples/thchs30/a0/data/dict/syllable.lexicon
+0
-0
examples/thchs30/a0/local/data.sh
examples/thchs30/a0/local/data.sh
+22
-1
examples/thchs30/a0/local/gen_word2phone.py
examples/thchs30/a0/local/gen_word2phone.py
+5
-7
examples/thchs30/a0/local/reorganize_thchs30.py
examples/thchs30/a0/local/reorganize_thchs30.py
+83
-0
examples/thchs30/a0/local/thchs30_cn2phone
examples/thchs30/a0/local/thchs30_cn2phone
+0
-8139
examples/thchs30/a0/path.sh
examples/thchs30/a0/path.sh
+2
-3
examples/thchs30/a0/run.sh
examples/thchs30/a0/run.sh
+8
-15
utils/dump_manifest.py
utils/dump_manifest.py
+63
-0
未找到文件。
examples/thchs30/README.md
浏览文件 @
00017301
This is an example of using MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
This is an example of using MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
To get started, cd into a0 and run run.sh.
To get started, cd into a0 and run run.sh.
MFA 对齐所使用的字典
MFA 字典的格式可以参考: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
phone.lexicon 直接使用的是 THCHS-30/data_thchs30/lm_phone/lexicon.txt
word.lexicon 是一个带概率的字典, 生成规则请参考 local/gen_word2phone.py
examples/thchs30/a0/
local/thchs30_pinyin2phone
→
examples/thchs30/a0/
data/dict/syllable.lexicon
浏览文件 @
00017301
文件已移动
examples/thchs30/a0/local/data.sh
浏览文件 @
00017301
...
@@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
...
@@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir
-p
data
mkdir
-p
data
TARGET_DIR
=
${
MAIN_ROOT
}
/examples/dataset
TARGET_DIR
=
${
MAIN_ROOT
}
/examples/dataset
mkdir
-p
${
TARGET_DIR
}
mkdir
-p
${
TARGET_DIR
}
LEXICON_NAME
=
$1
# download data, generate manifests
if
[
${
stage
}
-le
-1
]
&&
[
${
stop_stage
}
-ge
-1
]
;
then
if
[
${
stage
}
-le
-1
]
&&
[
${
stop_stage
}
-ge
-1
]
;
then
# download data, generate manifests
python3
${
TARGET_DIR
}
/thchs30/thchs30.py
\
python3
${
TARGET_DIR
}
/thchs30/thchs30.py
\
--manifest_prefix
=
"data/manifest"
\
--manifest_prefix
=
"data/manifest"
\
--target_dir
=
"
${
TARGET_DIR
}
/thchs30"
--target_dir
=
"
${
TARGET_DIR
}
/thchs30"
...
@@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
...
@@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
fi
# dump manifest to data/
python3
${
MAIN_ROOT
}
/utils/dump_manifest.py
--manifest-path
=
data/manifest.train
--output-dir
=
data
# copy files to data/dict to gen word.lexicon
cp
${
TARGET_DIR
}
/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp
${
TARGET_DIR
}
/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp
${
TARGET_DIR
}
/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
# gen word.lexicon
python
local
/gen_word2phone.py
--root-dir
=
data/dict
--output-dir
=
data/dict
# reorganize dataset for MFA
if
[
!
-d
$EXP_DIR
/thchs30_corpus
]
;
then
echo
"reorganizing thchs30 corpus..."
python
local
/reorganize_thchs30.py
--root-dir
=
data
--output-dir
=
data/thchs30_corpus
--script-type
=
$LEXICON_NAME
echo
"reorganization done."
fi
echo
"THCHS-30 data preparation done."
echo
"THCHS-30 data preparation done."
exit
0
exit
0
examples/thchs30/a0/local/gen_
cn
2phone.py
→
examples/thchs30/a0/local/gen_
word
2phone.py
浏览文件 @
00017301
...
@@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
...
@@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir
=
Path
(
root_dir
).
expanduser
()
root_dir
=
Path
(
root_dir
).
expanduser
()
output_dir
=
Path
(
output_dir
).
expanduser
()
output_dir
=
Path
(
output_dir
).
expanduser
()
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
file1
=
root_dir
/
"
data_thchs30/lm_word/lexicon.txt
"
file1
=
root_dir
/
"
lm_word_lexicon_1
"
file2
=
root_dir
/
"
resource/dict/lexicon.txt
"
file2
=
root_dir
/
"
lm_word_lexicon_2
"
write_file
=
output_dir
/
"
thchs30_cn2phone
"
write_file
=
output_dir
/
"
word.lexicon
"
with
open
(
file1
,
"r"
)
as
f1
:
with
open
(
file1
,
"r"
)
as
f1
:
for
line
in
f1
:
for
line
in
f1
:
...
@@ -87,10 +87,8 @@ if __name__ == "__main__":
...
@@ -87,10 +87,8 @@ if __name__ == "__main__":
parser
=
argparse
.
ArgumentParser
(
parser
=
argparse
.
ArgumentParser
(
description
=
"Gen Chinese characters to phone lexicon for THCHS-30 dataset"
description
=
"Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
)
parser
.
add_argument
(
"--root-dir"
,
type
=
str
,
help
=
"path to thchs30 dataset."
)
parser
.
add_argument
(
parser
.
add_argument
(
"--output-dir"
,
"--root-dir"
,
type
=
str
,
help
=
"dir to thchs30 lm_word_lexicons"
)
type
=
str
,
parser
.
add_argument
(
"--output-dir"
,
type
=
str
,
help
=
"path to save outputs"
)
help
=
"path to save outputs(audio and transcriptions)"
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
gen_lexicon
(
args
.
root_dir
,
args
.
output_dir
)
gen_lexicon
(
args
.
root_dir
,
args
.
output_dir
)
examples/thchs30/a0/local/re
c
organize_thchs30.py
→
examples/thchs30/a0/local/reorganize_thchs30.py
浏览文件 @
00017301
...
@@ -23,64 +23,36 @@ import os
...
@@ -23,64 +23,36 @@ import os
from
pathlib
import
Path
from
pathlib
import
Path
from
typing
import
Union
from
typing
import
Union
from
deepspeech.frontend.utility
import
read_manifest
def
link_wav
(
root_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
]):
def
link_wav
(
root_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
]):
manifest_path
=
root_dir
/
"manifest.train"
wav_scp_path
=
root_dir
/
'wav.scp'
manifest_jsons
=
read_manifest
(
manifest_path
)
with
open
(
wav_scp_path
,
'r'
)
as
rf
:
for
line_json
in
manifest_jsons
:
for
line
in
rf
:
wav_path
=
line_json
[
'feat'
]
utt
,
feat
=
line
.
strip
().
split
()
wav_name
=
wav_path
.
split
(
"/"
)[
-
1
]
wav_path
=
feat
new_wav_path
=
output_dir
/
wav_name
wav_name
=
wav_path
.
split
(
"/"
)[
-
1
]
os
.
symlink
(
wav_path
,
new_wav_path
)
new_wav_path
=
output_dir
/
wav_name
os
.
symlink
(
wav_path
,
new_wav_path
)
def
link_lexicon
(
root_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
],
script_type
=
'phone'
):
manifest_path
=
root_dir
/
"manifest.train"
manifest_jsons
=
read_manifest
(
manifest_path
)
line_json
=
manifest_jsons
[
0
]
wav_path
=
line_json
[
'feat'
]
if
script_type
==
'phone'
:
# find lexicon.txt in THCHS-30
grader_father
=
os
.
path
.
abspath
(
os
.
path
.
dirname
(
wav_path
)
+
os
.
path
.
sep
+
".."
)
grader_father
=
Path
(
grader_father
).
expanduser
()
lexicon_name
=
"lexicon.txt"
lexicon_father_dir
=
"lm_phone"
lexicon_path
=
grader_father
/
lexicon_father_dir
/
lexicon_name
elif
script_type
==
'syllable'
:
# find thchs30_pinyin2phone in dir of this py file
py_dir_path
=
os
.
path
.
split
(
os
.
path
.
realpath
(
__file__
))[
0
]
py_dir_path
=
Path
(
py_dir_path
).
expanduser
()
lexicon_path
=
py_dir_path
/
"thchs30_pinyin2phone"
else
:
# script_type == 'text'
# find thchs30_cn2phone in dir of this py file
py_dir_path
=
os
.
path
.
split
(
os
.
path
.
realpath
(
__file__
))[
0
]
py_dir_path
=
Path
(
py_dir_path
).
expanduser
()
lexicon_path
=
py_dir_path
/
"thchs30_cn2phone"
new_lexicon_name
=
script_type
+
".lexicon"
new_lexicon_path
=
os
.
path
.
dirname
(
output_dir
)
+
"/"
+
new_lexicon_name
os
.
symlink
(
lexicon_path
,
new_lexicon_path
)
def
dump_lab
(
root_dir
:
Union
[
str
,
Path
],
def
write_lab
(
root_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
],
output_dir
:
Union
[
str
,
Path
],
script_type
=
'phone'
):
script_type
=
'phone'
):
# script_type can in {'text', 'syllable', 'phone'}
# script_type can in {'word', 'syllable', 'phone'}
manifest_path
=
root_dir
/
"manifest.train"
json_name
=
'text.'
+
script_type
manifest_jsons
=
read_manifest
(
manifest_path
)
json_path
=
root_dir
/
json_name
for
line_json
in
manifest_jsons
:
with
open
(
json_path
,
'r'
)
as
rf
:
utt_id
=
line_json
[
'utt'
]
for
line
in
rf
:
transcript_name
=
utt_id
+
".lab"
line
=
line
.
strip
().
split
()
transcript_path
=
output_dir
/
transcript_name
utt_id
=
line
[
0
]
with
open
(
transcript_path
,
'wt'
)
as
wf
:
context
=
' '
.
join
(
line
[
1
:])
wf
.
write
(
line_json
[
script_type
]
+
"
\n
"
)
transcript_name
=
utt_id
+
'.lab'
transcript_path
=
output_dir
/
transcript_name
with
open
(
transcript_path
,
'wt'
)
as
wf
:
if
script_type
==
'word'
:
# add space between chinese char
context
=
''
.
join
([
f
+
' '
for
f
in
context
])[:
-
1
]
wf
.
write
(
context
+
"
\n
"
)
def
reorganize_thchs30
(
root_dir
:
Union
[
str
,
Path
],
def
reorganize_thchs30
(
root_dir
:
Union
[
str
,
Path
],
...
@@ -90,8 +62,7 @@ def reorganize_thchs30(root_dir: Union[str, Path],
...
@@ -90,8 +62,7 @@ def reorganize_thchs30(root_dir: Union[str, Path],
output_dir
=
Path
(
output_dir
).
expanduser
()
output_dir
=
Path
(
output_dir
).
expanduser
()
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
output_dir
.
mkdir
(
parents
=
True
,
exist_ok
=
True
)
link_wav
(
root_dir
,
output_dir
)
link_wav
(
root_dir
,
output_dir
)
dump_lab
(
root_dir
,
output_dir
,
script_type
)
write_lab
(
root_dir
,
output_dir
,
script_type
)
link_lexicon
(
root_dir
,
output_dir
,
script_type
)
if
__name__
==
"__main__"
:
if
__name__
==
"__main__"
:
...
@@ -107,6 +78,6 @@ if __name__ == "__main__":
...
@@ -107,6 +78,6 @@ if __name__ == "__main__":
"--script-type"
,
"--script-type"
,
type
=
str
,
type
=
str
,
default
=
"phone"
,
default
=
"phone"
,
help
=
"type of lab (
text
'/'syllable'/'phone')"
)
help
=
"type of lab (
'word
'/'syllable'/'phone')"
)
args
=
parser
.
parse_args
()
args
=
parser
.
parse_args
()
reorganize_thchs30
(
args
.
root_dir
,
args
.
output_dir
,
args
.
script_type
)
reorganize_thchs30
(
args
.
root_dir
,
args
.
output_dir
,
args
.
script_type
)
examples/thchs30/a0/local/thchs30_cn2phone
已删除
100644 → 0
浏览文件 @
c0ee57d4
此差异已折叠。
点击以展开。
examples/thchs30/a0/path.sh
浏览文件 @
00017301
...
@@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
...
@@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export
LD_LIBRARY_PATH
=
${
LD_LIBRARY_PATH
}
:/usr/local/lib/
export
LD_LIBRARY_PATH
=
${
LD_LIBRARY_PATH
}
:/usr/local/lib/
# MFA is in tools
MODEL
=
deepspeech2
export
PATH
=
${
MAIN_ROOT
}
/tools/montreal-forced-aligner/bin:
$PATH
export
BIN_DIR
=
${
MAIN_ROOT
}
/deepspeech/exps/
${
MODEL
}
/bin
\ No newline at end of file
examples/thchs30/a0/run.sh
浏览文件 @
00017301
...
@@ -4,33 +4,26 @@ source path.sh
...
@@ -4,33 +4,26 @@ source path.sh
stage
=
0
stage
=
0
stop_stage
=
100
stop_stage
=
100
EXP_DIR
=
exp
EXP_DIR
=
exp
# LEXICON_NAME in {'phone', 'syllable', '
text
'}
# LEXICON_NAME in {'phone', 'syllable', '
word
'}
LEXICON_NAME
=
'phone'
LEXICON_NAME
=
'phone'
# get machine's cpu core number
# set MFA num_jobs as half of machine's cpu core number
NUM_JOBS
=
`
grep
'processor'
/proc/cpuinfo |
sort
-u
|
wc
-l
`
NUM_JOBS
=
$((
`
nproc
`
/
2
))
NUM_JOBS
=
$((
NUM_JOBS/2
))
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
source
${
MAIN_ROOT
}
/utils/parse_options.sh
||
exit
1
;
# download dataset、unzip and generate manifest
# download dataset、unzip and generate manifest
# gen lexicon relink gen dump
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
if
[
${
stage
}
-le
0
]
&&
[
${
stop_stage
}
-ge
0
]
;
then
# prepare data
# prepare data
bash ./local/data.sh
||
exit
-1
bash ./local/data.sh
$LEXICON_NAME
||
exit
-1
fi
fi
# reorganize dataset for MFA
# run MFA
if
[
!
-d
$EXP_DIR
/thchs30_corpus
]
;
then
echo
"reorganizing thchs30 corpus..."
python
local
/recorganize_thchs30.py
--root-dir
=
./data
--output-dir
=
$EXP_DIR
/thchs30_corpus
--script-type
=
$LEXICON_NAME
echo
"reorganization done."
fi
# MFA is in tools
export
PATH
=
"
${
MAIN_ROOT
}
/tools/montreal-forced-aligner/bin"
if
[
!
-d
"
$EXP_DIR
/thchs30_alignment"
]
;
then
if
[
!
-d
"
$EXP_DIR
/thchs30_alignment"
]
;
then
echo
"Start MFA training..."
echo
"Start MFA training..."
mfa_train_and_align
$EXP_DIR
/thchs30_corpus
"
$EXP_DIR
/
$LEXICON_NAME
.lexicon"
$EXP_DIR
/thchs30_alignment
-o
$EXP_DIR
/thchs30_model
--clean
--verbose
--temp_directory
exp/.mfa_train_and_align
--num_jobs
$NUM_JOBS
mfa_train_and_align
data/thchs30_corpus
"data
/
$LEXICON_NAME
.lexicon"
$EXP_DIR
/thchs30_alignment
-o
$EXP_DIR
/thchs30_model
--clean
--verbose
--temp_directory
exp/.mfa_train_and_align
--num_jobs
$NUM_JOBS
echo
"training done!
\n
results:
$EXP_DIR
/thchs30_alignment
\n
model:
$EXP_DIR
/thchs30_model
\n
"
echo
"training done!
\n
results:
$EXP_DIR
/thchs30_alignment
\n
model:
$EXP_DIR
/thchs30_model
\n
"
fi
fi
mfa_train_and_align data/thchs30_corpus data/dict/
$LEXICON_NAME
.lexicon
$EXP_DIR
/thchs30_alignment
-o
$EXP_DIR
/thchs30_model
--clean
--verbose
--temp_directory
exp/.mfa_train_and_align
--num_jobs
$NUM_JOBS
...
...
utils/dump_manifest.py
0 → 100644
浏览文件 @
00017301
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import
argparse
from
pathlib
import
Path
from
typing
import
Union
from
deepspeech.frontend.utility
import
read_manifest
# Maps each manifest field that gets dumped to the name of the file it is
# written to inside the output directory.
filename = {
    'text': 'text.word',
    'syllable': 'text.syllable',
    'phone': 'text.phone',
    'feat': 'wav.scp',
}
# Manifest fields that are dumped. Derived from `filename` so that the
# whitelist and the file-name table cannot drift apart.
key_whitelist = set(filename)


def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    """Split a manifest into one ``<utt> <value>`` text file per known field.

    Every manifest entry is expected to be a mapping with an ``'utt'`` key.
    For each field of an entry that appears in ``key_whitelist``, one line
    ``"<utt> <value>\\n"`` is appended to ``output_dir / filename[field]``.
    Fields outside the whitelist are ignored.

    Args:
        manifest_path: Path to the manifest file; ``~`` is expanded. It is
            loaded with ``read_manifest``.
        output_dir: Directory that receives the dumped files; ``~`` is
            expanded and the directory is created if it does not exist.

    Raises:
        KeyError: If an entry has no ``'utt'`` key.
        TypeError: If ``'utt'`` or a dumped field value is not a string.
    """
    # Imported locally so this function does not depend on the module's
    # import block being extended.
    from contextlib import ExitStack

    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    manifest_jsons = read_manifest(manifest_path)

    output_dir.mkdir(parents=True, exist_ok=True)

    # ExitStack guarantees that every opened file is closed, even when a
    # write fails half-way through the manifest.
    with ExitStack() as stack:
        file_map = {}
        for line_json in manifest_jsons:
            utt = line_json['utt']
            for k in line_json:
                if k not in key_whitelist:
                    continue
                # Open lazily: a field may first show up in a later entry,
                # and an empty manifest then simply produces no files.
                if k not in file_map:
                    file_map[k] = stack.enter_context(
                        open(output_dir / filename[k], 'w'))
                file_map[k].write(utt + ' ' + line_json[k] + '\n')
if __name__ == "__main__":
    # Command-line entry point: parse the manifest path and the output
    # directory, then dump the manifest.
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    # Both options are mandatory; marking them required makes argparse report
    # a clear usage error instead of failing later on Path(None).
    parser.add_argument(
        "--manifest-path", type=str, required=True, help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="directory to write wav.scp and text.* files to")
    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录