Commit 00017301, authored by: 小湉湉

restructure thchs30/a0

Parent: c0ee57d4
This is an example of running MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
cd a0 and run run.sh to get started.
Dictionaries used for MFA alignment
The format of an MFA dictionary is documented at: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
phone.lexicon is taken directly from THCHS-30/data_thchs30/lm_phone/lexicon.txt
word.lexicon is a lexicon with pronunciation probabilities; see local/gen_word2phone.py for how it is generated
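For orientation, each line of an MFA lexicon maps one entry (a phone, syllable or word, depending on LEXICON_NAME) to its phone sequence, and a probabilistic lexicon such as word.lexicon carries an extra pronunciation-probability column; the exact layout is described in the MFA documentation linked above. A made-up word.lexicon-style line might look like:

    你好 0.75 n i3 h ao3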
@@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
LEXICON_NAME=$1
# download data, generate manifests
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
    python3 ${TARGET_DIR}/thchs30/thchs30.py \
        --manifest_prefix="data/manifest" \
        --target_dir="${TARGET_DIR}/thchs30"
@@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to generate word.lexicon
mkdir -p data/dict  # ensure the target dir exists before copying
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
# gen word.lexicon
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict
# reorganize dataset for MFA
if [ ! -d data/thchs30_corpus ]; then
    echo "reorganizing thchs30 corpus..."
    python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
    echo "reorganization done."
fi
echo "THCHS-30 data preparation done." echo "THCHS-30 data preparation done."
exit 0 exit 0
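After local/data.sh completes, data/ should roughly contain (names taken from the commands above; the exact manifest splits come from thchs30.py):

    data/manifest.*           manifests generated by thchs30.py
    data/wav.scp, data/text.word, data/text.phone (and data/text.syllable when present), dumped from manifest.train
    data/dict/                lm_word_lexicon_1, lm_word_lexicon_2, phone.lexicon, word.lexicon
    data/thchs30_corpus/      wav symlinks and .lab files for MFA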
@@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    root_dir = Path(root_dir).expanduser()
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    file1 = root_dir / "lm_word_lexicon_1"
    file2 = root_dir / "lm_word_lexicon_2"
    write_file = output_dir / "word.lexicon"

    with open(file1, "r") as f1:
        for line in f1:
@@ -87,10 +87,8 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Generate a Chinese character to phone lexicon for the THCHS-30 dataset"
    )
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument( parser.add_argument(
"--output-dir", "--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
type=str, parser.add_argument("--output-dir", type=str, help="path to save outputs")
help="path to save outputs(audio and transcriptions)")
    args = parser.parse_args()
    gen_lexicon(args.root_dir, args.output_dir)
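The diff above only shows the parts of local/gen_word2phone.py that changed. As a rough idea of what generating a probabilistic word lexicon can look like, here is a minimal sketch; it is NOT the actual script, and the function name build_word_lexicon and the probability-by-relative-frequency rule are assumptions for illustration only:

# Minimal, illustrative sketch (not the real local/gen_word2phone.py):
# merge several word -> phones lexicon files and attach a relative frequency
# to each distinct pronunciation of a word.
from collections import Counter, defaultdict

def build_word_lexicon(lexicon_files, out_path):
    prons = defaultdict(Counter)  # word -> Counter of phone-string pronunciations
    for fname in lexicon_files:
        with open(fname, encoding='utf-8') as f:
            for raw in f:
                parts = raw.strip().split()
                if len(parts) < 2:
                    continue
                word, phones = parts[0], ' '.join(parts[1:])
                prons[word][phones] += 1
    with open(out_path, 'w', encoding='utf-8') as wf:
        for word, counter in prons.items():
            total = sum(counter.values())
            for phones, cnt in counter.items():
                # probability column between the word and its phones
                wf.write(f"{word} {cnt / total:.2f} {phones}\n")

# hypothetical usage, matching the paths data.sh copies into data/dict:
# build_word_lexicon(["data/dict/lm_word_lexicon_1", "data/dict/lm_word_lexicon_2"],
#                    "data/dict/word.lexicon")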
@@ -23,64 +23,36 @@ import os
from pathlib import Path
from typing import Union
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
    wav_scp_path = root_dir / 'wav.scp'
    with open(wav_scp_path, 'r') as rf:
        for line in rf:
            utt, feat = line.strip().split()
            wav_path = feat
            wav_name = wav_path.split("/")[-1]
            new_wav_path = output_dir / wav_name
            os.symlink(wav_path, new_wav_path)
def write_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    # script_type can be one of {'word', 'syllable', 'phone'}
    json_name = 'text.' + script_type
    json_path = root_dir / json_name
    with open(json_path, 'r') as rf:
        for line in rf:
            line = line.strip().split()
            utt_id = line[0]
            context = ' '.join(line[1:])
            transcript_name = utt_id + '.lab'
            transcript_path = output_dir / transcript_name
            with open(transcript_path, 'wt') as wf:
                if script_type == 'word':
                    # insert a space between Chinese characters
                    context = ' '.join(context)
                wf.write(context + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
@@ -90,8 +62,7 @@ def reorganize_thchs30(root_dir: Union[str, Path],
    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)

    link_wav(root_dir, output_dir)
    write_lab(root_dir, output_dir, script_type)
if __name__ == "__main__":
@@ -107,6 +78,6 @@ if __name__ == "__main__":
"--script-type", "--script-type",
type=str, type=str,
default="phone", default="phone",
help="type of lab (text'/'syllable'/'phone')") help="type of lab ('word'/'syllable'/'phone')")
    args = parser.parse_args()
    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
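After reorganize_thchs30.py has run, every utterance in the corpus directory is a wav symlink plus a .lab transcript sharing the same stem, a layout mfa_train_and_align can consume directly. For a hypothetical utterance id A11_0001 that means roughly:

    data/thchs30_corpus/A11_0001.wav   (symlink to the original THCHS-30 wav)
    data/thchs30_corpus/A11_0001.lab   (one line: the phone/syllable/word transcript)

For the 'word' case, write_lab additionally separates the Chinese characters with spaces, presumably so that each character lines up with an entry in word.lexicon.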
This diff is collapsed.
@@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
# MFA is in tools
export PATH=${MAIN_ROOT}/tools/montreal-forced-aligner/bin:$PATH
@@ -4,33 +4,26 @@ source path.sh
stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'word'}
LEXICON_NAME='phone'
# set MFA num_jobs to half of the machine's CPU core count
NUM_JOBS=$((`nproc`/2))
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download the dataset, unzip it and generate manifests
# gen lexicons, relink wavs and dump labs
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
    bash ./local/data.sh $LEXICON_NAME || exit -1
fi
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..." echo "Start MFA training..."
    mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n" echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi
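To align at a different granularity, set LEXICON_NAME at the top of run.sh to 'syllable' or 'word' before running (with the Kaldi-style parse_options.sh sourced above, it may also be overridable from the command line); data.sh then passes it through to reorganize_thchs30.py, and the matching data/dict/<LEXICON_NAME>.lexicon is used for MFA training.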
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import argparse
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
key_whitelist = set(['feat', 'text', 'syllable', 'phone'])
filename = {
    'text': 'text.word',
    'syllable': 'text.syllable',
    'phone': 'text.phone',
    'feat': 'wav.scp',
}
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    manifest_jsons = read_manifest(manifest_path)
    first_line = manifest_jsons[0]
    file_map = {}

    for k in first_line.keys():
        if k not in key_whitelist:
            continue
        file_map[k] = open(output_dir / filename[k], 'w')

    for line_json in manifest_jsons:
        for k in line_json.keys():
            if k not in key_whitelist:
                continue
            file_map[k].write(line_json['utt'] + ' ' + line_json[k] + '\n')

    for _, file in file_map.items():
        file.close()
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    parser.add_argument("--manifest-path", type=str, help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        help="path to save outputs (audio and transcriptions)")

    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)
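As a worked example (the utterance id and path are made up), a manifest entry such as

    {"utt": "A11_0001", "feat": "/path/to/A11_0001.wav", "text": "今天 天气 很 好", "phone": "j in1 t ian1 ..."}

would contribute "A11_0001 /path/to/A11_0001.wav" to wav.scp and "A11_0001 今天 天气 很 好" to text.word, with text.phone (and text.syllable, when that key exists) written the same way.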