提交 00017301 编写于 作者: 小湉湉's avatar 小湉湉

restructure thchs30/a0

上级 c0ee57d4
This is an example of running MFA (Montreal Forced Aligner) on the THCHS-30 dataset.
To get started, `cd a0` and run `run.sh`.
MFA 对齐所使用的字典
MFA 字典的格式可以参考: https://montreal-forced-aligner.readthedocs.io/en/latest/dictionary.html
phone.lexicon 直接使用的是 THCHS-30/data_thchs30/lm_phone/lexicon.txt
word.lexicon 是一个带概率的字典, 生成规则请参考 local/gen_word2phone.py
......@@ -8,9 +8,10 @@ source ${MAIN_ROOT}/utils/parse_options.sh
mkdir -p data
TARGET_DIR=${MAIN_ROOT}/examples/dataset
mkdir -p ${TARGET_DIR}
LEXICON_NAME=$1
# download data, generate manifests
if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
# download data, generate manifests
python3 ${TARGET_DIR}/thchs30/thchs30.py \
--manifest_prefix="data/manifest" \
--target_dir="${TARGET_DIR}/thchs30"
......@@ -22,5 +23,25 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
fi
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
# gen word.lexicon
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict
# reorganize dataset for MFA
# The guard must test the same directory that is passed as --output-dir below,
# so the step is skipped once the corpus has been laid out.
if [ ! -d data/thchs30_corpus ]; then
    echo "reorganizing thchs30 corpus..."
    python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type="${LEXICON_NAME}"
    echo "reorganization done."
fi
echo "THCHS-30 data preparation done."
exit 0
......@@ -53,9 +53,9 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
file1 = root_dir / "data_thchs30/lm_word/lexicon.txt"
file2 = root_dir / "resource/dict/lexicon.txt"
write_file = output_dir / "thchs30_cn2phone"
file1 = root_dir / "lm_word_lexicon_1"
file2 = root_dir / "lm_word_lexicon_2"
write_file = output_dir / "word.lexicon"
with open(file1, "r") as f1:
for line in f1:
......@@ -87,10 +87,8 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
parser.add_argument("--root-dir", type=str, help="path to thchs30 dataset.")
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
"--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
parser.add_argument("--output-dir", type=str, help="path to save outputs")
args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir)
......@@ -23,64 +23,36 @@ import os
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
def link_wav(root_dir: Union[str, Path], output_dir: Union[str, Path]):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
wav_path = line_json['feat']
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def link_lexicon(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
line_json = manifest_jsons[0]
wav_path = line_json['feat']
if script_type == 'phone':
# find lexicon.txt in THCHS-30
grader_father = os.path.abspath(
os.path.dirname(wav_path) + os.path.sep + "..")
grader_father = Path(grader_father).expanduser()
lexicon_name = "lexicon.txt"
lexicon_father_dir = "lm_phone"
lexicon_path = grader_father / lexicon_father_dir / lexicon_name
elif script_type == 'syllable':
# find thchs30_pinyin2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_pinyin2phone"
else:
# script_type == 'text'
# find thchs30_cn2phone in dir of this py file
py_dir_path = os.path.split(os.path.realpath(__file__))[0]
py_dir_path = Path(py_dir_path).expanduser()
lexicon_path = py_dir_path / "thchs30_cn2phone"
new_lexicon_name = script_type + ".lexicon"
new_lexicon_path = os.path.dirname(output_dir) + "/" + new_lexicon_name
os.symlink(lexicon_path, new_lexicon_path)
wav_scp_path = root_dir / 'wav.scp'
with open(wav_scp_path, 'r') as rf:
for line in rf:
utt, feat = line.strip().split()
wav_path = feat
wav_name = wav_path.split("/")[-1]
new_wav_path = output_dir / wav_name
os.symlink(wav_path, new_wav_path)
def dump_lab(root_dir: Union[str, Path],
output_dir: Union[str, Path],
script_type='phone'):
# script_type can in {'text', 'syllable', 'phone'}
manifest_path = root_dir / "manifest.train"
manifest_jsons = read_manifest(manifest_path)
for line_json in manifest_jsons:
utt_id = line_json['utt']
transcript_name = utt_id + ".lab"
transcript_path = output_dir / transcript_name
with open(transcript_path, 'wt') as wf:
wf.write(line_json[script_type] + "\n")
def write_lab(root_dir: Union[str, Path],
              output_dir: Union[str, Path],
              script_type='phone'):
    """Write one ``<utt_id>.lab`` transcript file per line of ``text.<script_type>``.

    Every non-blank line of ``root_dir/text.<script_type>`` is split on
    whitespace: the first field is the utterance id and the remaining fields
    form the transcript, which is written (followed by a newline) to
    ``output_dir/<utt_id>.lab``.

    Args:
        root_dir: directory containing ``text.<script_type>``.
        output_dir: directory that receives the ``.lab`` files; it is not
            created here.
        script_type: one of 'word', 'syllable' or 'phone'. For 'word' a space
            is inserted between every character of the transcript.
    """
    # The signature accepts plain strings, so normalise before using "/".
    root_dir = Path(root_dir)
    output_dir = Path(output_dir)
    text_path = root_dir / ('text.' + script_type)
    with open(text_path, 'r') as rf:
        for line in rf:
            fields = line.strip().split()
            if not fields:
                # blank line: there is no utterance id to name a file after
                continue
            utt_id = fields[0]
            context = ' '.join(fields[1:])
            if script_type == 'word':
                # add a space between Chinese characters; spaces already in
                # the transcript are separated as well, so the original word
                # boundaries end up as runs of three spaces
                context = ' '.join(context)
            transcript_path = output_dir / (utt_id + '.lab')
            with open(transcript_path, 'wt') as wf:
                wf.write(context + "\n")
def reorganize_thchs30(root_dir: Union[str, Path],
......@@ -90,8 +62,7 @@ def reorganize_thchs30(root_dir: Union[str, Path],
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
dump_lab(root_dir, output_dir, script_type)
link_lexicon(root_dir, output_dir, script_type)
write_lab(root_dir, output_dir, script_type)
if __name__ == "__main__":
......@@ -107,6 +78,6 @@ if __name__ == "__main__":
"--script-type",
type=str,
default="phone",
help="type of lab (text'/'syllable'/'phone')")
help="type of lab ('word'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
此差异已折叠。
......@@ -9,6 +9,5 @@ export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
MODEL=deepspeech2
export BIN_DIR=${MAIN_ROOT}/deepspeech/exps/${MODEL}/bin
# MFA is in tools
export PATH=${MAIN_ROOT}/tools/montreal-forced-aligner/bin:$PATH
\ No newline at end of file
......@@ -4,33 +4,26 @@ source path.sh
stage=0
stop_stage=100
EXP_DIR=exp
# LEXICON_NAME in {'phone', 'syllable', 'text'}
# LEXICON_NAME in {'phone', 'syllable', 'word'}
LEXICON_NAME='phone'
# get machine's cpu core number
NUM_JOBS=`grep 'processor' /proc/cpuinfo | sort -u | wc -l`
NUM_JOBS=$((NUM_JOBS/2))
# set MFA num_jobs as half of machine's cpu core number
NUM_JOBS=$((`nproc`/2))
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# download dataset、unzip and generate manifest
# gen lexicon relink gen dump
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh || exit -1
bash ./local/data.sh $LEXICON_NAME|| exit -1
fi
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/recorganize_thchs30.py --root-dir=./data --output-dir=$EXP_DIR/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
# MFA is in tools
export PATH="${MAIN_ROOT}/tools/montreal-forced-aligner/bin"
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align $EXP_DIR/thchs30_corpus "$EXP_DIR/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
mfa_train_and_align data/thchs30_corpus "data/$LEXICON_NAME.lexicon" $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
......
#!/usr/bin/env python3
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest into wav.scp text.word [text.syllable text.phone]"""
import argparse
from pathlib import Path
from typing import Union
from deepspeech.frontend.utility import read_manifest
# Manifest keys that are exported; any other key in a manifest record is skipped.
key_whitelist = {'feat', 'text', 'syllable', 'phone'}

# Name of the output file (inside the output directory) for each exported key.
filename = dict(
    text='text.word',
    syllable='text.syllable',
    phone='text.phone',
    feat='wav.scp',
)
def dump_manifest(manifest_path, output_dir: Union[str, Path]):
    """Split a manifest into one ``<utt> <value>`` text file per whitelisted key.

    For every record returned by ``read_manifest`` and every key of that
    record found in ``key_whitelist``, the line ``<utt> <value>`` is appended
    to ``output_dir/filename[key]``.

    Args:
        manifest_path: path to the manifest file; ``~`` is expanded.
        output_dir: directory that receives the output files; ``~`` is
            expanded and the directory is created if it does not exist.
    """
    from contextlib import ExitStack

    output_dir = Path(output_dir).expanduser()
    manifest_path = Path(manifest_path).expanduser()
    manifest_jsons = read_manifest(manifest_path)
    if not manifest_jsons:
        # nothing to dump; avoid indexing into an empty manifest
        return

    output_dir.mkdir(parents=True, exist_ok=True)

    # ExitStack closes every opened file even if a write fails half-way.
    with ExitStack() as stack:
        file_map = {}
        for line_json in manifest_jsons:
            utt = line_json['utt']
            for k, value in line_json.items():
                if k not in key_whitelist:
                    continue
                if k not in file_map:
                    # open lazily so a key that first appears in a later
                    # record still gets its own output file
                    file_map[k] = stack.enter_context(
                        open(output_dir / filename[k], 'w'))
                file_map[k].write(utt + ' ' + value + '\n')
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="dump manifest to wav.scp text.word ...")
    # Both options are mandatory: dump_manifest passes them straight to Path(),
    # so a missing value would otherwise surface as an opaque TypeError.
    parser.add_argument(
        "--manifest-path", type=str, required=True, help="path to manifest")
    parser.add_argument(
        "--output-dir",
        type=str,
        required=True,
        help="directory to save outputs (wav.scp and text.* files)")
    args = parser.parse_args()
    dump_manifest(args.manifest_path, args.output_dir)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册