未验证 提交 f46e8595 编写于 作者: H Hui Zhang 提交者: GitHub

Merge pull request #781 from yt605155624/fix_mfa

fix_mfa
......@@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
FILES = [
fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
or fn.endswith('unittest.cc'))
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
'unittest.cc'))
]
LIBS = ['stdc++']
......
......@@ -33,4 +33,4 @@
},
"prob": 1.0
}
]
\ No newline at end of file
]
......@@ -20,27 +20,33 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
echo "Prepare THCHS-30 failed. Terminated."
exit 1
fi
fi
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
# copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# dump manifest to data/
python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
fi
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# copy files to data/dict to gen word.lexicon
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
cp ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
# copy phone.lexicon to data/dict
cp ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
fi
# gen word.lexicon
python local/gen_word2phone.py --root-dir=data/dict --output-dir=data/dict
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# gen word.lexicon
python local/gen_word2phone.py --lexicon-files="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2" --output-path=data/dict/word.lexicon
fi
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# reorganize dataset for MFA
if [ ! -d $EXP_DIR/thchs30_corpus ]; then
echo "reorganizing thchs30 corpus..."
python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
echo "reorganization done."
fi
fi
echo "THCHS-30 data preparation done."
......
......@@ -18,6 +18,7 @@ file2: THCHS-30/resource/dict/lexicon.txt
import argparse
from collections import defaultdict
from pathlib import Path
from typing import List
from typing import Union
# key: (cn, ('ee', 'er4')), value: count
......@@ -34,7 +35,7 @@ def is_Chinese(ch):
return False
def proc_line(line):
def proc_line(line: str):
line = line.strip()
if is_Chinese(line[0]):
line_list = line.split()
......@@ -49,20 +50,25 @@ def proc_line(line):
cn_phones_counter[(cn, phones)] += 1
def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
file1 = root_dir / "lm_word_lexicon_1"
file2 = root_dir / "lm_word_lexicon_2"
write_file = output_dir / "word.lexicon"
"""
example lines of output
the first column is a Chinese character
the second is the probability of this pronunciation
and the rest are the phones of this pronunciation
一 0.22 ii i1↩
一 0.45 ii i4↩
一 0.32 ii i2↩
一 0.01 ii i5
"""
def gen_lexicon(lexicon_files: List[Union[str, Path]],
output_path: Union[str, Path]):
for file_path in lexicon_files:
with open(file_path, "r") as f1:
for line in f1:
proc_line(line)
with open(file1, "r") as f1:
for line in f1:
proc_line(line)
with open(file2, "r") as f2:
for line in f2:
proc_line(line)
for key in cn_phones_counter:
cn = key[0]
cn_counter[cn].append((key[1], cn_phones_counter[key]))
......@@ -75,7 +81,8 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
p = round(p, 2)
if p > 0:
cn_counter_p[key].append((item[0], p))
with open(write_file, "w") as wf:
with open(output_path, "w") as wf:
for key in cn_counter_p:
phone_p_list = cn_counter_p[key]
for item in phone_p_list:
......@@ -87,8 +94,21 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
)
# A line of word_lexicon:
# 一丁点 ii i4 d ing1 d ian3
# the first column is the word and the rest are its phones; the number of phones is twice the number of characters in the word
parser.add_argument(
"--lexicon-files",
type=str,
default="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2",
help="lm_word_lexicon files")
parser.add_argument(
"--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
parser.add_argument("--output-dir", type=str, help="path to save outputs")
"--output-path",
type=str,
default="data/dict/word.lexicon",
help="path to save output word2phone lexicon")
args = parser.parse_args()
gen_lexicon(args.root_dir, args.output_dir)
lexicon_files = args.lexicon_files.split(" ")
output_path = Path(args.output_path).expanduser()
gen_lexicon(lexicon_files, output_path)
......@@ -58,8 +58,6 @@ def write_lab(root_dir: Union[str, Path],
def reorganize_thchs30(root_dir: Union[str, Path],
output_dir: Union[str, Path]=None,
script_type='phone'):
root_dir = Path(root_dir).expanduser()
output_dir = Path(output_dir).expanduser()
output_dir.mkdir(parents=True, exist_ok=True)
link_wav(root_dir, output_dir)
write_lab(root_dir, output_dir, script_type)
......@@ -72,12 +70,15 @@ if __name__ == "__main__":
parser.add_argument(
"--output-dir",
type=str,
help="path to save outputs(audio and transcriptions)")
help="path to save outputs (audio and transcriptions)")
parser.add_argument(
"--script-type",
type=str,
default="phone",
help="type of lab ('word'/'syllable'/'phone')")
args = parser.parse_args()
reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
root_dir = Path(args.root_dir).expanduser()
output_dir = Path(args.output_dir).expanduser()
reorganize_thchs30(root_dir, output_dir, args.script_type)
......@@ -14,14 +14,17 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
# gen lexicon relink gen dump
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# prepare data
bash ./local/data.sh $LEXICON_NAME|| exit -1
echo "Start prepare thchs30 data for MFA ..."
bash ./local/data.sh $LEXICON_NAME || exit -1
fi
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training..."
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# run MFA
if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
echo "Start MFA training ..."
mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
echo "MFA training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
fi
fi
......
......@@ -4,7 +4,7 @@
test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git
pushd Montreal-Forced-Aligner && git checkout v2.0.0a7 && python setup.py install
pushd Montreal-Forced-Aligner && python setup.py install && popd
test -d kaldi || { echo "need install kaldi first"; exit 1;}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册