Merge pull request #781 from yt605155624/fix_mfa

fix_mfa

Merge pull request #781 from yt605155624/fix_mfa
fix_mfa
f46e8595 · Hui Zhang · GitHub · 9ac6d65a · 2c75c923 · f46e8595
7 changed files
--- a/deepspeech/decoders/swig/setup.py
+++ b/deepspeech/decoders/swig/setup.py
@@ -84,8 +84,9 @@ FILES = glob.glob('kenlm/util/*.cc') \
 FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')

 FILES = [
-    fn for fn in FILES if not (fn.endswith('main.cc') or fn.endswith('test.cc')
-                               or fn.endswith('unittest.cc'))
+    fn for fn in FILES
+    if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
+        'unittest.cc'))
 ]

 LIBS = ['stdc++']

--- a/examples/aishell/s0/conf/augmentation.json
+++ b/examples/aishell/s0/conf/augmentation.json
@@ -33,4 +33,4 @@
    },
    "prob": 1.0
  }
-]
\ No newline at end of file
+]
--- a/examples/thchs30/a0/local/data.sh
+++ b/examples/thchs30/a0/local/data.sh
@@ -20,27 +20,33 @@ if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
        echo "Prepare THCHS-30 failed. Terminated."
        exit 1
    fi
-    
 fi

-# dump manifest to data/
-python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
-
-# copy files to data/dict to gen word.lexicon
-cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
-cp  ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # dump manifest to data/
+    python3 ${MAIN_ROOT}/utils/dump_manifest.py --manifest-path=data/manifest.train --output-dir=data
+fi

-# copy phone.lexicon to data/dict
-cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # copy files to data/dict to gen word.lexicon
+    cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_word/lexicon.txt data/dict/lm_word_lexicon_1
+    cp  ${TARGET_DIR}/thchs30/resource/dict/lexicon.txt data/dict/lm_word_lexicon_2
+    # copy phone.lexicon to data/dict
+    cp  ${TARGET_DIR}/thchs30/data_thchs30/lm_phone/lexicon.txt data/dict/phone.lexicon
+fi

-# gen word.lexicon
-python local/gen_word2phone.py  --root-dir=data/dict --output-dir=data/dict
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # gen word.lexicon
+    python local/gen_word2phone.py  --lexicon-files="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2" --output-path=data/dict/word.lexicon
+fi

-# reorganize dataset for MFA
-if [ ! -d $EXP_DIR/thchs30_corpus ]; then
-    echo "reorganizing thchs30 corpus..."
-    python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
-    echo "reorganization done."
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    # reorganize dataset for MFA
+    if [ ! -d $EXP_DIR/thchs30_corpus ]; then
+        echo "reorganizing thchs30 corpus..."
+        python local/reorganize_thchs30.py --root-dir=data --output-dir=data/thchs30_corpus --script-type=$LEXICON_NAME
+        echo "reorganization done."
+    fi
 fi

 echo "THCHS-30  data preparation done."

--- a/examples/thchs30/a0/local/gen_word2phone.py
+++ b/examples/thchs30/a0/local/gen_word2phone.py
@@ -18,6 +18,7 @@ file2: THCHS-30/resource/dict/lexicon.txt
 import argparse
 from collections import defaultdict
 from pathlib import Path
+from typing import List
 from typing import Union

 # key: (cn, ('ee', 'er4'))，value: count
@@ -34,7 +35,7 @@ def is_Chinese(ch):
    return False


-def proc_line(line):
+def proc_line(line: str):
    line = line.strip()
    if is_Chinese(line[0]):
        line_list = line.split()
@@ -49,20 +50,25 @@ def proc_line(line):
                cn_phones_counter[(cn, phones)] += 1


-def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
-    root_dir = Path(root_dir).expanduser()
-    output_dir = Path(output_dir).expanduser()
-    output_dir.mkdir(parents=True, exist_ok=True)
-    file1 = root_dir / "lm_word_lexicon_1"
-    file2 = root_dir / "lm_word_lexicon_2"
-    write_file = output_dir / "word.lexicon"
+"""
+example lines of output
+the first column is a Chinese character
+the second is the probability of this pronunciation
+and the rest are the phones of this pronunciation
+一 0.22 ii i1↩
+一 0.45 ii i4↩
+一 0.32 ii i2↩
+一 0.01 ii i5
+"""
+
+
+def gen_lexicon(lexicon_files: List[Union[str, Path]],
+                output_path: Union[str, Path]):
+    for file_path in lexicon_files:
+        with open(file_path, "r") as f1:
+            for line in f1:
+                proc_line(line)

-    with open(file1, "r") as f1:
-        for line in f1:
-            proc_line(line)
-    with open(file2, "r") as f2:
-        for line in f2:
-            proc_line(line)
    for key in cn_phones_counter:
        cn = key[0]
        cn_counter[cn].append((key[1], cn_phones_counter[key]))
@@ -75,7 +81,8 @@ def gen_lexicon(root_dir: Union[str, Path], output_dir: Union[str, Path]):
            p = round(p, 2)
            if p > 0:
                cn_counter_p[key].append((item[0], p))
-    with open(write_file, "w") as wf:
+
+    with open(output_path, "w") as wf:
        for key in cn_counter_p:
            phone_p_list = cn_counter_p[key]
            for item in phone_p_list:
@@ -87,8 +94,21 @@ if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Gen Chinese characters to phone lexicon for THCHS-30 dataset"
    )
+    # A line of word_lexicon:
+    # 一丁点 ii i4 d ing1 d ian3
+    # the first field is the word and the rest are its phones; the number of phones is twice the word's length
+    parser.add_argument(
+        "--lexicon-files",
+        type=str,
+        default="data/dict/lm_word_lexicon_1 data/dict/lm_word_lexicon_2",
+        help="lm_word_lexicon files")
    parser.add_argument(
-        "--root-dir", type=str, help="dir to thchs30 lm_word_lexicons")
-    parser.add_argument("--output-dir", type=str, help="path to save outputs")
+        "--output-path",
+        type=str,
+        default="data/dict/word.lexicon",
+        help="path to save output word2phone lexicon")
    args = parser.parse_args()
-    gen_lexicon(args.root_dir, args.output_dir)
+    lexicon_files = args.lexicon_files.split(" ")
+    output_path = Path(args.output_path).expanduser()
+
+    gen_lexicon(lexicon_files, output_path)
--- a/examples/thchs30/a0/local/reorganize_thchs30.py
+++ b/examples/thchs30/a0/local/reorganize_thchs30.py
@@ -58,8 +58,6 @@ def write_lab(root_dir: Union[str, Path],
 def reorganize_thchs30(root_dir: Union[str, Path],
                       output_dir: Union[str, Path]=None,
                       script_type='phone'):
-    root_dir = Path(root_dir).expanduser()
-    output_dir = Path(output_dir).expanduser()
    output_dir.mkdir(parents=True, exist_ok=True)
    link_wav(root_dir, output_dir)
    write_lab(root_dir, output_dir, script_type)
@@ -72,12 +70,15 @@ if __name__ == "__main__":
    parser.add_argument(
        "--output-dir",
        type=str,
-        help="path to save outputs(audio and transcriptions)")
+        help="path to save outputs (audio and transcriptions)")

    parser.add_argument(
        "--script-type",
        type=str,
        default="phone",
        help="type of lab ('word'/'syllable'/'phone')")
+
    args = parser.parse_args()
-    reorganize_thchs30(args.root_dir, args.output_dir, args.script_type)
+    root_dir = Path(args.root_dir).expanduser()
+    output_dir = Path(args.output_dir).expanduser()
+    reorganize_thchs30(root_dir, output_dir, args.script_type)
--- a/examples/thchs30/a0/run.sh
+++ b/examples/thchs30/a0/run.sh
@@ -14,14 +14,17 @@ source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
 # gen lexicon relink gen dump
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # prepare data
-    bash ./local/data.sh $LEXICON_NAME|| exit -1
+    echo "Start prepare thchs30 data for MFA ..."
+    bash ./local/data.sh $LEXICON_NAME || exit -1
 fi

-# run MFA
-if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
-    echo "Start MFA training..."
-    mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
-    echo "training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    # run MFA
+    if [ ! -d "$EXP_DIR/thchs30_alignment" ]; then
+        echo "Start MFA training ..."
+        mfa_train_and_align data/thchs30_corpus data/dict/$LEXICON_NAME.lexicon $EXP_DIR/thchs30_alignment -o $EXP_DIR/thchs30_model --clean --verbose --temp_directory exp/.mfa_train_and_align --num_jobs $NUM_JOBS
+        echo "MFA training done! \nresults: $EXP_DIR/thchs30_alignment \nmodel: $EXP_DIR/thchs30_model\n"
+    fi
 fi



--- a/tools/extras/install_mfa.sh
+++ b/tools/extras/install_mfa.sh
@@ -4,7 +4,7 @@

 test -d Montreal-Forced-Aligner || git clone https://github.com/MontrealCorpusTools/Montreal-Forced-Aligner.git

-pushd Montreal-Forced-Aligner && git checkout v2.0.0a7 &&  python setup.py install
+pushd Montreal-Forced-Aligner &&  python setup.py install && popd

 test -d kaldi || { echo "need install kaldi first"; exit 1;}