train lm

0ede6c2e · Hui Zhang · c492a42f · 0ede6c2e · 0ede6c2e · 0ede6c2e
5 changed files
--- a/examples/other/ngram_lm/s0/local/download_lm_zh.sh
+++ b/examples/other/ngram_lm/s0/local/download_lm_zh.sh
@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
 TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
+if [ -e $TARGET ];then
+    echo "already have lm"
+    exit 0;
+fi
 echo "Download language model ..."
 download $URL $MD5 $TARGET
 if [ $? -ne 0 ]; then

--- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -29,12 +29,13 @@ mkdir -p $dir
 cleantext=$dir/text.no_oov
 # oov to <SPOKEN_NOISE>
-# line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
+# lexicon line: word char0 ... charn
+# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
 cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
  {for(n=1; n<=NF;n++) {  if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
  > $cleantext || exit 1;
-# compute word counts
+# compute word counts, sort in descending order
 # line: count word
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
   sort -nr > $dir/word.counts || exit 1;
@@ -50,8 +51,7 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
 cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
 # hold out to compute ppl
-heldout_sent=10000 # Don't change this if you want result to be comparable with
+heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
-    # kaldi_lm results
 mkdir -p $dir
 cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \

--- a/speechx/examples/ngram/zh/local/text_to_lexicon.py
+++ b/speechx/examples/ngram/zh/local/text_to_lexicon.py
 #!/usr/bin/env python3
 import argparse
+from collections import Counter
 def main(args):
+    counter = Counter()
    with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
        for line in fin:
            line = line.strip()
@@ -11,10 +13,12 @@ def main(args):
            else:
                words = line.split()
-            for word in words:
+            counter.update(words)
-                val = " ".join(list(word))
-                fout.write(f"{word}\t{val}\n")
+        for word in counter:
-                fout.flush()
+            val = " ".join(list(word))
+            fout.write(f"{word}\t{val}\n")
+            fout.flush()
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(

--- a/speechx/examples/ngram/zh/run.sh
+++ b/speechx/examples/ngram/zh/run.sh
@@ -3,11 +3,11 @@ set -eo pipefail
 . path.sh
-stage=0
+stage=-1
 stop_stage=100
 corpus=aishell
-unit=data/vocab.txt       # line: char/spm_pice, vocab file
+unit=data/vocab.txt       # vocab file, line: char/spm_piece
 lexicon=data/lexicon.txt  # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
 text=data/text            # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
@@ -42,15 +42,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    # line: char/spm_pieces
    cp $unit data/local/dict/units.txt
-    if [ -f $lexicon ];then
+    if [ ! -f $lexicon ];then
-        # line: word ph0 ... phn -> line: word char0 ... charn
+        local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
-        utils/fst/prepare_dict.py \
+        echo "Generate $lexicon from $text"
-            --unit_file $unit \
-            --in_lexicon ${lexicon} \
-            --out_lexicon data/local/dict/lexicon.txt
-    else
-        local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt
    fi
+    # filter by vocab
+    # line: word ph0 ... phn -> line: word char0 ... charn
+    utils/fst/prepare_dict.py \
+        --unit_file $unit \
+        --in_lexicon ${lexicon} \
+        --out_lexicon data/local/dict/lexicon.txt
 fi
 lm=data/local/lm

--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@@ -3,7 +3,8 @@ import argparse
 def main(args):
-    # load `unit` or `vocab` file
+    # load vocab file
+    # line: token
    unit_table = set()
    with open(args.unit_file, 'r') as fin:
        for line in fin:
@@ -11,27 +12,41 @@ def main(args):
            unit_table.add(unit)
    def contain_oov(units):
+        """token not in vocab
+        Args:
+            units (iterable of str): tokens to look up in the vocab (`unit_table`)
+        Returns:
+            bool: True if at least one token is not in the vocab, else False.
+        """
        for unit in units:
            if unit not in unit_table:
                return True
        return False
-    # load spm model
+    # load spm model, for English
    bpemode = args.bpemodel
    if bpemode:
        import sentencepiece as spm
        sp = spm.SentencePieceProcessor()
        sp.Load(sys.bpemodel)
-    # used to filter polyphone
+    # used to filter polyphone and invalid word
    lexicon_table = set()
+    in_n = 0  # in lexicon word count
+    out_n = 0 # out lexicon word count
    with open(args.in_lexicon, 'r') as fin, \
            open(args.out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
+            in_n += 1
            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
+                # filter 'SIL' for mandarin, keep it in English
                continue
            elif word == '<SPOKEN_NOISE>':
+                # filter <SPOKEN_NOISE>
                continue
            else:
                # each word only has one pronunciation for e2e system
@@ -39,12 +54,14 @@ def main(args):
                    continue
                if bpemode:
+                    # for english
                    pieces = sp.EncodeAsPieces(word)
                    if contain_oov(pieces):
                        print('Ignoring words {}, which contains oov unit'.
                              format(''.join(word).strip('▁')))
                        continue
+                    # every piece is in the vocab here: words with an OOV piece were skipped by `contain_oov(pieces)` above, so no <unk> is emitted
                    chars = ' '.join(
                        [p if p in unit_table else '<unk>' for p in pieces])
                else:
@@ -58,11 +75,14 @@ def main(args):
                    # we assume the model unit of our e2e system is char now.
                    if word.encode('utf8').isalpha() and '▁' in unit_table:
                        word = '▁' + word
                    chars = ' '.join(word)  # word is a char list
                fout.write('{} {}\n'.format(word, chars))
                lexicon_table.add(word)
+                out_n += 1
+    print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
 if __name__ == '__main__':
    parser = argparse.ArgumentParser(