From 0ede6c2ee747100552a29cbcd9ef8ca72427527c Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 18 Apr 2022 12:47:09 +0000
Subject: [PATCH] train lm

---
 .../other/ngram_lm/s0/local/download_lm_zh.sh |  5 ++++
 .../ngram/zh/local/aishell_train_lms.sh       |  8 +++---
 .../ngram/zh/local/text_to_lexicon.py         | 12 ++++++---
 speechx/examples/ngram/zh/run.sh              | 22 +++++++++-------
 utils/fst/prepare_dict.py                     | 26 ++++++++++++++++---
 5 files changed, 52 insertions(+), 21 deletions(-)

diff --git a/examples/other/ngram_lm/s0/local/download_lm_zh.sh b/examples/other/ngram_lm/s0/local/download_lm_zh.sh
index f9e2261f..050749ce 100755
--- a/examples/other/ngram_lm/s0/local/download_lm_zh.sh
+++ b/examples/other/ngram_lm/s0/local/download_lm_zh.sh
@@ -10,6 +10,11 @@
 MD5="29e02312deb2e59b3c8686c7966d4fe3"
 TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
 
+if [ -e $TARGET ];then
+    echo "already have lm"
+    exit 0;
+fi
+
 echo "Download language model ..."
 download $URL $MD5 $TARGET
 if [ $? -ne 0 ]; then

diff --git a/speechx/examples/ngram/zh/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
index e3cee438..76266151 100755
--- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -29,12 +29,13 @@ mkdir -p $dir
 cleantext=$dir/text.no_oov
 
 # map oov words to <SPOKEN_NOISE>
-# line: utt word0 ... wordn -> line: word0 ... wordn
+# lexicon line: word char0 ... charn
+# text line: utt word0 ... wordn -> line: word0 ... wordn
 cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
   {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
   > $cleantext || exit 1;
 
-# compute word counts
+# compute word counts, sorted in descending order
 # line: count word
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
   sort -nr > $dir/word.counts || exit 1;

 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
   cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
   sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
 
 cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
 
 # hold out to compute ppl
-heldout_sent=10000 # Don't change this if you want result to be comparable with
-    # kaldi_lm results
+heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
 mkdir -p $dir
 cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | head -$heldout_sent > $dir/heldout

diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh
--- a/speechx/examples/ngram/zh/run.sh
+++ b/speechx/examples/ngram/zh/run.sh
-  if [ -f $lexicon ];then
-    # filter by vocab
-    # line: word ph0 ... phn -> line: word char0 ... charn
-    utils/fst/prepare_dict.py \
-        --unit_file $unit \
-        --in_lexicon ${lexicon} \
-        --out_lexicon data/local/dict/lexicon.txt
-  else
-    local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt
-  fi
+
+  if [ ! -f $lexicon ];then
+    local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+    echo "Generate $lexicon from $text"
+  fi
+
+  # filter by vocab
+  # line: word ph0 ... phn -> line: word char0 ... charn
+  utils/fst/prepare_dict.py \
+      --unit_file $unit \
+      --in_lexicon ${lexicon} \
+      --out_lexicon data/local/dict/lexicon.txt
 fi
 
 lm=data/local/lm

diff --git a/utils/fst/prepare_dict.py b/utils/fst/prepare_dict.py
index f59cd311..301d72fb 100755
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@@ -3,7 +3,8 @@
 import argparse
 
 def main(args):
-    # load `unit` or `vocab` file
+    # load vocab file
+    # line: token
     unit_table = set()
     with open(args.unit_file, 'r') as fin:
         for line in fin:
@@ -11,27 +12,41 @@
             unit_table.add(unit)
 
     def contain_oov(units):
+        """Check whether any token is outside the unit table (OOV).
+
+        Args:
+            units (str or list[str]): token sequence
+
+        Returns:
+            bool: True if some token is not in the vocab, else False.
+        """
         for unit in units:
             if unit not in unit_table:
                 return True
         return False
 
-    # load spm model
+    # load spm model, for English
     bpemode = args.bpemodel
     if bpemode:
         import sentencepiece as spm
         sp = spm.SentencePieceProcessor()
         sp.Load(args.bpemodel)
 
-    # used to filter polyphone
+    # used to filter polyphones and invalid words
     lexicon_table = set()
+    in_n = 0  # input lexicon word count
+    out_n = 0  # output lexicon word count
     with open(args.in_lexicon, 'r') as fin, \
             open(args.out_lexicon, 'w') as fout:
         for line in fin:
             word = line.split()[0]
-            if word == 'SIL':
+            in_n += 1
+
+            if word == 'SIL' and not bpemode:  # `SIL` might be a valid piece in bpemodel
+                # filter 'SIL' for Mandarin, keep it for English
                 continue
             elif word == '<SPOKEN_NOISE>':
+                # filter <SPOKEN_NOISE>
                 continue
             else:
                 # each word only has one pronunciation for e2e system
@@ -39,12 +54,14 @@
                 continue
 
             if bpemode:
+                # for English
                 pieces = sp.EncodeAsPieces(word)
                 if contain_oov(pieces):
                     print('Ignoring words {}, which contains oov unit'.
                           format(''.join(word).strip('▁')))
                     continue
 
+                # word is a piece list; any word with an OOV piece was filtered out above by `contain_oov(pieces)`
                 chars = ' '.join(
                     [p if p in unit_table else '<unk>' for p in pieces])
             else:
@@ -58,11 +75,14 @@
                 # we assume the model unit of our e2e system is char now.
                 if word.encode('utf8').isalpha() and '▁' in unit_table:
                     word = '▁' + word
-                chars = ' '.join(word)
+                chars = ' '.join(word)  # word is a char list
 
             fout.write('{} {}\n'.format(word, chars))
             lexicon_table.add(word)
+            out_n += 1
+
+    print(f"Filter lexicon by unit table: filtered out {in_n - out_n}, kept {out_n}/{in_n}")
 
 
 if __name__ == '__main__':
     parser = argparse.ArgumentParser(
--
GitLab
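Note for readers: the awk one-liner in aishell_train_lms.sh drops the utterance id from each transcript line and replaces every word that is missing from the lexicon with <SPOKEN_NOISE>. A rough Python equivalent of that step, as a sketch only; the paths in the usage comment are illustrative:

# Python sketch of the OOV-mapping step in aishell_train_lms.sh.
# text line: "utt word0 ... wordn"; lexicon line: "word char0 ... charn".

def map_oov(text_path, lexicon_path, out_path, noise='<SPOKEN_NOISE>'):
    with open(lexicon_path) as f:
        vocab = {line.split()[0] for line in f if line.strip()}
    with open(text_path) as fin, open(out_path, 'w') as fout:
        for line in fin:
            words = line.split()[1:]  # drop the utterance id
            fout.write(' '.join(w if w in vocab else noise for w in words) + '\n')

# Example (placeholder paths):
# map_oov('data/local/lm/text', 'data/local/dict/lexicon.txt', 'text.no_oov')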
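The non-BPE (Mandarin) path of utils/fst/prepare_dict.py reduces to: keep one pronunciation per word, drop SIL and <SPOKEN_NOISE>, split the word into chars, and skip words containing a char outside the unit table. A minimal sketch of that logic, not the patched script itself; the function name is illustrative:

# Sketch of the non-BPE lexicon filtering in utils/fst/prepare_dict.py.
# in_lexicon line: "word ph0 ... phn" -> out_lexicon line: "word char0 ... charn"

def filter_lexicon(in_lexicon, out_lexicon, unit_table):
    in_n = out_n = 0
    seen = set()  # keep only the first pronunciation of each word
    with open(in_lexicon) as fin, open(out_lexicon, 'w') as fout:
        for line in fin:
            word = line.split()[0]
            in_n += 1
            if word in ('SIL', '<SPOKEN_NOISE>') or word in seen:
                continue
            chars = list(word)  # the e2e model unit is assumed to be char
            if any(c not in unit_table for c in chars):
                continue  # word contains an OOV unit
            fout.write('{} {}\n'.format(word, ' '.join(chars)))
            seen.add(word)
            out_n += 1
    print(f'kept {out_n}/{in_n} lexicon entries')

For example, with 中 and 国 both in the unit table, the line "中国 zh ong1 g uo2" becomes "中国 中 国"; a word containing a character outside the table is dropped.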
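On the English/BPE path, each word is instead segmented with SentencePiece and skipped when any piece falls outside the unit table, which is also why SIL is kept when a bpemodel is given. A sketch assuming a trained model at the placeholder path bpe.model:

import sentencepiece as spm

# English/BPE path: segment a word into pieces; None means "skip this word".
# 'bpe.model' is a placeholder path to a trained SentencePiece model.
sp = spm.SentencePieceProcessor()
sp.Load('bpe.model')

def word_to_pieces(word, unit_table):
    pieces = sp.EncodeAsPieces(word)
    if any(p not in unit_table for p in pieces):
        return None  # contains an OOV piece, as checked by contain_oov()
    return ' '.join(pieces)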