From c492a42f140ee919b418f3958a1a3213613c12c5 Mon Sep 17 00:00:00 2001
From: Hui Zhang <zhtclz@foxmail.com>
Date: Sat, 16 Apr 2022 14:52:34 +0000
Subject: [PATCH] add build tlg wfst

---
 speechx/examples/README.md                    |  12 +-
 .../ngram/zh/local/text_to_lexicon.py         |  37 ++++
 speechx/examples/ngram/zh/run.sh              |  14 +-
 speechx/examples/wfst/.gitignore              |   1 +
 speechx/examples/wfst/README.md               | 168 ++++++++++++++++++
 speechx/examples/wfst/path.sh                 |   2 +-
 speechx/examples/wfst/run.sh                  |  47 +----
 speechx/examples/wfst/utils                   |   1 +
 8 files changed, 234 insertions(+), 48 deletions(-)
 create mode 100755 speechx/examples/ngram/zh/local/text_to_lexicon.py
 create mode 100644 speechx/examples/wfst/.gitignore
 mode change 100644 => 100755 speechx/examples/wfst/run.sh
 create mode 120000 speechx/examples/wfst/utils

diff --git a/speechx/examples/README.md b/speechx/examples/README.md
index c3de0d3a..50f5f902 100644
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -22,12 +22,16 @@ pip install netron
 netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel  --port 8022 --host 10.21.55.20
 ```
 
+## For Developer  
+
+> Warning: Only for developer, make sure you know what's it.
+
+* dev - for speechx developer, using for test.
+
 ## Build WFST  
 
+> Warning: Using below example when you know what's it.
+
 * text_lm - process text for build lm
 * ngram - using to build NGram ARPA lm.
 * wfst - build wfst for TLG.
-
-## For Developer  
-
-* dev - for speechx developer, using for test.
diff --git a/speechx/examples/ngram/zh/local/text_to_lexicon.py b/speechx/examples/ngram/zh/local/text_to_lexicon.py
new file mode 100755
index 00000000..4d6b016d
--- /dev/null
+++ b/speechx/examples/ngram/zh/local/text_to_lexicon.py
@@ -0,0 +1,37 @@
+#!/usr/bin/env python3
+import argparse
+
+def main(args):
+    """Write one 'word<TAB>c1 c2 ...' lexicon line per word found in args.text."""
+    with open(args.text, 'r', encoding='utf-8') as fin, \
+            open(args.lexicon, 'w', encoding='utf-8') as fout:
+        for line in fin:
+            words = line.split()
+            if args.has_key:
+                # Drop the utterance id. Slicing (instead of unpacking a
+                # 2-way split) keeps blank or key-only lines from raising.
+                words = words[1:]
+            # The pronunciation of a word is simply its characters.
+            for word in words:
+                val = " ".join(word)
+                fout.write(f"{word}\t{val}\n")
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='text(line:utt1 中国 人) to lexicon（line:中国 中 国).')
+    parser.add_argument(
+        '--has_key',
+        # Parse the value explicitly: a bare string such as "false" is truthy.
+        default=True, type=lambda v: v.lower() in ('1', 'true', 'yes', 'y'),
+        help='whether each line of text starts with an utterance id (true/false)')
+    parser.add_argument(
+        '--text',
+        required=True,
+        help='text path. line: utt1 中国 人 or 中国 人')
+    parser.add_argument(
+        '--lexicon',
+        required=True,
+        help='lexicon path. line:中国 中 国')
+    args = parser.parse_args()
+    print(args)
+    main(args)
diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh
index eda422b3..347dfa2d 100755
--- a/speechx/examples/ngram/zh/run.sh
+++ b/speechx/examples/ngram/zh/run.sh
@@ -42,11 +42,15 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     # line: char/spm_pices
     cp $unit data/local/dict/units.txt
 
-    # line: word ph0 ... phn -> line: word char0 ... charn
-    utils/fst/prepare_dict.py \
-        --unit_file $unit \
-        --in_lexicon ${lexicon} \
-        --out_lexicon data/local/dict/lexicon.txt
+    if [ -f "${lexicon}" ]; then  # quoted: an empty value must not collapse to a true `[ -f ]`
+        # line: word ph0 ... phn -> line: word char0 ... charn
+        utils/fst/prepare_dict.py \
+            --unit_file "${unit}" \
+            --in_lexicon "${lexicon}" \
+            --out_lexicon data/local/dict/lexicon.txt
+    else  # no lexicon file given: derive a char-level lexicon from the text
+        local/text_to_lexicon.py --has_key true --text "${text}" --lexicon data/local/dict/lexicon.txt
+    fi
 fi
 
 lm=data/local/lm
diff --git a/speechx/examples/wfst/.gitignore b/speechx/examples/wfst/.gitignore
new file mode 100644
index 00000000..1269488f
--- /dev/null
+++ b/speechx/examples/wfst/.gitignore
@@ -0,0 +1 @@
+data
diff --git a/speechx/examples/wfst/README.md b/speechx/examples/wfst/README.md
index 4f862a25..4f4674a4 100644
--- a/speechx/examples/wfst/README.md
+++ b/speechx/examples/wfst/README.md
@@ -1,3 +1,146 @@
+# Built TLG wfst
+
+## Input
+```
+data/local/
+├── dict
+│   ├── lexicon.txt
+│   └── units.txt
+└── lm
+    ├── heldout
+    ├── lm.arpa
+    ├── text
+    ├── text.no_oov
+    ├── train
+    ├── unigram.counts
+    ├── word.counts
+    └── wordlist
+```
+
+```
+==> data/local/dict/lexicon.txt <==
+啊 啊
+啊啊啊 啊 啊 啊
+阿 阿
+阿尔 阿 尔
+阿根廷 阿 根 廷
+阿九 阿 九
+阿克 阿 克
+阿拉伯数字 阿 拉 伯 数 字
+阿拉法特 阿 拉 法 特
+阿拉木图 阿 拉 木 图
+
+==> data/local/dict/units.txt <==
+<blank>
+<unk>
+A
+B
+C
+D
+E
+F
+G
+H
+
+==> data/local/lm/heldout <==
+而 对 楼市 成交 抑制 作用 最 大 的 限 购
+也 成为 地方 政府 的 眼中 钉
+自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+各地 政府 便 纷纷 跟进
+仅 一 个 多 月 的 时间 里
+除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+四十六 个 限 购 城市 当中
+四十一 个 已 正式 取消 或 变相 放松 了 限 购
+财政 金融 政策 紧随 其后 而来
+显示 出 了 极 强 的 威力
+
+==> data/local/lm/lm.arpa <==
+
+\data\
+ngram 1=129356
+ngram 2=504661
+ngram 3=123455
+
+\1-grams:
+-1.531278       </s>
+-3.828829       <SPOKEN_NOISE>  -0.1600094
+-6.157292       <UNK>
+
+==> data/local/lm/text <==
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+BAC009S0002W0131 显示 出 了 极 强 的 威力
+
+==> data/local/lm/text.no_oov <==
+<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购 
+<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉 
+<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后 
+<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进 
+<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里 
+<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外 
+<SPOKEN_NOISE> 四十六 个 限 购 城市 当中 
+<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购 
+<SPOKEN_NOISE> 财政 ���融 政策 紧随 其后 而来 
+<SPOKEN_NOISE> 显示 出 了 极 强 的 威力 
+
+==> data/local/lm/train <==
+汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
+并 计划 朝云 计算 方面 发展
+汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
+媒体 就 曾 披露 这笔 交易
+虽然 双方 已经 正式 签署 了 外包 协议
+但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
+陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
+并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
+曾 在 多家 国际 公司 任职
+拥有 业务 开发 商务 及 企业 治理
+
+==> data/local/lm/unigram.counts <==
+  57487 的
+  13099 在
+  11862 一
+  11397 了
+  10998 不
+   9913 是
+   7952 有
+   6250 和
+   6152 个
+   5422 将
+
+==> data/local/lm/word.counts <==
+  57486 的
+  13098 在
+  11861 一
+  11396 了
+  10997 不
+   9912 是
+   7951 有
+   6249 和
+   6151 个
+   5421 将
+
+==> data/local/lm/wordlist <==
+的
+在
+一
+了
+不
+是
+有
+和
+个
+将
+```
+
+## Output
+
 ```
 fstaddselfloops 'echo 4234 |' 'echo 123660 |' 
 Lexicon and Token FSTs compiling succeeded
@@ -16,3 +159,28 @@ fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
 Composing decoding graph TLG.fst succeeded
 Aishell build TLG done.
 ```
+
+```
+data/
+├── lang_test
+│   ├── G.fst
+│   ├── L.fst
+│   ├── LG.fst
+│   ├── T.fst
+│   ├── TLG.fst
+│   ├── tokens.txt
+│   ├── units.txt
+│   └── words.txt
+└── local
+    ├── lang
+    │   ├── L.fst
+    │   ├── T.fst
+    │   ├── tokens.txt
+    │   ├── units.txt
+    │   └── words.txt
+    └── tmp
+        ├── disambig.list
+        ├── lexiconp_disambig.txt
+        ├── lexiconp.txt
+        └── units.list
+```
\ No newline at end of file
diff --git a/speechx/examples/wfst/path.sh b/speechx/examples/wfst/path.sh
index 877f2399..a07c1297 100644
--- a/speechx/examples/wfst/path.sh
+++ b/speechx/examples/wfst/path.sh
@@ -1,6 +1,6 @@
 # This contains the locations of binarys build required for running the examples.
 
-MAIN_ROOT=`realpath $PWD/../../../../`
+MAIN_ROOT=`realpath $PWD/../../../`
 SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
 
 export LC_AL=C
diff --git a/speechx/examples/wfst/run.sh b/speechx/examples/wfst/run.sh
old mode 100644
new mode 100755
index b53e1a5b..1354646a
--- a/speechx/examples/wfst/run.sh
+++ b/speechx/examples/wfst/run.sh
@@ -5,54 +5,25 @@ set -eo pipefail
 
 stage=-1
 stop_stage=100
-corpus=aishell
-lmtype=srilm
 
-lexicon=  # aishell/resource_aishell/lexicon.txt
-text=     # aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
+. utils/parse_options.sh
 
-source parse_options.sh
-
-if [ ! which fstprint ]; then
+if ! command -v fstprint >/dev/null 2>&1; then
     pushd $MAIN_ROOT/tools
     make kaldi.done
     popd
 fi
 
-if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
-    # 7.1 Prepare dict
-    unit_file=data/vocab.txt
-    mkdir -p data/local/dict
-    cp $unit_file data/local/dict/units.txt
-    utils/fst/prepare_dict.py \
-        --unit_file $unit_file \
-        --in_lexicon ${lexicon} \
-        --out_lexicon data/local/dict/lexicon.txt
-fi
-
-if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    # 7.2 Train lm
-    lm=data/local/lm
-    mkdir -p data/train
-    mkdir -p $lm
-    utils/manifest_key_value.py \
-        --manifest_path data/manifest.train \
-        --output_path data/train
-    utils/filter_scp.pl data/train/text \
-        $text > $lm/text
-    if [ $lmtype == 'srilm' ];then
-        local/aishell_train_lms.sh
-    else
-        utils/ngram_train.sh --order 3 $lm/text $lm/lm.arpa
-    fi
-fi
-
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then 
-    # 7.3 Build decoding TLG
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then 
+    # build T & L
+    # utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
     utils/fst/compile_lexicon_token_fst.sh \
         data/local/dict data/local/tmp data/local/lang
+
+    # build G & LG & TLG
+    # utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
     utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
 fi
 
-echo "Aishell build TLG done."
+echo "build TLG done."
 exit 0
diff --git a/speechx/examples/wfst/utils b/speechx/examples/wfst/utils
new file mode 120000
index 00000000..256f914a
--- /dev/null
+++ b/speechx/examples/wfst/utils
@@ -0,0 +1 @@
+../../../utils/
\ No newline at end of file
-- 
GitLab