text process for lm

a054d1c4 · Hui Zhang · eb52896c · a054d1c4 · a054d1c4 · a054d1c4
12 changed files
--- a/examples/other/ngram_lm/s0/local/build_zh_lm.sh
+++ b/examples/other/ngram_lm/s0/local/build_zh_lm.sh
@@ -27,7 +27,7 @@ arpa=$3
 if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # text tn & wordseg preprocess
    echo "process text."
-    python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn
+    python3 ${MAIN_ROOT}/utils/zh_tn.py --token_type ${type} ${text} ${text}.${type}.tn
 fi
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then

--- a/setup.py
+++ b/setup.py
@@ -64,6 +64,7 @@ base = [
    "webrtcvad",
    "yacs~=0.1.8",
    "prettytable",
+    "zhon",
 ]
 server = [
@@ -90,7 +91,6 @@ requirements = {
        "unidecode",
        "yq",
        "pre-commit",
-        "zhon",
    ]
 }

--- a/speechx/examples/ds2_ol/aishell/README.md
+++ b/speechx/examples/ds2_ol/aishell/README.md
@@ -24,4 +24,4 @@ LM: aishell train
 Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819
 Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818
 Other -> 0.00 % N=0 C=0 S=0 D=0 I=1
 ```
\ No newline at end of file
--- a/speechx/examples/ngram/zh/README.md
+++ b/speechx/examples/ngram/zh/README.md
@@ -98,4 +98,4 @@ Ignoring words 瑗, which contains oov unit
 file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
 0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
 build LM done.
 ```
\ No newline at end of file
--- a/speechx/examples/text_lm/README.md
+++ b/speechx/examples/text_lm/README.md
+# Text Preprocessing for Building an N-gram LM
+The output `text` file looks like this:
+```
+BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
+BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
+BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
+BAC009S0002W0125 各地 政府 便 纷纷 跟进
+BAC009S0002W0126 仅 一 个 多 月 的 时间 里
+BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
+BAC009S0002W0128 四十六 个 限 购 城市 当中
+BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
+BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
+```
--- a/speechx/examples/text_lm/path.sh
+++ b/speechx/examples/text_lm/path.sh
+# Environment setup sourced by run.sh in this example directory.
+# This file lives in speechx/examples/text_lm, so the repository root is
+# three levels above the working directory.
+MAIN_ROOT=$(realpath "$PWD/../../../")
+SPEECHX_ROOT=$(realpath "$MAIN_ROOT/speechx")
+# LC_ALL (not LC_AL) is the variable that overrides every locale category.
+export LC_ALL=C
--- a/speechx/examples/text_lm/run.sh
+++ b/speechx/examples/text_lm/run.sh
+#!/bin/bash
+# Text normalization (tn) and word segmentation preprocessing for n-gram LM text.
+# Passes data/text and data/text.tn to utils/zh_tn.py (presumably input and
+# output paths, in that order -- confirm against zh_tn.py).
+set -eo pipefail
+
+. path.sh
+
+# Defaults; sourced before parse_options.sh, which presumably lets the command
+# line override them (confirm against utils/parse_options.sh).
+stage=0
+stop_stage=100
+has_key=true
+token_type=word
+
+. utils/parse_options.sh || exit 1;
+
+text=data/text
+
+# Report the path that was actually checked, on stderr, and fail with a
+# valid (0-255) exit status.
+if [ ! -f "$text" ]; then
+    echo "$0: Not find $text" >&2;
+    exit 1;
+fi
+
+if [ "$stage" -le 0 ] && [ "$stop_stage" -ge 0 ];then
+    echo "text tn & wordseg preprocess"
+    # Remove any previous result before regenerating it.
+    rm -rf "${text}.tn"
+    python3 utils/zh_tn.py --has_key "$has_key" --token_type "$token_type" "${text}" "${text}.tn"
+fi
\ No newline at end of file
--- a/speechx/examples/text_lm/utils
+++ b/speechx/examples/text_lm/utils
+../../../utils/
\ No newline at end of file
--- a/speechx/examples/wfst/README.md
+++ b/speechx/examples/wfst/README.md
@@ -15,4 +15,4 @@ fstdeterminizestar --use-log=true
 fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst 
 Composing decoding graph TLG.fst succeeded
 Aishell build TLG done.
 ```
\ No newline at end of file
--- a/utils/compute-wer.py
+++ b/utils/compute-wer.py
--- a/utils/manifest_key_value.py
+++ b/utils/manifest_key_value.py
@@ -34,7 +34,7 @@ def main(args):
            utt2spk = line_json['utt2spk']
            # input
-            assert(len(line_json['input']) == 1), "only support one input now"
+            assert (len(line_json['input']) == 1), "only support one input now"
            input_json = line_json['input'][0]
            feat = input_json['feat']
            feat_shape = input_json['shape']
@@ -49,7 +49,8 @@ def main(args):
            fdur.write(f"{utt} {dur}\n")
            # output
-            assert(len(line_json['output']) == 1), "only support one output now"
+            assert (
+                len(line_json['output']) == 1), "only support one output now"
            output_json = line_json['output'][0]
            text = output_json['text']
            if 'token' in output_json:

--- a/utils/zh_tn.py
+++ b/utils/zh_tn.py