提交 a054d1c4 编写于 作者: H Hui Zhang

text process for lm

上级 eb52896c
...@@ -27,7 +27,7 @@ arpa=$3 ...@@ -27,7 +27,7 @@ arpa=$3
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
# text tn & wordseg preprocess # text tn & wordseg preprocess
echo "process text." echo "process text."
python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn python3 ${MAIN_ROOT}/utils/zh_tn.py --token_type ${type} ${text} ${text}.${type}.tn
fi fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
......
...@@ -64,6 +64,7 @@ base = [ ...@@ -64,6 +64,7 @@ base = [
"webrtcvad", "webrtcvad",
"yacs~=0.1.8", "yacs~=0.1.8",
"prettytable", "prettytable",
"zhon",
] ]
server = [ server = [
...@@ -90,7 +91,6 @@ requirements = { ...@@ -90,7 +91,6 @@ requirements = {
"unidecode", "unidecode",
"yq", "yq",
"pre-commit", "pre-commit",
"zhon",
] ]
} }
......
...@@ -24,4 +24,4 @@ LM: aishell train ...@@ -24,4 +24,4 @@ LM: aishell train
Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819 Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819
Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818 Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818
Other -> 0.00 % N=0 C=0 S=0 D=0 I=1 Other -> 0.00 % N=0 C=0 S=0 D=0 I=1
``` ```
\ No newline at end of file
...@@ -98,4 +98,4 @@ Ignoring words 瑗, which contains oov unit ...@@ -98,4 +98,4 @@ Ignoring words 瑗, which contains oov unit
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745 0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done. build LM done.
``` ```
\ No newline at end of file
# Text Preprocessing for Building an N-gram LM
The output `text` file looks like this:
```
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
```
# Environment setup meant to be sourced by the example scripts.
# MAIN_ROOT is resolved four directory levels above the current working
# directory; SPEECHX_ROOT is the `speechx` directory beneath it.
MAIN_ROOT=$(realpath "$PWD/../../../../")
SPEECHX_ROOT=$(realpath "$MAIN_ROOT/speechx")

# Force the C locale. The previous spelling `LC_AL` is not a variable any
# tool reads, so the setting silently had no effect; `LC_ALL` is the one
# that overrides every locale category.
# NOTE(review): confirm the downstream Python text processing still reads
# and writes UTF-8 correctly when run under the C locale.
export LC_ALL=C
#!/bin/bash
# Text normalization (tn) and word segmentation preprocessing.
# Reads data/text and writes data/text.tn by running utils/zh_tn.py.
#
# Tunable variables (presumably overridden from the command line by
# utils/parse_options.sh -- verify against that helper):
#   stage / stop_stage  inclusive range of stages to run; only stage 0 exists
#   has_key             forwarded to zh_tn.py as --has_key
#   token_type          forwarded to zh_tn.py as --token_type
set -eo pipefail

. path.sh

stage=0
stop_stage=100
has_key=true
token_type=word

. utils/parse_options.sh || exit 1

text=data/text

if [ ! -f "${text}" ]; then
    # Report the file that was actually checked (the old message printed $1,
    # which is unrelated to the hard-coded input path), and use stderr.
    echo "$0: Not find ${text}" >&2
    exit 1
fi

if [ "${stage}" -le 0 ] && [ "${stop_stage}" -ge 0 ]; then
    echo "text tn & wordseg preprocess"
    # Remove any stale output so a failed run cannot leave old results behind.
    rm -rf -- "${text}.tn"
    python3 utils/zh_tn.py --has_key "${has_key}" --token_type "${token_type}" "${text}" "${text}.tn"
fi
\ No newline at end of file
../../../utils/
\ No newline at end of file
...@@ -15,4 +15,4 @@ fstdeterminizestar --use-log=true ...@@ -15,4 +15,4 @@ fstdeterminizestar --use-log=true
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
Composing decoding graph TLG.fst succeeded Composing decoding graph TLG.fst succeeded
Aishell build TLG done. Aishell build TLG done.
``` ```
\ No newline at end of file
此差异已折叠。
...@@ -34,7 +34,7 @@ def main(args): ...@@ -34,7 +34,7 @@ def main(args):
utt2spk = line_json['utt2spk'] utt2spk = line_json['utt2spk']
# input # input
assert(len(line_json['input']) == 1), "only support one input now" assert (len(line_json['input']) == 1), "only support one input now"
input_json = line_json['input'][0] input_json = line_json['input'][0]
feat = input_json['feat'] feat = input_json['feat']
feat_shape = input_json['shape'] feat_shape = input_json['shape']
...@@ -49,7 +49,8 @@ def main(args): ...@@ -49,7 +49,8 @@ def main(args):
fdur.write(f"{utt} {dur}\n") fdur.write(f"{utt} {dur}\n")
# output # output
assert(len(line_json['output']) == 1), "only support one output now" assert (
len(line_json['output']) == 1), "only support one output now"
output_json = line_json['output'][0] output_json = line_json['output'][0]
text = output_json['text'] text = output_json['text']
if 'token' in output_json: if 'token' in output_json:
......
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册