Commit 0ede6c2e authored by: H Hui Zhang

train lm

Parent c492a42f
......@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
if [ -e $TARGET ];then
echo "already have lm"
exit 0;
fi
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
......
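For reference, the download-and-verify step above as a minimal Python sketch (the repo's `download` helper is not shown in this diff, so `urllib` and `hashlib` stand in for it; the URL, MD5, and target path are the script's variables):

import hashlib
import os
import urllib.request

def download_and_verify(url, md5, target):
    # skip the download when the LM file is already present, like the shell check
    if os.path.exists(target):
        print("already have lm")
        return True
    print("Download language model ...")
    urllib.request.urlretrieve(url, target)
    # verify the MD5 checksum of the downloaded file
    with open(target, 'rb') as f:
        digest = hashlib.md5(f.read()).hexdigest()
    return digest == md5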
......@@ -29,12 +29,13 @@ mkdir -p $dir
cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
# lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
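The awk one-liner above is dense; an equivalent Python sketch of the same mapping (file paths are illustrative): every field not found in the lexicon's first column, including the leading utterance id, becomes <SPOKEN_NOISE>.

seen = set()
with open('data/lexicon.txt') as lex:            # lexicon line: word char0 ... charn
    for line in lex:
        fields = line.split()
        if fields:
            seen.add(fields[0])

with open('data/text') as fin, open('text.no_oov', 'w') as fout:
    for line in fin:
        mapped = [w if w in seen else '<SPOKEN_NOISE>' for w in line.split()]
        fout.write(' '.join(mapped) + '\n')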
# compute word counts
# compute word counts, sort in descending order
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
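The sort | uniq -c | sort -nr pipeline is the classic descending word-count idiom; the same counts in Python (field 1 is skipped because it now holds the <SPOKEN_NOISE> placeholder for the utterance id):

from collections import Counter

counts = Counter()
with open('text.no_oov') as fin:
    for line in fin:
        counts.update(line.split()[1:])      # count from field 2 onwards

with open('word.counts', 'w') as fout:
    for word, n in counts.most_common():     # descending order, like `sort -nr`
        fout.write(f"{n} {word}\n")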
......@@ -50,8 +51,7 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
# hold out to compute ppl
heldout_sent=10000 # Don't change this if you want result to be comparable with
# kaldi_lm results
heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
......
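The held-out split itself follows the usual kaldi_lm convention (first heldout_sent cleaned lines for perplexity, the rest for training); a sketch under that assumption, with illustrative output names:

heldout_sent = 10000   # keep fixed for comparability with kaldi_lm

with open('text.no_oov') as fin, \
        open('heldout.txt', 'w') as heldout, \
        open('train.txt', 'w') as train:
    for i, line in enumerate(fin):
        sent = ' '.join(line.split()[1:])    # drop the leading utt field
        (heldout if i < heldout_sent else train).write(sent + '\n')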
#!/usr/bin/env python3
import argparse
from collections import Counter
def main(args):
counter = Counter()
with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
for line in fin:
line = line.strip()
......@@ -11,10 +13,12 @@ def main(args):
else:
words = line.split()
for word in words:
val = " ".join(list(word))
fout.write(f"{word}\t{val}\n")
fout.flush()
counter.update(words)
for word in counter:
val = " ".join(list(word))
fout.write(f"{word}\t{val}\n")
fout.flush()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
......
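With the Counter in place, word-to-char entries are buffered while reading and each distinct word is written exactly once at the end, instead of being re-emitted on every occurrence. Invocation as in run.sh:

python3 local/text_to_lexicon.py --has_key true --text data/text --lexicon data/lexicon.txt

For a hypothetical transcript line `utt1 对 楼市`, the lexicon gains the tab-separated entries `对<TAB>对` and `楼市<TAB>楼 市` (each word followed by its space-joined characters).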
......@@ -3,11 +3,11 @@ set -eo pipefail
. path.sh
stage=0
stage=-1
stop_stage=100
corpus=aishell
unit=data/vocab.txt # line: char/spm_piece, vocab file
unit=data/vocab.txt # vocab file, line: char/spm_piece
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
......@@ -42,15 +42,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# line: char/spm_pieces
cp $unit data/local/dict/units.txt
if [ -f $lexicon ];then
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
else
local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt
if [ ! -f $lexicon ];then
local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
echo "Generate $lexicon from $text"
fi
# filter by vocab
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi
lm=data/local/lm
......
......@@ -3,7 +3,8 @@ import argparse
def main(args):
# load `unit` or `vocab` file
# load vocab file
# line: token
unit_table = set()
with open(args.unit_file, 'r') as fin:
for line in fin:
......@@ -11,27 +12,41 @@ def main(args):
unit_table.add(unit)
def contain_oov(units):
"""token not in vocab
Args:
units (str): token
Returns:
bool: True token in voca, else False.
"""
for unit in units:
if unit not in unit_table:
return True
return False
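# For example, with unit_table = {'你', '好'}:
#   contain_oov('你好') -> False  (every char is in the vocab, word is kept)
#   contain_oov('你们') -> True   ('们' is out of vocabulary, word is dropped)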
# load spm model
# load spm model, for English
bpemode = args.bpemodel
if bpemode:
import sentencepiece as spm
sp = spm.SentencePieceProcessor()
sp.Load(args.bpemodel)
# used to filter polyphone
# used to filter polyphone and invalid word
lexicon_table = set()
in_n = 0 # in lexicon word count
out_n = 0 # out lexicon word count
with open(args.in_lexicon, 'r') as fin, \
open(args.out_lexicon, 'w') as fout:
for line in fin:
word = line.split()[0]
in_n += 1
if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel
# filter out 'SIL' for Mandarin; keep it for English
continue
elif word == '<SPOKEN_NOISE>':
# filter <SPOKEN_NOISE>
continue
else:
# each word only has one pronunciation for e2e system
......@@ -39,12 +54,14 @@ def main(args):
continue
if bpemode:
# for english
pieces = sp.EncodeAsPieces(word)
if contain_oov(pieces):
print('Ignoring word {}, which contains an OOV unit'.
format(''.join(word).strip('▁')))
continue
# `word`'s pieces contain no <unk> at this point; words with OOV pieces were already filtered out by `contain_oov(pieces)`
chars = ' '.join(
[p if p in unit_table else '<unk>' for p in pieces])
else:
......@@ -58,11 +75,14 @@ def main(args):
# we assume the model unit of our e2e system is char now.
if word.encode('utf8').isalpha() and '▁' in unit_table:
word = '▁' + word
chars = ' '.join(word) # word is a char list
fout.write('{} {}\n'.format(word, chars))
lexicon_table.add(word)
out_n += 1
print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
if __name__ == '__main__':
parser = argparse.ArgumentParser(
......
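For the English/BPE branch, the word-to-entry mapping reduces to: encode the word with the SentencePiece model, drop it when any piece is out of vocabulary, otherwise emit the space-joined pieces. A minimal sketch under toy assumptions (the bpe.model path and the unit set are hypothetical):

import sentencepiece as spm

sp = spm.SentencePieceProcessor()
sp.Load('bpe.model')                  # hypothetical model path
unit_table = {'▁he', 'llo'}           # toy unit vocab

def word_to_entry(word):
    pieces = sp.EncodeAsPieces(word)
    if any(p not in unit_table for p in pieces):
        return None                   # contains an OOV piece; the word is skipped
    return '{} {}'.format(word, ' '.join(pieces))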