提交 0ede6c2e 编写于 作者: H Hui Zhang

train lm

上级 c492a42f
...@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3" ...@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
if [ -e $TARGET ];then
echo "already have lm"
exit 0;
fi
echo "Download language model ..." echo "Download language model ..."
download $URL $MD5 $TARGET download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
......
...@@ -29,12 +29,13 @@ mkdir -p $dir ...@@ -29,12 +29,13 @@ mkdir -p $dir
cleantext=$dir/text.no_oov cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE> # oov to <SPOKEN_NOISE>
# line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn # lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } } cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \ {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1; > $cleantext || exit 1;
# compute word counts # compute word counts, sort in descending order
# line: count word # line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1; sort -nr > $dir/word.counts || exit 1;
...@@ -50,8 +51,7 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \ ...@@ -50,8 +51,7 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
# hold out to compute ppl # hold out to compute ppl
heldout_sent=10000 # Don't change this if you want result to be comparable with heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
# kaldi_lm results
mkdir -p $dir mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \ cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
......
#!/usr/bin/env python3 #!/usr/bin/env python3
import argparse import argparse
from collections import Counter
def main(args): def main(args):
counter = Counter()
with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout: with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
for line in fin: for line in fin:
line = line.strip() line = line.strip()
...@@ -11,10 +13,12 @@ def main(args): ...@@ -11,10 +13,12 @@ def main(args):
else: else:
words = line.split() words = line.split()
for word in words: counter.update(words)
val = " ".join(list(word))
fout.write(f"{word}\t{val}\n") for word in counter:
fout.flush() val = " ".join(list(word))
fout.write(f"{word}\t{val}\n")
fout.flush()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
......
...@@ -3,11 +3,11 @@ set -eo pipefail ...@@ -3,11 +3,11 @@ set -eo pipefail
. path.sh . path.sh
stage=0 stage=-1
stop_stage=100 stop_stage=100
corpus=aishell corpus=aishell
unit=data/vocab.txt # line: char/spm_pice, vocab file unit=data/vocab.txt # vocab file, line: char/spm_pice
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
...@@ -42,15 +42,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then ...@@ -42,15 +42,17 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# line: char/spm_pices # line: char/spm_pices
cp $unit data/local/dict/units.txt cp $unit data/local/dict/units.txt
if [ -f $lexicon ];then if [ ! -f $lexicon ];then
# line: word ph0 ... phn -> line: word char0 ... charn local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
utils/fst/prepare_dict.py \ echo "Generate $lexicon from $text"
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
else
local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt
fi fi
# filter by vocab
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi fi
lm=data/local/lm lm=data/local/lm
......
...@@ -3,7 +3,8 @@ import argparse ...@@ -3,7 +3,8 @@ import argparse
def main(args): def main(args):
# load `unit` or `vocab` file # load vocab file
# line: token
unit_table = set() unit_table = set()
with open(args.unit_file, 'r') as fin: with open(args.unit_file, 'r') as fin:
for line in fin: for line in fin:
...@@ -11,27 +12,41 @@ def main(args): ...@@ -11,27 +12,41 @@ def main(args):
unit_table.add(unit) unit_table.add(unit)
def contain_oov(units): def contain_oov(units):
"""token not in vocab
Args:
units (str): token
Returns:
bool: True if any unit is not in the vocab, else False.
"""
for unit in units: for unit in units:
if unit not in unit_table: if unit not in unit_table:
return True return True
return False return False
# load spm model # load spm model, for English
bpemode = args.bpemodel bpemode = args.bpemodel
if bpemode: if bpemode:
import sentencepiece as spm import sentencepiece as spm
sp = spm.SentencePieceProcessor() sp = spm.SentencePieceProcessor()
sp.Load(sys.bpemodel) sp.Load(sys.bpemodel)
# used to filter polyphone # used to filter polyphone and invalid word
lexicon_table = set() lexicon_table = set()
in_n = 0 # in lexicon word count
out_n = 0 # out lexicon word count
with open(args.in_lexicon, 'r') as fin, \ with open(args.in_lexicon, 'r') as fin, \
open(args.out_lexicon, 'w') as fout: open(args.out_lexicon, 'w') as fout:
for line in fin: for line in fin:
word = line.split()[0] word = line.split()[0]
in_n += 1
if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel
# filter 'SIL' for mandarin, keep it in English
continue continue
elif word == '<SPOKEN_NOISE>': elif word == '<SPOKEN_NOISE>':
# filter <SPOKEN_NOISE>
continue continue
else: else:
# each word only has one pronunciation for e2e system # each word only has one pronunciation for e2e system
...@@ -39,12 +54,14 @@ def main(args): ...@@ -39,12 +54,14 @@ def main(args):
continue continue
if bpemode: if bpemode:
# for english
pieces = sp.EncodeAsPieces(word) pieces = sp.EncodeAsPieces(word)
if contain_oov(pieces): if contain_oov(pieces):
print('Ignoring words {}, which contains oov unit'. print('Ignoring words {}, which contains oov unit'.
format(''.join(word).strip('▁'))) format(''.join(word).strip('▁')))
continue continue
# pieces contain no OOV unit here: words with an OOV piece were already skipped by `contain_oov(pieces)`
chars = ' '.join( chars = ' '.join(
[p if p in unit_table else '<unk>' for p in pieces]) [p if p in unit_table else '<unk>' for p in pieces])
else: else:
...@@ -58,11 +75,14 @@ def main(args): ...@@ -58,11 +75,14 @@ def main(args):
# we assume the model unit of our e2e system is char now. # we assume the model unit of our e2e system is char now.
if word.encode('utf8').isalpha() and '▁' in unit_table: if word.encode('utf8').isalpha() and '▁' in unit_table:
word = '▁' + word word = '▁' + word
chars = ' '.join(word) # word is a char list chars = ' '.join(word) # word is a char list
fout.write('{} {}\n'.format(word, chars)) fout.write('{} {}\n'.format(word, chars))
lexicon_table.add(word) lexicon_table.add(word)
out_n += 1
print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}")
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册