diff --git a/examples/other/ngram_lm/s0/local/download_lm_zh.sh b/examples/other/ngram_lm/s0/local/download_lm_zh.sh
index f9e2261fdd42fcab6e5d5643c32d48d4abe43c90..050749ce1ba674d07fde74b0c85dc26f8cc490a3 100755
--- a/examples/other/ngram_lm/s0/local/download_lm_zh.sh
+++ b/examples/other/ngram_lm/s0/local/download_lm_zh.sh
@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
 
 TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm
 
+if [ -e $TARGET ]; then
+    echo "$TARGET already exists, skip downloading."
+    exit 0
+fi
+
 echo "Download language model ..."
 download $URL $MD5 $TARGET
 if [ $? -ne 0 ]; then
diff --git a/speechx/examples/ngram/zh/local/aishell_train_lms.sh b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
index e3cee43891c5255c2e027f25742f81f6d7642adc..762661513a24b84477c6d0024f11af129001e9af 100755
--- a/speechx/examples/ngram/zh/local/aishell_train_lms.sh
+++ b/speechx/examples/ngram/zh/local/aishell_train_lms.sh
@@ -29,12 +29,13 @@
 mkdir -p $dir
 cleantext=$dir/text.no_oov
 
 # oov to <SPOKEN_NOISE>
-# line: utt word0 ... wordn -> line: word0 ... wordn
+# lexicon line: word char0 ... charn
+# text line: utt word0 ... wordn -> line: word0 ... wordn
 cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
   {for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
   > $cleantext || exit 1;
 
-# compute word counts
+# compute word counts, sorted in descending order
 # line: count word
 cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
   sort -nr > $dir/word.counts || exit 1;
@@ -50,8 +51,7 @@ cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
 cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
 
 # hold out to compute ppl
-heldout_sent=10000 # Don't change this if you want result to be comparable with
-    # kaldi_lm results
+heldout_sent=10000 # Don't change this if you want results to be comparable with kaldi_lm results
 mkdir -p $dir
 cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
   head -$heldout_sent > $dir/heldout
diff --git a/speechx/examples/ngram/zh/run.sh b/speechx/examples/ngram/zh/run.sh
--- a/speechx/examples/ngram/zh/run.sh
+++ b/speechx/examples/ngram/zh/run.sh
@@ ... @@
-        # line: word ph0 ... phn -> line: word char0 ... charn
-        utils/fst/prepare_dict.py \
-            --unit_file $unit \
-            --in_lexicon ${lexicon} \
-            --out_lexicon data/local/dict/lexicon.txt
-    else
-        local/text_to_lexicon.py --has_key true --text $text --lexicon data/local/dict/lexicon.txt
+    if [ ! -f $lexicon ]; then
+        local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
+        echo "Generated $lexicon from $text"
     fi
+
+    # filter by vocab
+    # line: word ph0 ... phn -> line: word char0 ... charn
+    utils/fst/prepare_dict.py \
+        --unit_file $unit \
+        --in_lexicon ${lexicon} \
+        --out_lexicon data/local/dict/lexicon.txt
 fi
 
 lm=data/local/lm
diff --git a/utils/fst/prepare_dict.py b/utils/fst/prepare_dict.py
index f59cd3113fd04de078a9616dd61cc6fa0fabe44e..301d72fb08376cab1a552c1c377738746715bfe4 100755
--- a/utils/fst/prepare_dict.py
+++ b/utils/fst/prepare_dict.py
@@ -3,7 +3,8 @@ import argparse
 
 
 def main(args):
-    # load `unit` or `vocab` file
+    # load vocab file
+    # line: token
     unit_table = set()
     with open(args.unit_file, 'r') as fin:
         for line in fin:
@@ -11,27 +12,41 @@
             unit_table.add(unit)
 
     def contain_oov(units):
+        """Check whether the token contains any unit outside the vocab.
+
+        Args:
+            units (str): token, as a sequence of units
+
+        Returns:
+            bool: True if any unit is not in the vocab, else False.
+ """ for unit in units: if unit not in unit_table: return True return False - # load spm model + # load spm model, for English bpemode = args.bpemodel if bpemode: import sentencepiece as spm sp = spm.SentencePieceProcessor() sp.Load(sys.bpemodel) - # used to filter polyphone + # used to filter polyphone and invalid word lexicon_table = set() + in_n = 0 # in lexicon word count + out_n = 0 # out lexicon word cout with open(args.in_lexicon, 'r') as fin, \ open(args.out_lexicon, 'w') as fout: for line in fin: word = line.split()[0] + in_n += 1 + if word == 'SIL' and not bpemode: # `sil` might be a valid piece in bpemodel + # filter 'SIL' for mandarin, keep it in English continue elif word == '': + # filter continue else: # each word only has one pronunciation for e2e system @@ -39,12 +54,14 @@ def main(args): continue if bpemode: + # for english pieces = sp.EncodeAsPieces(word) if contain_oov(pieces): print('Ignoring words {}, which contains oov unit'. format(''.join(word).strip('▁'))) continue + # word is piece list, which not have piece, filter out by `contain_oov(pieces)` chars = ' '.join( [p if p in unit_table else '' for p in pieces]) else: @@ -58,11 +75,14 @@ def main(args): # we assume the model unit of our e2e system is char now. if word.encode('utf8').isalpha() and '▁' in unit_table: word = '▁' + word + chars = ' '.join(word) # word is a char list fout.write('{} {}\n'.format(word, chars)) lexicon_table.add(word) + out_n += 1 + print(f"Filter lexicon by unit table: filter out {in_n - out_n}, {out_n}/{in_n}") if __name__ == '__main__': parser = argparse.ArgumentParser(