From f5369abdbe86d66ef6ec6e2f5fddfe331fcd8922 Mon Sep 17 00:00:00 2001
From: Hui Zhang
Date: Mon, 26 Jul 2021 10:05:50 +0000
Subject: [PATCH] add TLG utils

---
 examples/aishell/s1/run.sh             |  44 +++++-
 examples/dataset/aishell/.gitignore    |   3 +-
 utils/fst/add_lex_disambig.pl          | 195 +++++++++++++++++++++++++
 utils/fst/compile_lexicon_token_fst.sh |  88 +++++++++++
 utils/fst/ctc_token_fst.py             |  51 +++++++
 utils/fst/ctc_token_fst_corrected.py   |  80 ++++++++++
 utils/fst/eps2disambig.pl              |  29 ++++
 utils/fst/make_lexicon_fst.pl          | 154 +++++++++++++++++++
 utils/fst/make_tlg.sh                  |  49 +++++++
 utils/fst/prepare_dict.py              |  90 ++++++++++++
 utils/fst/remove_oovs.pl               |  42 ++++++
 utils/fst/rnnt_token_fst.py            |  38 +++++
 utils/fst/s2eps.pl                     |  27 ++++
 13 files changed, 885 insertions(+), 5 deletions(-)
 create mode 100644 utils/fst/add_lex_disambig.pl
 create mode 100644 utils/fst/compile_lexicon_token_fst.sh
 create mode 100644 utils/fst/ctc_token_fst.py
 create mode 100644 utils/fst/ctc_token_fst_corrected.py
 create mode 100644 utils/fst/eps2disambig.pl
 create mode 100644 utils/fst/make_lexicon_fst.pl
 create mode 100644 utils/fst/make_tlg.sh
 create mode 100644 utils/fst/prepare_dict.py
 create mode 100644 utils/fst/remove_oovs.pl
 create mode 100644 utils/fst/rnnt_token_fst.py
 create mode 100644 utils/fst/s2eps.pl

diff --git a/examples/aishell/s1/run.sh b/examples/aishell/s1/run.sh
index 65b48a97..cf5ed508 100644
--- a/examples/aishell/s1/run.sh
+++ b/examples/aishell/s1/run.sh
@@ -38,7 +38,43 @@ if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     CUDA_VISIBLE_DEVICES=0 ./local/align.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} || exit -1
 fi
 
-if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
-    # export ckpt avg_n
-    CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
-fi
+# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+#     # export ckpt avg_n
+#     CUDA_VISIBLE_DEVICES=0 ./local/export.sh ${conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} exp/${ckpt}/checkpoints/${avg_ckpt}.jit
+# fi
+
+# Optionally, you can add an LM and test it with the runtime.
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # 6.1 Prepare dict
+    unit_file=data/vocab.txt
+    mkdir -p data/local/dict
+    cp $unit_file data/local/dict/units.txt
+    utils/fst/prepare_dict.py --unit_file $unit_file \
+        --in_lexicon ${data}/resource_aishell/lexicon.txt \
+        --out_lexicon data/local/dict/lexicon.txt
+
+    # 6.2 Train lm
+    lm=data/local/lm
+    mkdir -p $lm
+    utils/filter_scp.pl data/train/text \
+        $data/data_aishell/transcript/aishell_transcript_v0.8.txt > $lm/text
+    local/aishell_train_lms.sh
+
+    # 6.3 Build decoding TLG
+    utils/fst/compile_lexicon_token_fst.sh \
+        data/local/dict data/local/tmp data/local/lang
+    utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
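+
+    # For reference, an illustrative (not exhaustive) listing of what
+    # data/lang_test should contain after step 6.3:
+    #   tokens.txt words.txt units.txt T.fst L.fst G.fst LG.fst TLG.fst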
+
+    # # 6.4 Decoding with runtime
+    # # reverse_weight only takes effect for u2++ models; when it is set to 0.0, only the left-to-right decoder is used.
+    # dir=exp/conformer
+    # reverse_weight=0.0
+    # chunk_size=-1
+    # ./tools/decode.sh --nj 16 \
+    #     --beam 15.0 --lattice_beam 7.5 --max_active 7000 \
+    #     --blank_skip_thresh 0.98 --ctc_weight 0.5 --rescoring_weight 1.0 \
+    #     --reverse_weight $reverse_weight --chunk_size $chunk_size \
+    #     --fst_path data/lang_test/TLG.fst \
+    #     data/test/wav.scp data/test/text $dir/final.zip \
+    #     data/lang_test/words.txt $dir/lm_with_runtime
+    # # See $dir/lm_with_runtime for wer
+fi
diff --git a/examples/dataset/aishell/.gitignore b/examples/dataset/aishell/.gitignore
index eea6573e..27194aab 100644
--- a/examples/dataset/aishell/.gitignore
+++ b/examples/dataset/aishell/.gitignore
@@ -1,4 +1,5 @@
 data_aishell*
 *.meta
 manifest.*
-*.tgz
\ No newline at end of file
+*.tgz
+resource_aishell
diff --git a/utils/fst/add_lex_disambig.pl b/utils/fst/add_lex_disambig.pl
new file mode 100644
index 00000000..8ecbbd3a
--- /dev/null
+++ b/utils/fst/add_lex_disambig.pl
@@ -0,0 +1,195 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+#           2013-2016 Johns Hopkins University (author: Daniel Povey)
+#                2015 Hainan Xu
+#                2015 Guoguo Chen
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# Adds disambiguation symbols to a lexicon.
+# Outputs still in the normal lexicon format.
+# Disambig syms are numbered #1, #2, #3, etc. (#0
+# reserved for symbol in grammar).
+# Outputs the number of disambig syms to the standard output.
+# With the --pron-probs option, expects the second field
+# of each lexicon line to be a pron-prob.
+# With the --sil-probs option, expects three additional
+# fields after the pron-prob, representing various components
+# of the silence probability model.
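+#
+# Illustrative example (hypothetical entries, not from this patch): given two
+# lexicon lines with the same phone sequence,
+#   好 h ao
+#   号 h ao
+# the output lexicon becomes
+#   好 h ao #1
+#   号 h ao #2
+# and the script prints "2", the largest disambiguation symbol used.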
+
+$pron_probs = 0;
+$sil_probs = 0;
+$first_allowed_disambig = 1;
+
+for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
+  if ($ARGV[0] eq "--pron-probs") {
+    $pron_probs = 1;
+    shift @ARGV;
+  }
+  if ($ARGV[0] eq "--sil-probs") {
+    $sil_probs = 1;
+    shift @ARGV;
+  }
+  if ($ARGV[0] eq "--first-allowed-disambig") {
+    $first_allowed_disambig = 0 + $ARGV[1];
+    if ($first_allowed_disambig < 1) {
+      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
+    }
+    shift @ARGV;
+    shift @ARGV;
+  }
+}
+
+if (@ARGV != 2) {
+  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
+      "This script adds disambiguation symbols to a lexicon in order to\n" .
+      "make decoding graphs determinizable; it adds pseudo-phone\n" .
+      "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
+      "to ensure that all pronunciations are different, and that none\n" .
+      "is a prefix of another.\n" .
+      "It prints to the standard output the number of the largest-numbered\n" .
+      "disambiguation symbol that was used.\n" .
+      "\n" .
+      "Options:  --pron-probs      Expect pronunciation probabilities in the 2nd field\n" .
+      "          --sil-probs       [should be with --pron-probs option]\n" .
+      "                            Expect 3 extra fields after the pron-probs, for aspects of\n" .
+      "                            the silence probability model\n" .
+      "          --first-allowed-disambig <n>  The number of the first disambiguation symbol\n" .
+      "                            that this script is allowed to add. By default this is\n" .
+      "                            #1, but you can set this to a larger value using this option.\n" .
+      "e.g.:\n" .
+      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
+      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
+      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
+}
+
+
+$lexfn = shift @ARGV;
+$lexoutfn = shift @ARGV;
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+# (1) Read in the lexicon.
+@L = ( );
+while(<L>) {
+  @A = split(" ", $_);
+  push @L, join(" ", @A);
+}
+
+# (2) Work out the count of each phone-sequence in the
+# lexicon.
+
+foreach $l (@L) {
+  @A = split(" ", $l);
+  shift @A; # Remove word.
+  if ($pron_probs) {
+    $p = shift @A;
+    if (!($p > 0.0 && $p <= 1.0)) { die "Bad lexicon line $l (expecting pron-prob as second field)"; }
+  }
+  if ($sil_probs) {
+    $silp = shift @A;
+    if (!($silp > 0.0 && $silp <= 1.0)) { die "Bad lexicon line $l for silprobs"; }
+    $correction = shift @A;
+    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
+    $correction = shift @A;
+    if ($correction <= 0.0) { die "Bad lexicon line $l for silprobs"; }
+  }
+  if (!(@A)) {
+    die "Bad lexicon line $l, no phone in phone list";
+  }
+  $count{join(" ",@A)}++;
+}
+
+# (3) For each left sub-sequence of each phone-sequence, note down
+# that it exists (for identifying prefixes of longer strings).
+
+foreach $l (@L) {
+  @A = split(" ", $l);
+  shift @A; # Remove word.
+  if ($pron_probs) { shift @A; } # remove pron-prob.
+  if ($sil_probs) {
+    shift @A; # Remove silprob
+    shift @A; # Remove silprob
+  }
+  while(@A > 0) {
+    pop @A; # Remove last phone
+    $issubseq{join(" ",@A)} = 1;
+  }
+}
+
+# (4) For each entry in the lexicon:
+# if the phone sequence is unique and is not a
+# prefix of another word, no disambig symbol.
+# Else output #1, or #2, #3, ... if the same phone-seq
+# has already been assigned a disambig symbol.
+
+
+open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";
+
+# max_disambig will always be the highest-numbered disambiguation symbol that
+# has been used so far.
+$max_disambig = $first_allowed_disambig - 1;
+
+foreach $l (@L) {
+  @A = split(" ", $l);
+  $word = shift @A;
+  if ($pron_probs) {
+    $pron_prob = shift @A;
+  }
+  if ($sil_probs) {
+    $sil_word_prob = shift @A;
+    $word_sil_correction = shift @A;
+    $prev_nonsil_correction = shift @A;
+  }
+  $phnseq = join(" ", @A);
+  if (!defined $issubseq{$phnseq}
+      && $count{$phnseq} == 1) {
+    ; # Do nothing.
+  } else {
+    if ($phnseq eq "") { # need disambig symbols for the empty string
+      # that are not used anywhere else.
+      $max_disambig++;
+      $reserved_for_the_empty_string{$max_disambig} = 1;
+      $phnseq = "#$max_disambig";
+    } else {
+      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
+      if (!defined $cur_disambig) {
+        $cur_disambig = $first_allowed_disambig;
+      } else {
+        $cur_disambig++; # Get a number that has not been used yet for
+                         # this phone sequence.
+      }
+      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
+        $cur_disambig++;
+      }
+      if ($cur_disambig > $max_disambig) {
+        $max_disambig = $cur_disambig;
+      }
+      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
+      $phnseq = $phnseq . " #" . $cur_disambig;
+    }
+  }
+  if ($pron_probs) {
+    if ($sil_probs) {
+      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
+    } else {
+      print O "$word\t$pron_prob\t$phnseq\n";
+    }
+  } else {
+    print O "$word\t$phnseq\n";
+  }
+}
+
+print $max_disambig . "\n";
\ No newline at end of file
diff --git a/utils/fst/compile_lexicon_token_fst.sh b/utils/fst/compile_lexicon_token_fst.sh
new file mode 100644
index 00000000..6e5716b7
--- /dev/null
+++ b/utils/fst/compile_lexicon_token_fst.sh
@@ -0,0 +1,88 @@
+#!/bin/bash
+# Copyright 2015 Yajie Miao (Carnegie Mellon University)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
+# phoneme and character-based lexicons.
+set -eo pipefail
+. utils/parse_options.sh
+
+if [ $# -ne 3 ]; then
+  echo "usage: utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
+  echo "e.g.: utils/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
+  echo "<dict-src-dir> should contain the following files:"
+  echo "lexicon.txt lexicon_numbers.txt units.txt"
+  echo "options: "
+  exit 1;
+fi
+
+srcdir=$1
+tmpdir=$2
+dir=$3
+mkdir -p $dir $tmpdir
+
+[ -f path.sh ] && . ./path.sh
+
+cp $srcdir/units.txt $dir
+
+# Add probabilities to lexicon entries. There is in fact no point in doing this here since all the entries have 1.0.
+# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
+perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;
+
+# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
+# Without these symbols, determinization will fail.
+# The default first disambiguation symbol is #1.
+ndisambig=`utils/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
+# add #0 (#0 reserved for symbol in grammar).
+ndisambig=$[$ndisambig+1];
+
+( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list
+
+# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blank>,
+# the actual model units, and the disambiguation symbols.
+cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
+(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt
+
+# ctc_token_fst_corrected is too big and too slow for character-based Chinese modeling,
+# so here we just use the simple ctc_token_fst.
+utils/fst/ctc_token_fst.py --token_file $dir/tokens.txt | \
+  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt --keep_isymbols=false --keep_osymbols=false | \
+  fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;
+
+# Encode the words with indices. Will be used in lexicon and language model FST compiling.
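+# As a hedged illustration (the ids depend on the actual lexicon), a one-word
+# lexicon would yield a words.txt like:
+#   <eps> 0
+#   你好 1
+#   #0 2
+#   <s> 3
+#   </s> 4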
+cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk '
+  BEGIN {
+    print "<eps> 0";
+  }
+  {
+    printf("%s %d\n", $1, NR);
+  }
+  END {
+    printf("#0 %d\n", NR+1);
+    printf("<s> %d\n", NR+2);
+    printf("</s> %d\n", NR+3);
+  }' > $dir/words.txt || exit 1;
+
+# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
+token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
+word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`
+
+utils/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
+  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
+    --keep_isymbols=false --keep_osymbols=false | \
+  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
+  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;
+
+echo "Lexicon and Token FSTs compiling succeeded"
\ No newline at end of file
diff --git a/utils/fst/ctc_token_fst.py b/utils/fst/ctc_token_fst.py
new file mode 100644
index 00000000..d41da568
--- /dev/null
+++ b/utils/fst/ctc_token_fst.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+import argparse
+import sys
+
+
+def main(args):
+    """Token Transducer"""
+    # entry arc
+    print('0 1 <eps> <eps>')
+    # skip beginning and ending <blank>
+    print('1 1 <blank> <eps>')
+    print('2 2 <blank> <eps>')
+    # exit arc
+    print('2 0 <eps> <eps>')
+
+    # linking `token` between node 1 and node 2
+    with open(args.token_file, 'r') as fin:
+        node = 3
+        for entry in fin:
+            fields = entry.strip().split(' ')
+            phone = fields[0]
+            if phone == '<eps>' or phone == '<blank>':
+                continue
+            elif '#' in phone:
+                # disambiguation symbol:
+                # a `token` sequence may end with a disambiguation symbol
+                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
+            else:
+                # eating `token`
+                print('{} {} {} {}'.format(1, node, phone, phone))
+                # removing repeated `token`
+                print('{} {} {} {}'.format(node, node, phone, '<eps>'))
+                # leaving `token`
+                print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
+                node += 1
+    # Final node
+    print('0')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='FST: CTC Token FST transducer')
+    parser.add_argument(
+        '--token_file',
+        required=True,
+        help='e2e model token file. line: token(char/phone/spm/disambiguation)')
+
+    args = parser.parse_args()
+    # log to stderr so the FST text on stdout stays clean for fstcompile
+    print(args, file=sys.stderr)
+
+    main(args)
diff --git a/utils/fst/ctc_token_fst_corrected.py b/utils/fst/ctc_token_fst_corrected.py
new file mode 100644
index 00000000..e88436a4
--- /dev/null
+++ b/utils/fst/ctc_token_fst_corrected.py
@@ -0,0 +1,80 @@
+#!/usr/bin/env python3
+import argparse
+import sys
+
+
+def il(n):
+    """ilabel"""
+    return n + 1
+
+
+def ol(n):
+    """olabel"""
+    return n + 1
+
+
+def s(n):
+    """state"""
+    return n
+
+
+def main(args):
+    with open(args.token_file) as f:
+        lines = f.readlines()
+    # token counts, excluding <eps> and <blank>
+    phone_count = 0
+    disambig_count = 0
+    for line in lines:
+        sp = line.strip().split()
+        phone = sp[0]
+        if phone == '<eps>' or phone == '<blank>':
+            continue
+        if phone.startswith('#'):
+            disambig_count += 1
+        else:
+            phone_count += 1
+
+    # 1. add start state
+    # the first token is <blank>:<eps>
+    print('0 0 {} 0'.format(il(0)))
+
+    # 2. 0 -> i, i -> i, i -> 0
+    # non-blank tokens start from 1
+    for i in range(1, phone_count + 1):
+        # eating `token`
+        print('0 {} {} {}'.format(s(i), il(i), ol(i)))
+        # removing repeated `token`
+        print('{} {} {} 0'.format(s(i), s(i), il(i)))
+        # skipping ending <blank>
+        print('{} 0 {} 0'.format(s(i), il(0)))
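+
+    # Illustrative fragment (hypothetical one-token set {a}, so tokens.txt
+    # reads "<eps> 0", "<blank> 1", "a 2"): with these ids, step 2 emits
+    #   0 1 2 2   (eat `a`)
+    #   1 1 2 0   (collapse repeated `a`)
+    #   1 0 1 0   (consume <blank>, back to the start state)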
+
+    # 3. i -> other phone
+    # non-blank token to other non-blank token
+    for i in range(1, phone_count + 1):
+        for j in range(1, phone_count + 1):
+            if i != j:
+                print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j)))
+
+    # 4. add disambiguation arcs on every final state
+    # blank and non-blank tokens may end with a disambiguation `token`
+    for i in range(0, phone_count + 1):
+        for j in range(phone_count + 2, phone_count + disambig_count + 2):
+            print('{} {} {} {}'.format(s(i), s(i), 0, j))
+
+    # 5. every state i is a final state
+    # blank and non-blank `token` states are final states
+    for i in range(0, phone_count + 1):
+        print(s(i))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='FST: CTC Token unfold FST transducer')
+    parser.add_argument(
+        '--token_file',
+        required=True,
+        help='e2e model token file. line: token(char/phone/spm/disambiguation)')
+
+    args = parser.parse_args()
+    # log to stderr so the FST text on stdout stays clean for fstcompile
+    print(args, file=sys.stderr)
+
+    main(args)
diff --git a/utils/fst/eps2disambig.pl b/utils/fst/eps2disambig.pl
new file mode 100644
index 00000000..52ec0acb
--- /dev/null
+++ b/utils/fst/eps2disambig.pl
@@ -0,0 +1,29 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+#                2015 Guoguo Chen
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script replaces epsilon with #0 on the input side only, of the G.fst
+# acceptor.
+
+while(<>){
+  if (/\s+#0\s+/) {
+    print STDERR "$0: ERROR: LM has word #0, " .
+      "which is reserved as disambiguation symbol\n";
+    exit 1;
+  }
+  s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
+  print;
+}
\ No newline at end of file
diff --git a/utils/fst/make_lexicon_fst.pl b/utils/fst/make_lexicon_fst.pl
new file mode 100644
index 00000000..95cda9df
--- /dev/null
+++ b/utils/fst/make_lexicon_fst.pl
@@ -0,0 +1,154 @@
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2010-2011 Microsoft Corporation
+#                2013 Johns Hopkins University (author: Daniel Povey)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).
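+#
+# A hedged sketch of the output (state numbers illustrative): with no optional
+# silence and no pron-probs, the entry "你好 你 好" expands to the arcs
+#   0 1 你 你好
+#   1 0 好 <eps>
+# followed by the final-state line "0 0".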
+
+$pron_probs = 0;
+
+if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
+  $pron_probs = 1;
+  shift @ARGV;
+}
+
+if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
+  print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
+  print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
+  print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
+  print STDERR " word phone1 phone2 ... phoneN;\n";
+  print STDERR "if the --pron-probs option is used, each line is:\n";
+  print STDERR " word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
+  print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
+  print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
+  print STDERR "this is your responsibility.\n\n";
+  print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
+  print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
+  print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
+  exit(1);
+}
+
+$lexfn = shift @ARGV;
+if (@ARGV == 0) {
+  $silprob = 0.0;
+} elsif (@ARGV == 2) {
+  ($silprob,$silphone) = @ARGV;
+} else {
+  ($silprob,$silphone,$sildisambig) = @ARGV;
+}
+if ($silprob != 0.0) {
+  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
+  $silcost = -log($silprob);
+  $nosilcost = -log(1.0 - $silprob);
+}
+
+
+open(L, "<$lexfn") || die "Error opening lexicon $lexfn";
+
+
+if ( $silprob == 0.0 ) { # No optional silences: just have one (loop+final) state which is numbered zero.
+  $loopstate = 0;
+  $nextstate = 1; # next unallocated state.
+  while (<L>) {
+    @A = split(" ", $_);
+    @A == 0 && die "Empty lexicon line.";
+    foreach $a (@A) {
+      if ($a eq "<eps>") {
+        die "Bad lexicon line $_ (<eps> is forbidden)";
+      }
+    }
+    $w = shift @A;
+    if (! $pron_probs) {
+      $pron_cost = 0.0;
+    } else {
+      $pron_prob = shift @A;
+      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
+        die "Bad pronunciation probability in line $_";
+      }
+      $pron_cost = -log($pron_prob);
+    }
+    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
+
+    $s = $loopstate;
+    $word_or_eps = $w;
+    while (@A > 0) {
+      $p = shift @A;
+      if (@A > 0) {
+        $ns = $nextstate++;
+      } else {
+        $ns = $loopstate;
+      }
+      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
+      $word_or_eps = "<eps>";
+      $pron_cost_string = ""; # so we only print it on the first arc of the word.
+      $s = $ns;
+    }
+  }
+  print "$loopstate\t0\n"; # final-cost.
+} else { # have silence probs.
+  $startstate = 0;
+  $loopstate = 1;
+  $silstate = 2; # state from where we go to loopstate after emitting silence.
+  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
+  if (!defined $sildisambig) {
+    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+    print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
+    $nextstate = 3;
+  } else {
+    $disambigstate = 3;
+    $nextstate = 4;
+    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
+    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
+    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
+  }
+  while (<L>) {
+    @A = split(" ", $_);
+    $w = shift @A;
+    if (! $pron_probs) {
+      $pron_cost = 0.0;
+    } else {
+      $pron_prob = shift @A;
+      if (! defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
+        die "Bad pronunciation probability in line $_";
+      }
+      $pron_cost = -log($pron_prob);
+    }
+    if ($pron_cost != 0.0) { $pron_cost_string = "\t$pron_cost"; } else { $pron_cost_string = ""; }
+    $s = $loopstate;
+    $word_or_eps = $w;
+    while (@A > 0) {
+      $p = shift @A;
+      if (@A > 0) {
+        $ns = $nextstate++;
+        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
+        $word_or_eps = "<eps>";
+        $pron_cost_string = ""; $pron_cost = 0.0; # so we only print it the 1st time.
+        $s = $ns;
+      } elsif (!defined($silphone) || $p ne $silphone) {
+        # This is non-deterministic but relatively compact,
+        # and avoids epsilons.
+        $local_nosilcost = $nosilcost + $pron_cost;
+        $local_silcost = $silcost + $pron_cost;
+        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
+        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
+      } else {
+        # no point putting opt-sil after silence word.
+        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
+      }
+    }
+  }
+  print "$loopstate\t0\n"; # final-cost.
+}
\ No newline at end of file
diff --git a/utils/fst/make_tlg.sh b/utils/fst/make_tlg.sh
new file mode 100644
index 00000000..c68387af
--- /dev/null
+++ b/utils/fst/make_tlg.sh
@@ -0,0 +1,49 @@
+#!/bin/bash
+
+if [ -f path.sh ]; then . path.sh; fi
+
+lm_dir=$1
+src_lang=$2
+tgt_lang=$3
+
+arpa_lm=${lm_dir}/lm.arpa
+[ ! -f $arpa_lm ] && { echo "No such file $arpa_lm"; exit 1;}
+
+rm -rf $tgt_lang
+cp -r $src_lang $tgt_lang
+
+# Convert the language model into an FST.
+# grep -i / --ignore-case: ignore case distinctions.
+# grep -v / --invert-match: select non-matching lines.
+# arpa2fst: remove the embedded <s> and </s> symbols from the FST,
+# arpa2fst: make sure there are no out-of-vocabulary words in the language model,
+# arpa2fst: remove "illegal" sequences of the start and end-of-sentence symbols.
+# eps2disambig.pl: replace epsilons on the input side with the special disambiguation symbol #0.
+# s2eps.pl: replace <s> and </s> with <eps> (on both input and output sides), for the G.fst acceptor.
+# In G.fst, the disambiguation symbol #0 only appears on the input side.
+# eps2disambig.pl and s2eps.pl are maybe just preparation for the following `fstrmepsilon`.
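+# Typical invocation, matching the example in examples/aishell/s1/run.sh:
+#   utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test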
+cat $arpa_lm | \
+  grep -v '<s> <s>' | \
+  grep -v '</s> <s>' | \
+  grep -v '</s> </s>' | \
+  grep -v -i '<unk>' | \
+  grep -v -i '<spoken_noise>' | \
+  arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
+  utils/fst/eps2disambig.pl | utils/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
+    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
+  fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst
+
+
+echo "Checking how stochastic G is (the first of these numbers should be small):"
+fstisstochastic $tgt_lang/G.fst
+
+# Compose the token, lexicon and language-model FSTs into the final decoding graph.
+# Minimization: the same minimization algorithm that applies to weighted acceptors;
+# the only change relevant here is that it avoids pushing weights,
+# hence preserving stochasticity.
+fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
+  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
+fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;
+
+echo "Composing decoding graph TLG.fst succeeded"
+#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
\ No newline at end of file
diff --git a/utils/fst/prepare_dict.py b/utils/fst/prepare_dict.py
new file mode 100644
index 00000000..471b12ec
--- /dev/null
+++ b/utils/fst/prepare_dict.py
@@ -0,0 +1,90 @@
+#!/usr/bin/env python3
+import argparse
+
+
+def contain_oov(units, unit_table):
+    """Return True if any unit is not in the unit table."""
+    for unit in units:
+        if unit not in unit_table:
+            return True
+    return False
+
+
+def main(args):
+    # load `unit` or `vocab` file
+    unit_table = set()
+    with open(args.unit_file, 'r') as fin:
+        for line in fin:
+            unit = line.strip()
+            unit_table.add(unit)
+
+    # load spm model
+    bpemode = args.bpemodel
+    if bpemode:
+        import sentencepiece as spm
+        sp = spm.SentencePieceProcessor()
+        sp.Load(args.bpemodel)
+
+    # used to filter polyphones
+    lexicon_table = set()
+    with open(args.in_lexicon, 'r') as fin, \
+            open(args.out_lexicon, 'w') as fout:
+        for line in fin:
+            word = line.split()[0]
+            if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in a bpemodel
+                continue
+            elif word == '<SPOKEN_NOISE>':
+                continue
+            else:
+                # each word has only one pronunciation in an e2e system
+                if word in lexicon_table:
+                    continue
+
+                if bpemode:
+                    pieces = sp.EncodeAsPieces(word)
+                    if contain_oov(pieces, unit_table):
+                        print('Ignoring word {}, which contains an oov unit'.
+                              format(''.join(word).strip('▁')))
+                        continue
+
+                    chars = ' '.join(
+                        [p if p in unit_table else '<unk>' for p in pieces])
+                else:
+                    # ignore words with OOV units
+                    if contain_oov(word, unit_table):
+                        print('Ignoring word {}, which contains an oov unit'.
+                              format(word))
+                        continue
+
+                    # Optionally, prepend ▁ to an English word;
+                    # we assume the model unit of our e2e system is char for now.
+                    if word.encode('utf8').isalpha() and '▁' in unit_table:
+                        word = '▁' + word
+                    chars = ' '.join(word)  # word is a char list
+
+                fout.write('{} {}\n'.format(word, chars))
+                lexicon_table.add(word)
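+
+
+# Illustrative result (char units, hypothetical lexicon line "你好 n i2 h ao3"):
+# the output lexicon gets the line "你好 你 好". With --bpemodel set, an
+# English word is mapped to its sentencepiece pieces instead.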
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(
+        description='FST: prepare e2e(char/spm) dict')
+    parser.add_argument(
+        '--unit_file',
+        required=True,
+        help='e2e model unit file(lang_char.txt/vocab.txt). line: char/spm_piece'
+    )
+    parser.add_argument(
+        '--in_lexicon',
+        required=True,
+        help='raw lexicon file. line: word ph0 ... phn')
+    parser.add_argument(
+        '--out_lexicon',
+        required=True,
+        help='output lexicon file. line: word char0 ... charn')
+    parser.add_argument('--bpemodel', default=None, help='bpemodel')
+
+    args = parser.parse_args()
+    print(args)
+
+    main(args)
diff --git a/utils/fst/remove_oovs.pl b/utils/fst/remove_oovs.pl
new file mode 100644
index 00000000..bbf7e632
--- /dev/null
+++ b/utils/fst/remove_oovs.pl
@@ -0,0 +1,42 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+# This script removes lines that contain these OOVs on either the
+# third or fourth fields of the line. It is intended to remove arcs
+# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
+
+if ( @ARGV < 1 || @ARGV > 2) {
+  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
+}
+
+$unklist = shift @ARGV;
+open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
+while(<S>){
+  @A = split(" ", $_);
+  @A == 1 || die "Bad line in unknown-symbol list: $_";
+  $unk{$A[0]} = 1;
+}
+
+$num_removed = 0;
+while(<>){
+  @A = split(" ", $_);
+  if(defined $unk{$A[2]} || defined $unk{$A[3]}) {
+    $num_removed++;
+  } else {
+    print;
+  }
+}
+print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
diff --git a/utils/fst/rnnt_token_fst.py b/utils/fst/rnnt_token_fst.py
new file mode 100644
index 00000000..14376c3b
--- /dev/null
+++ b/utils/fst/rnnt_token_fst.py
@@ -0,0 +1,38 @@
+#!/usr/bin/env python3
+import argparse
+import sys
+
+
+def main(args):
+    # skip <blank> `token`
+    print('0 0 <blank> <eps>')
+
+    with open(args.token_file, 'r') as fin:
+        for entry in fin:
+            fields = entry.strip().split(' ')
+            phone = fields[0]
+            if phone == '<eps>' or phone == '<blank>':
+                continue
+            elif '#' in phone:
+                # disambiguation symbol:
+                # a sequence may end with a disambiguation `token`
+                print('{} {} {} {}'.format(0, 0, '<eps>', phone))
+            else:
+                # eating `token`
+                print('{} {} {} {}'.format(0, 0, phone, phone))
+
+    # final state
+    print('0')
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description='FST: RNN-T Token FST transducer')
+    parser.add_argument(
+        '--token_file',
+        required=True,
+        help='e2e model token file. line: token(char/phone/spm/disambiguation)')
+
+    args = parser.parse_args()
+    # log to stderr so the FST text on stdout stays clean for fstcompile
+    print(args, file=sys.stderr)
+
+    main(args)
diff --git a/utils/fst/s2eps.pl b/utils/fst/s2eps.pl
new file mode 100644
index 00000000..84d494e2
--- /dev/null
+++ b/utils/fst/s2eps.pl
@@ -0,0 +1,27 @@
+#!/usr/bin/env perl
+# Copyright 2010-2011 Microsoft Corporation
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
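+
+# For example (illustrative printed-FST arc), the line
+#   0 1 <s> <s>
+# is rewritten below as
+#   0 1 <eps> <eps>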
+
+# This script replaces <s> and </s> with <eps> (on both input and output sides),
+# for the G.fst acceptor.
+
+while(<>){
+  @A = split(" ", $_);
+  if ( @A >= 4 ) {
+    if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
+    if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
+  }
+  print join("\t", @A) . "\n";
+}
\ No newline at end of file
-- 
GitLab