PaddlePaddle / DeepSpeech

Commit cc434566, authored Mar 28, 2022 by Yang Zhou
Parent: 15f434a5

rm tools/fst

Showing 13 changed files with 0 additions and 851 deletions (+0 -851)
speechx/examples/aishell/tools/fst/add_lex_disambig.pl           +0 -195
speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh  +0 -86
speechx/examples/aishell/tools/fst/ctc_token_fst.py              +0 -24
speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py      +0 -21
speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py    +0 -55
speechx/examples/aishell/tools/fst/eps2disambig.pl               +0 -29
speechx/examples/aishell/tools/fst/make_lexicon_fst.pl           +0 -155
speechx/examples/aishell/tools/fst/make_tlg.sh                   +0 -38
speechx/examples/aishell/tools/fst/prepare_dict.py               +0 -64
speechx/examples/aishell/tools/fst/remove_oovs.pl                +0 -43
speechx/examples/aishell/tools/fst/rnnt_token_fst.py             +0 -17
speechx/examples/aishell/tools/fst/s2eps.pl                      +0 -27
speechx/examples/aishell/tools/parse_options.sh                  +0 -97
speechx/examples/aishell/tools/fst/add_lex_disambig.pl (deleted, mode 100755 → 0)
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# 2013-2016 Johns Hopkins University (author: Daniel Povey)
# 2015 Hainan Xu
# 2015 Guoguo Chen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Adds disambiguation symbols to a lexicon.
# Outputs still in the normal lexicon format.
# Disambig syms are numbered #1, #2, #3, etc. (#0
# reserved for symbol in grammar).
# Outputs the number of disambig syms to the standard output.
# With the --pron-probs option, expects the second field
# of each lexicon line to be a pron-prob.
# With the --sil-probs option, expects three additional
# fields after the pron-prob, representing various components
# of the silence probability model.
$pron_probs = 0;
$sil_probs = 0;
$first_allowed_disambig = 1;

for ($n = 1; $n <= 3 && @ARGV > 0; $n++) {
  if ($ARGV[0] eq "--pron-probs") {
    $pron_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--sil-probs") {
    $sil_probs = 1;
    shift @ARGV;
  }
  if ($ARGV[0] eq "--first-allowed-disambig") {
    $first_allowed_disambig = 0 + $ARGV[1];
    if ($first_allowed_disambig < 1) {
      die "add_lex_disambig.pl: invalid --first-allowed-disambig option: $first_allowed_disambig\n";
    }
    shift @ARGV;
    shift @ARGV;
  }
}

if (@ARGV != 2) {
  die "Usage: add_lex_disambig.pl [opts] <lexicon-in> <lexicon-out>\n" .
      "This script adds disambiguation symbols to a lexicon in order to\n" .
      "make decoding graphs determinizable; it adds pseudo-phone\n" .
      "disambiguation symbols #1, #2 and so on at the ends of phones\n" .
      "to ensure that all pronunciations are different, and that none\n" .
      "is a prefix of another.\n" .
      "It prints to the standard output the number of the largest-numbered " .
      "disambiguation symbol that was used.\n" .
      "\n" .
      "Options:  --pron-probs   Expect pronunciation probabilities in the 2nd field\n" .
      "          --sil-probs    [should be with --pron-probs option]\n" .
      "                         Expect 3 extra fields after the pron-probs, for aspects of\n" .
      "                         the silence probability model\n" .
      "          --first-allowed-disambig <n>   The number of the first disambiguation symbol\n" .
      "                         that this script is allowed to add. By default this is\n" .
      "                         #1, but you can set this to a larger value using this option.\n" .
      "e.g.:\n" .
      " add_lex_disambig.pl lexicon.txt lexicon_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs lexiconp.txt lexiconp_disambig.txt\n" .
      " add_lex_disambig.pl --pron-probs --sil-probs lexiconp_silprob.txt lexiconp_silprob_disambig.txt\n";
}

$lexfn = shift @ARGV;
$lexoutfn = shift @ARGV;

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

# (1) Read in the lexicon.
@L = ();
while (<L>) {
  @A = split(" ", $_);
  push @L, join(" ", @A);
}

# (2) Work out the count of each phone-sequence in the
# lexicon.
foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) {
    $p = shift @A;
    if (!($p > 0.0 && $p <= 1.0)) {
      die "Bad lexicon line $l (expecting pron-prob as second field)";
    }
  }
  if ($sil_probs) {
    $silp = shift @A;
    if (!($silp > 0.0 && $silp <= 1.0)) {
      die "Bad lexicon line $l for silprobs";
    }
    $correction = shift @A;
    if ($correction <= 0.0) {
      die "Bad lexicon line $l for silprobs";
    }
    $correction = shift @A;
    if ($correction <= 0.0) {
      die "Bad lexicon line $l for silprobs";
    }
  }
  if (!(@A)) {
    die "Bad lexicon line $1, no phone in phone list";
  }
  $count{join(" ", @A)}++;
}

# (3) For each left sub-sequence of each phone-sequence, note down
# that it exists (for identifying prefixes of longer strings).
foreach $l (@L) {
  @A = split(" ", $l);
  shift @A; # Remove word.
  if ($pron_probs) {
    shift @A; # remove pron-prob.
  }
  if ($sil_probs) {
    shift @A; # Remove silprob
    shift @A; # Remove silprob
  }
  while (@A > 0) {
    pop @A; # Remove last phone
    $issubseq{join(" ", @A)} = 1;
  }
}

# (4) For each entry in the lexicon:
# if the phone sequence is unique and is not a
# prefix of another word, no disambig symbol.
# Else output #1, or #2, #3, ... if the same phone-seq
# has already been assigned a disambig symbol.

open(O, ">$lexoutfn") || die "Opening lexicon file $lexoutfn for writing.\n";

# max_disambig will always be the highest-numbered disambiguation symbol that
# has been used so far.
$max_disambig = $first_allowed_disambig - 1;

foreach $l (@L) {
  @A = split(" ", $l);
  $word = shift @A;
  if ($pron_probs) {
    $pron_prob = shift @A;
  }
  if ($sil_probs) {
    $sil_word_prob = shift @A;
    $word_sil_correction = shift @A;
    $prev_nonsil_correction = shift @A
  }
  $phnseq = join(" ", @A);
  if (!defined $issubseq{$phnseq} && $count{$phnseq} == 1) {
    ; # Do nothing.
  } else {
    if ($phnseq eq "") {
      # need disambig symbols for the empty string
      # that are not used anywhere else.
      $max_disambig++;
      $reserved_for_the_empty_string{$max_disambig} = 1;
      $phnseq = "#$max_disambig";
    } else {
      $cur_disambig = $last_used_disambig_symbol_of{$phnseq};
      if (!defined $cur_disambig) {
        $cur_disambig = $first_allowed_disambig;
      } else {
        $cur_disambig++; # Get a number that has not been used yet for
                         # this phone sequence.
      }
      while (defined $reserved_for_the_empty_string{$cur_disambig}) {
        $cur_disambig++;
      }
      if ($cur_disambig > $max_disambig) {
        $max_disambig = $cur_disambig;
      }
      $last_used_disambig_symbol_of{$phnseq} = $cur_disambig;
      $phnseq = $phnseq . " #" . $cur_disambig;
    }
  }
  if ($pron_probs) {
    if ($sil_probs) {
      print O "$word\t$pron_prob\t$sil_word_prob\t$word_sil_correction\t$prev_nonsil_correction\t$phnseq\n";
    } else {
      print O "$word\t$pron_prob\t$phnseq\n";
    }
  } else {
    print O "$word\t$phnseq\n";
  }
}

print $max_disambig . "\n";
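The core disambiguation rule above can be sketched compactly in Python (a hypothetical helper for illustration only, not part of this repository; it covers only the plain-lexicon case, without --pron-probs, --sil-probs, or empty pronunciations): a pronunciation gets a disambiguation symbol if it is repeated or is a prefix of a longer pronunciation, and repeated sequences receive increasing numbers #1, #2, ...

```python
from collections import Counter


def add_disambig(lexicon):
    """lexicon: list of (word, [phones]).
    Returns the same entries with '#N' symbols appended where needed so that
    no two entries share a pronunciation and none is a prefix of another."""
    counts = Counter(tuple(phones) for _, phones in lexicon)
    # Every proper left prefix of every pronunciation.
    prefixes = set()
    for _, phones in lexicon:
        for i in range(len(phones)):
            prefixes.add(tuple(phones[:i]))
    last_used = {}  # phone-sequence -> last disambig number assigned to it
    out = []
    for word, phones in lexicon:
        seq = tuple(phones)
        if counts[seq] == 1 and seq not in prefixes:
            out.append((word, list(phones)))  # unique, not a prefix: no symbol
        else:
            n = last_used.get(seq, 0) + 1     # next unused number for this seq
            last_used[seq] = n
            out.append((word, list(phones) + ['#%d' % n]))
    return out
```

This mirrors steps (2)-(4) of the Perl script: count phone sequences, mark prefixes, then assign symbols per entry.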
speechx/examples/aishell/tools/fst/compile_lexicon_token_fst.sh (deleted, mode 100755 → 0)
#!/bin/bash
# Copyright 2015 Yajie Miao (Carnegie Mellon University)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script compiles the lexicon and CTC tokens into FSTs. FST compiling slightly differs between the
# phoneme and character-based lexicons.
set -eo pipefail

. tools/parse_options.sh

if [ $# -ne 3 ]; then
  echo "usage: tools/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>"
  echo "e.g.: tools/fst/compile_lexicon_token_fst.sh data/local/dict data/local/lang_tmp data/lang"
  echo "<dict-src-dir> should contain the following files:"
  echo "lexicon.txt lexicon_numbers.txt units.txt"
  echo "options: "
  exit 1;
fi

srcdir=$1
tmpdir=$2
dir=$3
mkdir -p $dir $tmpdir

[ -f path.sh ] && . ./path.sh

cp $srcdir/units.txt $dir

# Add probabilities to lexicon entries. There is in fact no point of doing this here since all the entries have 1.0.
# But utils/make_lexicon_fst.pl requires a probabilistic version, so we just leave it as it is.
perl -ape 's/(\S+\s+)(.+)/${1}1.0\t$2/;' < $srcdir/lexicon.txt > $tmpdir/lexiconp.txt || exit 1;

# Add disambiguation symbols to the lexicon. This is necessary for determinizing the composition of L.fst and G.fst.
# Without these symbols, determinization will fail.
ndisambig=`tools/fst/add_lex_disambig.pl $tmpdir/lexiconp.txt $tmpdir/lexiconp_disambig.txt`
ndisambig=$[$ndisambig+1];

( for n in `seq 0 $ndisambig`; do echo '#'$n; done ) > $tmpdir/disambig.list

# Get the full list of CTC tokens used in FST. These tokens include <eps>, the blank <blk>,
# the actual model unit, and the disambiguation symbols.
cat $srcdir/units.txt | awk '{print $1}' > $tmpdir/units.list
(echo '<eps>';) | cat - $tmpdir/units.list $tmpdir/disambig.list | awk '{print $1 " " (NR-1)}' > $dir/tokens.txt

# ctc_token_fst_corrected is too big and too slow for character based chinese modeling,
# so here just use simple ctc_token_fst
tools/fst/ctc_token_fst.py $dir/tokens.txt | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/tokens.txt \
    --keep_isymbols=false --keep_osymbols=false | \
  fstarcsort --sort_type=olabel > $dir/T.fst || exit 1;

# Encode the words with indices. Will be used in lexicon and language model FST compiling.
cat $tmpdir/lexiconp.txt | awk '{print $1}' | sort | awk '
  BEGIN {
    print "<eps> 0";
  }
  {
    printf("%s %d\n", $1, NR);
  }
  END {
    printf("#0 %d\n", NR+1);
    printf("<s> %d\n", NR+2);
    printf("</s> %d\n", NR+3);
  }' > $dir/words.txt || exit 1;

# Now compile the lexicon FST. Depending on the size of your lexicon, it may take some time.
token_disambig_symbol=`grep \#0 $dir/tokens.txt | awk '{print $2}'`
word_disambig_symbol=`grep \#0 $dir/words.txt | awk '{print $2}'`

tools/fst/make_lexicon_fst.pl --pron-probs $tmpdir/lexiconp_disambig.txt 0 "sil" '#'$ndisambig | \
  fstcompile --isymbols=$dir/tokens.txt --osymbols=$dir/words.txt \
    --keep_isymbols=false --keep_osymbols=false | \
  fstaddselfloops "echo $token_disambig_symbol |" "echo $word_disambig_symbol |" | \
  fstarcsort --sort_type=olabel > $dir/L.fst || exit 1;

echo "Lexicon and token FSTs compiling succeeded"
speechx/examples/aishell/tools/fst/ctc_token_fst.py (deleted, mode 100755 → 0)
#!/usr/bin/env python
import sys

print('0 1 <eps> <eps>')
print('1 1 <blank> <eps>')
print('2 2 <blank> <eps>')
print('2 0 <eps> <eps>')

with open(sys.argv[1], 'r') as fin:
    node = 3
    for entry in fin:
        fields = entry.strip().split(' ')
        phone = fields[0]
        if phone == '<eps>' or phone == '<blank>':
            continue
        elif '#' in phone:  # disambiguous phone
            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
        else:
            print('{} {} {} {}'.format(1, node, phone, phone))
            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
            print('{} {} {} {}'.format(node, 2, '<eps>', '<eps>'))
            node += 1
print('0')
speechx/examples/aishell/tools/fst/ctc_token_fst_compact.py (deleted, mode 100755 → 0)
#!/usr/bin/env python
import sys

print('0 0 <blank> <eps>')

with open(sys.argv[1], 'r', encoding='utf8') as fin:
    node = 1
    for entry in fin:
        fields = entry.strip().split(' ')
        phone = fields[0]
        if phone == '<eps>' or phone == '<blank>':
            continue
        elif '#' in phone:  # disambiguous phone
            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
        else:
            print('{} {} {} {}'.format(0, node, phone, phone))
            print('{} {} {} {}'.format(node, node, phone, '<eps>'))
            print('{} {} {} {}'.format(node, 0, '<eps>', '<eps>'))
            node += 1
print('0')
speechx/examples/aishell/tools/fst/ctc_token_fst_corrected.py (deleted, mode 100755 → 0)
#!/usr/bin/env python
import sys


def il(n):
    return n + 1


def ol(n):
    return n + 1


def s(n):
    return n


if __name__ == "__main__":
    with open(sys.argv[1]) as f:
        lines = f.readlines()
    phone_count = 0
    disambig_count = 0
    for line in lines:
        sp = line.split()
        phone = sp[0]
        if phone == '<eps>' or phone == '<blank>':
            continue
        if phone.startswith('#'):
            disambig_count += 1
        else:
            phone_count += 1

    # 1. add start state
    print('0 0 {} 0'.format(il(0)))

    # 2. 0 -> i, i -> i, i -> 0
    for i in range(1, phone_count + 1):
        print('0 {} {} {}'.format(s(i), il(i), ol(i)))
        print('{} {} {} 0'.format(s(i), s(i), il(i)))
        print('{} 0 {} 0'.format(s(i), il(0)))

    # 3. i -> other phone
    for i in range(1, phone_count + 1):
        for j in range(1, phone_count + 1):
            if i != j:
                print('{} {} {} {}'.format(s(i), s(j), il(j), ol(j)))

    # 4. add disambiguous arcs on every final state
    for i in range(0, phone_count + 1):
        for j in range(phone_count + 2, phone_count + disambig_count + 2):
            print('{} {} {} {}'.format(s(i), s(i), 0, j))

    # 5. every i is final state
    for i in range(0, phone_count + 1):
        print(s(i))
speechx/examples/aishell/tools/fst/eps2disambig.pl (deleted, mode 100755 → 0)
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# 2015 Guoguo Chen
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script replaces epsilon with #0 on the input side only, of the G.fst
# acceptor.
while(<>){
  if (/\s+#0\s+/) {
    print STDERR "$0: ERROR: LM has word #0, " .
      "which is reserved as disambiguation symbol\n";
    exit 1;
  }
  s:^(\d+\s+\d+\s+)\<eps\>(\s+):$1#0$2:;
  print;
}
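For illustration, the same substitution expressed in Python (a hypothetical equivalent of eps2disambig.pl, not part of the repository, assuming printed-FST arcs come one per line): the input label (third field) <eps> becomes the reserved word #0, while output labels and weights are untouched.

```python
import re


def eps2disambig(line):
    """Rewrite <eps> to #0 on the input side of one printed G.fst arc line."""
    # Matches "<src> <dst> <eps> ..." and replaces only that third field.
    return re.sub(r'^(\d+\s+\d+\s+)<eps>(\s+)', r'\1#0\2', line)
```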
speechx/examples/aishell/tools/fst/make_lexicon_fst.pl (deleted, mode 100755 → 0)
#!/usr/bin/env perl
use warnings; #sed replacement for -w perl parameter
# Copyright 2010-2011 Microsoft Corporation
# 2013 Johns Hopkins University (author: Daniel Povey)
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# makes lexicon FST, in text form, from lexicon (pronunciation probabilities optional).
$pron_probs = 0;

if ((@ARGV > 0) && ($ARGV[0] eq "--pron-probs")) {
  $pron_probs = 1;
  shift @ARGV;
}

if (@ARGV != 1 && @ARGV != 3 && @ARGV != 4) {
  print STDERR "Usage: make_lexicon_fst.pl [--pron-probs] lexicon.txt [silprob silphone [sil_disambig_sym]] >lexiconfst.txt\n\n";
  print STDERR "Creates a lexicon FST that transduces phones to words, and may allow optional silence.\n\n";
  print STDERR "Note: ordinarily, each line of lexicon.txt is:\n";
  print STDERR "word phone1 phone2 ... phoneN;\n";
  print STDERR "if the --pron-probs option is used, each line is:\n";
  print STDERR "word pronunciation-probability phone1 phone2 ... phoneN.\n\n";
  print STDERR "The probability 'prob' will typically be between zero and one, and note that\n";
  print STDERR "it's generally helpful to normalize so the largest one for each word is 1.0, but\n";
  print STDERR "this is your responsibility.\n\n";
  print STDERR "The silence disambiguation symbol, e.g. something like #5, is used only\n";
  print STDERR "when creating a lexicon with disambiguation symbols, e.g. L_disambig.fst,\n";
  print STDERR "and was introduced to fix a particular case of non-determinism of decoding graphs.\n\n";
  exit(1);
}

$lexfn = shift @ARGV;
if (@ARGV == 0) {
  $silprob = 0.0;
} elsif (@ARGV == 2) {
  ($silprob, $silphone) = @ARGV;
} else {
  ($silprob, $silphone, $sildisambig) = @ARGV;
}
if ($silprob != 0.0) {
  $silprob < 1.0 || die "Sil prob cannot be >= 1.0";
  $silcost = -log($silprob);
  $nosilcost = -log(1.0 - $silprob);
}

open(L, "<$lexfn") || die "Error opening lexicon $lexfn";

if ($silprob == 0.0) {
  # No optional silences: just have one (loop+final) state which is numbered zero.
  $loopstate = 0;
  $nextstate = 1; # next unallocated state.
  while (<L>) {
    @A = split(" ", $_);
    @A == 0 && die "Empty lexicon line.";
    foreach $a (@A) {
      if ($a eq "<eps>") {
        die "Bad lexicon line $_ (<eps> is forbidden)";
      }
    }
    $w = shift @A;
    if (!$pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (!defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) {
      $pron_cost_string = "\t$pron_cost";
    } else {
      $pron_cost_string = "";
    }

    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        $ns = $nextstate++;
      } else {
        $ns = $loopstate;
      }
      print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
      $word_or_eps = "<eps>";
      $pron_cost_string = ""; # so we only print it on the first arc of the word.
      $s = $ns;
    }
  }
  print "$loopstate\t0\n"; # final-cost.
} else {
  # have silence probs.
  $startstate = 0;
  $loopstate = 1;
  $silstate = 2; # state from where we go to loopstate after emitting silence.
  print "$startstate\t$loopstate\t<eps>\t<eps>\t$nosilcost\n"; # no silence.
  if (!defined $sildisambig) {
    print "$startstate\t$loopstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$loopstate\t$silphone\t<eps>\n"; # no cost.
    $nextstate = 3;
  } else {
    $disambigstate = 3;
    $nextstate = 4;
    print "$startstate\t$disambigstate\t$silphone\t<eps>\t$silcost\n"; # silence.
    print "$silstate\t$disambigstate\t$silphone\t<eps>\n"; # no cost.
    print "$disambigstate\t$loopstate\t$sildisambig\t<eps>\n"; # silence disambiguation symbol.
  }
  while (<L>) {
    @A = split(" ", $_);
    $w = shift @A;
    if (!$pron_probs) {
      $pron_cost = 0.0;
    } else {
      $pron_prob = shift @A;
      if (!defined $pron_prob || !($pron_prob > 0.0 && $pron_prob <= 1.0)) {
        die "Bad pronunciation probability in line $_";
      }
      $pron_cost = -log($pron_prob);
    }
    if ($pron_cost != 0.0) {
      $pron_cost_string = "\t$pron_cost";
    } else {
      $pron_cost_string = "";
    }
    $s = $loopstate;
    $word_or_eps = $w;
    while (@A > 0) {
      $p = shift @A;
      if (@A > 0) {
        $ns = $nextstate++;
        print "$s\t$ns\t$p\t$word_or_eps$pron_cost_string\n";
        $word_or_eps = "<eps>";
        $pron_cost_string = "";
        $pron_cost = 0.0; # so we only print it the 1st time.
        $s = $ns;
      } elsif (!defined($silphone) || $p ne $silphone) {
        # This is non-deterministic but relatively compact,
        # and avoids epsilons.
        $local_nosilcost = $nosilcost + $pron_cost;
        $local_silcost = $silcost + $pron_cost;
        print "$s\t$loopstate\t$p\t$word_or_eps\t$local_nosilcost\n";
        print "$s\t$silstate\t$p\t$word_or_eps\t$local_silcost\n";
      } else {
        # no point putting opt-sil after silence word.
        print "$s\t$loopstate\t$p\t$word_or_eps$pron_cost_string\n";
      }
    }
  }
  print "$loopstate\t0\n"; # final-cost.
}
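The no-optional-silence branch of make_lexicon_fst.pl reduces to a short loop; here is a minimal Python sketch (a hypothetical helper for illustration, not part of the repository): a single loop state 0, a left-to-right chain of states per pronunciation, and the word label only on the first arc.

```python
def lexicon_fst_arcs(lexicon):
    """lexicon: list of (word, [phones]).
    Returns OpenFst text-format lines for the no-silence lexicon FST."""
    lines = []
    next_state = 1                        # next unallocated state
    for word, phones in lexicon:
        s, out = 0, word
        for i, p in enumerate(phones):
            last = (i + 1 == len(phones))
            ns = 0 if last else next_state  # last arc returns to the loop state
            if not last:
                next_state += 1
            lines.append("%d\t%d\t%s\t%s" % (s, ns, p, out))
            out = "<eps>"                 # word label only on the first arc
            s = ns
    lines.append("0")                     # state 0 is final
    return lines
```

Composed with a grammar FST, this transduces phone sequences back to word sequences, which is exactly the role of L.fst in the TLG graph built by make_tlg.sh below.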
speechx/examples/aishell/tools/fst/make_tlg.sh (deleted, mode 100755 → 0)
#!/bin/bash
#
if [ -f path.sh ]; then . path.sh; fi

lm_dir=$1
src_lang=$2
tgt_lang=$3

arpa_lm=${lm_dir}/lm.arpa

[ ! -f $arpa_lm ] && echo No such file $arpa_lm && exit 1;

rm -rf $tgt_lang
cp -r $src_lang $tgt_lang

# Compose the language model to FST
cat $arpa_lm | \
  grep -v '<s> <s>' | \
  grep -v '</s> <s>' | \
  grep -v '</s> </s>' | \
  grep -v -i '<unk>' | \
  grep -v -i '<spoken_noise>' | \
  arpa2fst --read-symbol-table=$tgt_lang/words.txt --keep-symbols=true - | fstprint | \
  tools/fst/eps2disambig.pl | tools/fst/s2eps.pl | fstcompile --isymbols=$tgt_lang/words.txt \
    --osymbols=$tgt_lang/words.txt --keep_isymbols=false --keep_osymbols=false | \
  fstrmepsilon | fstarcsort --sort_type=ilabel > $tgt_lang/G.fst

echo "Checking how stochastic G is (the first of these numbers should be small):"
fstisstochastic $tgt_lang/G.fst

# Compose the token, lexicon and language-model FST into the final decoding graph
fsttablecompose $tgt_lang/L.fst $tgt_lang/G.fst | fstdeterminizestar --use-log=true | \
  fstminimizeencoded | fstarcsort --sort_type=ilabel > $tgt_lang/LG.fst || exit 1;
fsttablecompose $tgt_lang/T.fst $tgt_lang/LG.fst > $tgt_lang/TLG.fst || exit 1;

echo "Composing decoding graph TLG.fst succeeded"
#rm -r $tgt_lang/LG.fst # We don't need to keep this intermediate FST
speechx/examples/aishell/tools/fst/prepare_dict.py (deleted, mode 100755 → 0)
#!/usr/bin/env python3
# encoding: utf-8
import sys

# sys.argv[1]: e2e model unit file(lang_char.txt)
# sys.argv[2]: raw lexicon file
# sys.argv[3]: output lexicon file
# sys.argv[4]: bpemodel

unit_table = set()
with open(sys.argv[1], 'r', encoding='utf8') as fin:
    for line in fin:
        unit = line.split()[0]
        unit_table.add(unit)


def contain_oov(units):
    for unit in units:
        if unit not in unit_table:
            return True
    return False


bpemode = len(sys.argv) > 4
if bpemode:
    import sentencepiece as spm
    sp = spm.SentencePieceProcessor()
    sp.Load(sys.argv[4])

lexicon_table = set()
with open(sys.argv[2], 'r', encoding='utf8') as fin, \
        open(sys.argv[3], 'w', encoding='utf8') as fout:
    for line in fin:
        word = line.split()[0]
        if word == 'SIL' and not bpemode:  # `sil` might be a valid piece in bpemodel
            continue
        elif word == '<SPOKEN_NOISE>':
            continue
        else:
            # each word only has one pronunciation for e2e system
            if word in lexicon_table:
                continue
            if bpemode:
                pieces = sp.EncodeAsPieces(word)
                if contain_oov(pieces):
                    print('Ignoring words {}, which contains oov unit'.format(
                        ''.join(word).strip('▁')))
                    continue
                chars = ' '.join(
                    [p if p in unit_table else '<unk>' for p in pieces])
            else:
                # ignore words with OOV
                if contain_oov(word):
                    print('Ignoring words {}, which contains oov unit'.format(word))
                    continue
                # Optional, append ▁ in front of english word
                # we assume the model unit of our e2e system is char now.
                if word.encode('utf8').isalpha() and '▁' in unit_table:
                    word = '▁' + word
                chars = ' '.join(word)  # word is a char list
            fout.write('{} {}\n'.format(word, chars))
            lexicon_table.add(word)
speechx/examples/aishell/tools/fst/remove_oovs.pl (deleted, mode 100755 → 0)
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script removes lines that contain these OOVs on either the
# third or fourth fields of the line. It is intended to remove arcs
# with OOVs on, from FSTs (probably compiled from ARPAs with OOVs in).
if (@ARGV < 1 && @ARGV > 2) {
  die "Usage: remove_oovs.pl unk_list.txt [ printed-fst ]\n";
}

$unklist = shift @ARGV;
open(S, "<$unklist") || die "Failed opening unknown-symbol list $unklist\n";
while(<S>){
  @A = split(" ", $_);
  @A == 1 || die "Bad line in unknown-symbol list: $_";
  $unk{$A[0]} = 1;
}

$num_removed = 0;
while(<>){
  @A = split(" ", $_);
  if (defined $unk{$A[2]} || defined $unk{$A[3]}) {
    $num_removed++;
  } else {
    print;
  }
}
print STDERR "remove_oovs.pl: removed $num_removed lines.\n";
speechx/examples/aishell/tools/fst/rnnt_token_fst.py (deleted, mode 100755 → 0)
#!/usr/bin/env python
import sys

print('0 0 <blank> <eps>')

with open(sys.argv[1], 'r', encoding='utf8') as fin:
    for entry in fin:
        fields = entry.strip().split(' ')
        phone = fields[0]
        if phone == '<eps>' or phone == '<blank>':
            continue
        elif '#' in phone:  # disambiguous phone
            print('{} {} {} {}'.format(0, 0, '<eps>', phone))
        else:
            print('{} {} {} {}'.format(0, 0, phone, phone))
print('0')
speechx/examples/aishell/tools/fst/s2eps.pl (deleted, mode 100755 → 0)
#!/usr/bin/env perl
# Copyright 2010-2011 Microsoft Corporation
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# This script replaces <s> and </s> with <eps> (on both input and output sides),
# for the G.fst acceptor.
while(<>){
  @A = split(" ", $_);
  if (@A >= 4) {
    if ($A[2] eq "<s>" || $A[2] eq "</s>") { $A[2] = "<eps>"; }
    if ($A[3] eq "<s>" || $A[3] eq "</s>") { $A[3] = "<eps>"; }
  }
  print join("\t", @A) . "\n";
}
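The same field rewrite in Python, for comparison (a hypothetical equivalent of s2eps.pl, not part of the repository): on each printed-FST arc with at least four fields, both the input label (field 3) and the output label (field 4) are mapped from <s> or </s> to <eps>.

```python
def s2eps(line):
    """Rewrite <s> and </s> to <eps> on the label fields of one arc line."""
    fields = line.split()
    if len(fields) >= 4:
        for i in (2, 3):  # input and output label positions
            if fields[i] in ("<s>", "</s>"):
                fields[i] = "<eps>"
    return "\t".join(fields)  # the Perl script also rejoins with tabs
```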
speechx/examples/aishell/tools/parse_options.sh (deleted, mode 100755 → 0)
#!/bin/bash
# Copyright 2012 Johns Hopkins University (Author: Daniel Povey);
# Arnab Ghoshal, Karel Vesely
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.
# Parse command-line options.
# To be sourced by another script (as in ". parse_options.sh").
# Option format is: --option-name arg
# and shell variable "option_name" gets set to value "arg."
# The exception is --help, which takes no arguments, but prints the
# $help_message variable (if defined).
###
### The --config file options have lower priority to command line
### options, so we need to import them first...
###
# Now import all the configs specified by command-line, in left-to-right order
for ((argpos=1; argpos<$#; argpos++)); do
  if [ "${!argpos}" == "--config" ]; then
    argpos_plus1=$((argpos+1))
    config=${!argpos_plus1}
    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
    . $config  # source the config file.
  fi
done

###
### Now we process the command line options
###
while true; do
  [ -z "${1:-}" ] && break;  # break if there are no arguments
  case "$1" in
    # If the enclosing script is called with --help option, print the help
    # message and exit. Scripts should put help messages in $help_message
    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
      else printf "$help_message\n" 1>&2; fi;
      exit 0 ;;
    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
      exit 1 ;;
    # If the first command-line argument begins with "--" (e.g. --foo-bar),
    # then work out the variable name as $name, which will equal "foo_bar".
    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
      # Next we test whether the variable in question is undefined-- if so it's
      # an invalid option and we die. Note: $0 evaluates to the name of the
      # enclosing script.
      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
      # is undefined. We then have to wrap this test inside "eval" because
      # foo_bar is itself inside a variable ($name).
      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;

      oldval="`eval echo \\$$name`";
      # Work out whether we seem to be expecting a Boolean argument.
      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
        was_bool=true;
      else
        was_bool=false;
      fi

      # Set the variable to the right value-- the escaped quotes make it work if
      # the option had spaces, like --cmd "queue.pl -sync y"
      eval $name=\"$2\";

      # Check that Boolean-valued arguments are really Boolean.
      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
        exit 1;
      fi
      shift 2;
      ;;
    *) break;
  esac
done

# Check for an empty argument to the --cmd option, which can easily occur as a
# result of scripting errors.
[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;

true; # so this script returns exit code 0.
登录