Merge pull request #854 from PaddlePaddle/space

space as <space>

Merge pull request #854 from PaddlePaddle/space
space as <space>
2da198b7 · Hui Zhang · GitHub · 4d1ce107 · 30563981 · 2da198b7
5 changed files
--- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
+++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
@@ -28,6 +28,7 @@
 #include "path_trie.h"
 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
+// Vocabulary token that stands for the space character; must match the
+// SPACE constant used by the Python text featurizer ("<space>").
+const std::string kSPACE = "<space>";
 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
    const std::vector<std::vector<double>> &probs_seq,
@@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
                       "The shape of probs_seq does not match with "
                       "the shape of the vocabulary");
    }
-    // assign blank id
-    // size_t blank_id = vocabulary.size();
-    // size_t blank_id = 0;
    // assign space id
-    auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
+    auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
    int space_id = it - vocabulary.begin();
    // if no space in vocabulary
    if ((size_t)space_id >= vocabulary.size()) {

--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@@ -16,6 +16,7 @@ import sentencepiece as spm
 from ..utility import EOS
 from ..utility import load_dict
+from ..utility import SPACE
 from ..utility import UNK
 __all__ = ["TextFeaturizer"]
@@ -53,9 +54,9 @@ class TextFeaturizer():
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(spm_model)
-    def tokenize(self, text):
+    def tokenize(self, text, replace_space=True):
        if self.unit_type == 'char':
-            tokens = self.char_tokenize(text)
+            tokens = self.char_tokenize(text, replace_space)
        elif self.unit_type == 'word':
            tokens = self.word_tokenize(text)
        else:  # spm
@@ -105,16 +106,20 @@ class TextFeaturizer():
        text = self.detokenize(tokens)
        return text
-    def char_tokenize(self, text):
+    def char_tokenize(self, text, replace_space=True):
        """Character tokenizer.
        Args:
            text (str): text string.
+            replace_space (bool): if True, every " " character is emitted
+                as the single `SPACE` token. False only used by
+                build_vocab.py.
        Returns:
            List[str]: tokens.
        """
-        return list(text.strip())
+        # Split into characters BEFORE substituting: calling list() on a
+        # string that already contains SPACE would break "<space>" into
+        # seven one-character tokens.
+        tokens = list(text.strip())
+        if replace_space:
+            tokens = [SPACE if ch == " " else ch for ch in tokens]
+        return tokens
    def char_detokenize(self, tokens):
        """Character detokenizer.
@@ -125,6 +130,7 @@ class TextFeaturizer():
        Returns:
           str: text string.
        """
+        # A list of tokens (as returned by char_tokenize) has no
+        # `.replace`, so join first and then map SPACE back to " ".
+        # After this line `tokens` is a str, which the join below
+        # returns unchanged.
+        tokens = "".join(tokens).replace(SPACE, " ")
        return "".join(tokens)
    def word_tokenize(self, text):
@@ -191,17 +197,14 @@ class TextFeaturizer():
        """Load vocabulary from file."""
        vocab_list = load_dict(vocab_filepath, maskctc)
        assert vocab_list is not None
+        assert SPACE in vocab_list
        id2token = dict(
            [(idx, token) for (idx, token) in enumerate(vocab_list)])
        token2id = dict(
            [(token, idx) for (idx, token) in enumerate(vocab_list)])
-        if UNK in vocab_list:
-            unk_id = vocab_list.index(UNK)
+        unk_id = vocab_list.index(UNK) if UNK in vocab_list else -1
-        else:
+        eos_id = vocab_list.index(EOS) if EOS in vocab_list else -1
-            unk_id = -1
-        if EOS in vocab_list:
-            eos_id = vocab_list.index(EOS)
-        else:
-            eos_id = -1
        return token2id, id2token, vocab_list, unk_id, eos_id
--- a/deepspeech/frontend/utility.py
+++ b/deepspeech/frontend/utility.py
@@ -28,7 +28,7 @@ logger = Log(__name__).getlog()
 __all__ = [
    "load_dict", "load_cmvn", "read_manifest", "rms_to_db", "rms_to_dbfs",
    "max_dbfs", "mean_dbfs", "gain_db_to_ratio", "normalize_audio", "SOS",
-    "EOS", "UNK", "BLANK", "MASKCTC"
+    "EOS", "UNK", "BLANK", "MASKCTC", "SPACE"
 ]
 IGNORE_ID = -1
@@ -38,6 +38,7 @@ EOS = SOS
 UNK = "<unk>"
 BLANK = "<blank>"
 MASKCTC = "<mask>"
+SPACE = "<space>"
 def load_dict(dict_path: Optional[Text], maskctc=False) -> Optional[List[Text]]:

--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -25,6 +25,7 @@ from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 from deepspeech.frontend.utility import BLANK
 from deepspeech.frontend.utility import read_manifest
 from deepspeech.frontend.utility import SOS
+from deepspeech.frontend.utility import SPACE
 from deepspeech.frontend.utility import UNK
 from deepspeech.utils.utility import add_arguments
 from deepspeech.utils.utility import print_arguments
@@ -60,7 +61,7 @@ args = parser.parse_args()
 def count_manifest(counter, text_feature, manifest_path):
+    # Tokenize the 'text' field of every manifest entry and add the tokens
+    # to `counter`.
    manifest_jsons = read_manifest(manifest_path)
    for line_json in manifest_jsons:
-        line = text_feature.tokenize(line_json['text'])
+        # replace_space=False keeps ' ' as-is while counting; main() maps
+        # the ' ' token to SPACE when it builds the vocabulary list.
+        line = text_feature.tokenize(line_json['text'], replace_space=False)
        counter.update(line)
 def dump_text_manifest(fileobj, manifest_path, key='text'):
@@ -109,6 +110,8 @@ def main():
    for token, count in count_sorted:
        if count < args.count_threshold:
            break
+        # replace space by `<space>`
+        token = SPACE if token == ' ' else token
        tokens.append(token)
    tokens = sorted(tokens)

--- a/utils/split_scp.pl
+++ b/utils/split_scp.pl
+#!/usr/bin/env perl
+use warnings; #sed replacement for -w perl parameter
+# Copyright 2010-2011 Microsoft Corporation
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text file and
+# will split it up with an approximately equal number of lines in
+# each output file.
+# With the --utt2spk option it will work on anything that has the 
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can.  If you use
+# the utt2spk option it will make sure these chunks coincide with
+# speaker boundaries.  In this case, if there are more chunks
+# than speakers (and in some other circumstances), some of the 
+# resulting  chunks will be empty and it
+# will print a warning.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+# ---- Command-line option parsing ----
+# $num_jobs == 0 means "-j" was not given (explicit output list mode).
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+# The two options may come in either order, so inspect the front of @ARGV
+# twice.
+for ($x = 1; $x <= 2; $x++) {
+    # Guard on @ARGV: with no arguments left, $ARGV[0] is undef and the
+    # comparison would emit "uninitialized value" warnings before the
+    # usage message below is reached.
+    if (@ARGV > 0 && $ARGV[0] eq "-j") {
+        shift @ARGV;
+        if (@ARGV < 2) {
+            die "Option -j requires two arguments: num-jobs and job-id";
+        }
+        $num_jobs = shift @ARGV;
+        $job_id = shift @ARGV;
+        if ($num_jobs <= 0 || $job_id < 0 || $job_id >= $num_jobs) {
+            die "Invalid num-jobs and job-id: $num_jobs and $job_id";
+        }
+    }
+    if (@ARGV > 0 && $ARGV[0] =~ "--utt2spk=(.+)") {
+        $utt2spk_file=$1;
+        shift;
+    }
+}
+# Without -j: need the input plus at least one output.
+# With -j: need the input and at most one output.
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+    die "Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ... \n" .
+        " or: split_scp.pl -j num-jobs job-id [--utt2spk=<utt2spk_file>] in.scp [out.scp]\n" .
+        " ... where 0 <= job-id < num-jobs.";
+}
+# The first remaining argument is the input file; whatever follows it
+# names the output(s).
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+    @OUTPUTS = @ARGV;
+} else {
+    # With -j: build one output slot per job. Only the slot for $job_id is
+    # a real destination (the given file, or "-" when none was given);
+    # every other slot is /dev/null, so the splitting code that follows
+    # treats both modes the same way.
+    for ($j = 0; $j < $num_jobs; $j++) {
+        if ($j == $job_id) { 
+            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+            else { push @OUTPUTS, "-"; }
+        } else {
+            push @OUTPUTS, "/dev/null";
+        }
+    }
+} 
+# ---- Do the split ----
+# Note: the 2-argument form of open() is kept on purpose for the outputs,
+# because an output name of "-" (used in -j mode) relies on it to mean
+# standard output.
+if ($utt2spk_file ne "") {  # We have the --utt2spk option...
+    # Read the utterance -> speaker map.
+    open(U, "<$utt2spk_file") || die "Failed to open utt2spk file $utt2spk_file";
+    while(<U>) {
+        @A = split;
+        @A == 2 || die "Bad line $_ in utt2spk file $utt2spk_file";
+        ($u,$s) = @A;
+        $utt2spk{$u} = $s;
+    }
+    close(U);
+    # Group the input lines by speaker, remembering the order in which
+    # speakers first appear.
+    open(I, "<$inscp") || die "Opening input scp file $inscp";
+    @spkrs = ();
+    while(<I>) {
+        @A = split;
+        if(@A == 0) { die "Empty or space-only line in scp file $inscp"; }
+        $u = $A[0];
+        $s = $utt2spk{$u};
+        if(!defined $s) { die "No such utterance $u in utt2spk file $utt2spk_file"; }
+        if(!defined $spk_count{$s}) {
+            push @spkrs, $s;
+            $spk_count{$s} = 0;
+            $spk_data{$s} = "";
+        }
+        $spk_count{$s}++;
+        $spk_data{$s} = $spk_data{$s} . $_;
+    }
+    close(I);
+    # Now split as equally as possible ..
+    # First allocate spks to files by allocating an approximately
+    # equal number of speakers.
+    $numspks = @spkrs;  # number of speakers.
+    $numscps = @OUTPUTS; # number of output files.
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scparray[$scpidx] = []; # [] is array reference.
+        # Start every count at 0 so that outputs which receive no speaker
+        # (more outputs than speakers) do not feed undef into the
+        # balancing arithmetic below.
+        $scpcount[$scpidx] = 0;
+    }
+    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+        $scpidx = int(($spkidx*$numscps) / $numspks);
+        $spk = $spkrs[$spkidx];
+        push @{$scparray[$scpidx]}, $spk;
+        $scpcount[$scpidx] += $spk_count{$spk};
+    }
+    # Now will try to reassign beginning + ending speakers
+    # to different scp's and see if it gets more balanced.
+    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+    # We can show that if considering changing just 2 scp's, we minimize
+    # this by minimizing the squared difference in sizes.  This is
+    # equivalent to minimizing the absolute difference in sizes.  This
+    # shows this method is bound to converge.
+    $changed = 1;
+    while($changed) {
+        $changed = 0;
+        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+            # First try to reassign ending spk of this scp.
+            if($scpidx < $numscps-1) {
+                $sz = @{$scparray[$scpidx]};
+                if($sz > 0) {
+                    $spk = $scparray[$scpidx]->[$sz-1];
+                    $count = $spk_count{$spk};
+                    $nutt1 = $scpcount[$scpidx];
+                    $nutt2 = $scpcount[$scpidx+1];
+                    if( abs( ($nutt2+$count) - ($nutt1-$count))
+                        < abs($nutt2 - $nutt1))  { # Would decrease
+                        # size-diff by reassigning spk...
+                        $scpcount[$scpidx+1] += $count;
+                        $scpcount[$scpidx] -= $count;
+                        pop @{$scparray[$scpidx]};
+                        unshift @{$scparray[$scpidx+1]}, $spk;
+                        $changed = 1;
+                    }
+                }
+            }
+            # Then try to move the first spk of this scp to the previous one.
+            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+                $spk = $scparray[$scpidx]->[0];
+                $count = $spk_count{$spk};
+                $nutt1 = $scpcount[$scpidx-1];
+                $nutt2 = $scpcount[$scpidx];
+                if( abs( ($nutt2-$count) - ($nutt1+$count))
+                    < abs($nutt2 - $nutt1))  { # Would decrease
+                    # size-diff by reassigning spk...
+                    $scpcount[$scpidx-1] += $count;
+                    $scpcount[$scpidx] -= $count;
+                    shift @{$scparray[$scpidx]};
+                    push @{$scparray[$scpidx-1]}, $spk;
+                    $changed = 1;
+                }
+            }
+        }
+    }
+    # Now print out the files...
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scpfn = $OUTPUTS[$scpidx];
+        open(F, ">$scpfn") || die "Could not open scp file $scpfn for writing.";
+        $count = 0;
+        if(@{$scparray[$scpidx]} == 0) {
+            print STDERR "Warning: split_scp.pl producing empty .scp file $scpfn (too many splits and too few speakers?)\n";
+        } else {
+            foreach $spk ( @{$scparray[$scpidx]} ) {
+                print F $spk_data{$spk};
+                $count += $spk_count{$spk};
+            }
+            # Sanity check: lines written must equal the bookkeeping count.
+            if($count != $scpcount[$scpidx]) { die "Count mismatch [code error]"; }
+        }
+        close(F);
+    }
+} else {
+    # This block is the "normal" case where there is no --utt2spk
+    # option and we just break into equal size chunks.
+    open(I, "<$inscp") || die "Opening input scp file $inscp";
+    $numscps = @OUTPUTS;  # size of array.
+    @F = ();
+    while(<I>) {
+        push @F, $_;
+    }
+    close(I);
+    $numlines = @F;
+    if($numlines == 0) {
+        # Trailing newline added so the warning does not run into whatever
+        # is printed to the terminal next.
+        print STDERR "split_scp.pl: warning: empty input scp file $inscp\n";
+    }
+    # The +($numscps-1) forces rounding up [just doing int() rounds down].
+    $linesperscp = int( ($numlines+($numscps-1)) / $numscps);
+    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+        $scpfile = $OUTPUTS[$scpidx];
+        open(O, ">$scpfile") || die "Opening output scp file $scpfile";
+        # Output $scpidx gets lines [linesperscp*scpidx, linesperscp*(scpidx+1)),
+        # clipped to the end of the input.
+        for($n = $linesperscp * $scpidx; $n < $numlines && $n < $linesperscp*($scpidx+1); $n++) {
+            print O $F[$n];
+        }
+        close(O) || die "Closing scp file $scpfile";
+    }
+}
\ No newline at end of file