replace space when build vocab

30563981 · Hui Zhang · 86f34784 · 30563981 · 30563981 · 30563981
3 changed files
--- a/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
+++ b/deepspeech/decoders/swig/ctc_beam_search_decoder.cpp
@@ -28,6 +28,7 @@
 #include "path_trie.h"

 using FSTMATCH = fst::SortedMatcher<fst::StdVectorFst>;
+constexpr char kSPACE[] = "<space>";  // vocabulary token looked up to assign the space id

 std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
    const std::vector<std::vector<double>> &probs_seq,
@@ -46,13 +47,8 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
                       "The shape of probs_seq does not match with "
                       "the shape of the vocabulary");
    }
-
-    // assign blank id
-    // size_t blank_id = vocabulary.size();
-    // size_t blank_id = 0;
-
    // assign space id
-    auto it = std::find(vocabulary.begin(), vocabulary.end(), " ");
+    auto it = std::find(vocabulary.begin(), vocabulary.end(), kSPACE);
    int space_id = it - vocabulary.begin();
    // if no space in vocabulary
    if ((size_t)space_id >= vocabulary.size()) {

--- a/deepspeech/frontend/featurizer/text_featurizer.py
+++ b/deepspeech/frontend/featurizer/text_featurizer.py
@@ -54,9 +54,9 @@ class TextFeaturizer():
            self.sp = spm.SentencePieceProcessor()
            self.sp.Load(spm_model)

-    def tokenize(self, text):
+    def tokenize(self, text, replace_space=True):
        if self.unit_type == 'char':
-            tokens = self.char_tokenize(text)
+            tokens = self.char_tokenize(text, replace_space)
        elif self.unit_type == 'word':
            tokens = self.word_tokenize(text)
        else:  # spm
@@ -106,17 +106,19 @@ class TextFeaturizer():
        text = self.detokenize(tokens)
        return text

-    def char_tokenize(self, text):
+    def char_tokenize(self, text, replace_space=True):
        """Character tokenizer.

        Args:
            text (str): text string.
+            replace_space (bool): whether to replace " " with SPACE; False is only used by build_vocab.py.

        Returns:
            List[str]: tokens.
        """
        text = text.strip()
-        text = text.replace(" ", SPACE)
+        if replace_space:
+            text = text.replace(" ", SPACE)
        return list(text)

    def char_detokenize(self, tokens):

--- a/utils/build_vocab.py
+++ b/utils/build_vocab.py
@@ -61,7 +61,7 @@ args = parser.parse_args()
 def count_manifest(counter, text_feature, manifest_path):
    manifest_jsons = read_manifest(manifest_path)
    for line_json in manifest_jsons:
-        line = text_feature.tokenize(line_json['text'])
+        line = text_feature.tokenize(line_json['text'], replace_space=False)
        counter.update(line)

 def dump_text_manifest(fileobj, manifest_path, key='text'):