From 08281eca72a81f45e8a2c35ed6b280e3f4ee2be2 Mon Sep 17 00:00:00 2001
From: huangyuxin <hyxin2014@126.com>
Date: Fri, 24 Sep 2021 08:29:09 +0000
Subject: [PATCH] fix bug: wrong space id in scorer.cpp, add detokenize

---
 deepspeech/decoders/swig/scorer.cpp  |  3 ++-
 deepspeech/exps/deepspeech2/model.py | 15 +++++++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)
diff --git a/deepspeech/decoders/swig/scorer.cpp b/deepspeech/decoders/swig/scorer.cpp
index a25382b1..ebb9e448 100644
--- a/deepspeech/decoders/swig/scorer.cpp
+++ b/deepspeech/decoders/swig/scorer.cpp
@@ -26,6 +26,7 @@
 #include "decoder_utils.h"
 
 using namespace lm::ngram;
+const std::string kSPACE = "<space>";
 
 Scorer::Scorer(double alpha,
                double beta,
@@ -165,7 +166,7 @@ void Scorer::set_char_map(const std::vector<std::string>& char_list) {
 
     // Set the char map for the FST for spelling correction
     for (size_t i = 0; i < char_list_.size(); i++) {
-        if (char_list_[i] == " ") {
+        if (char_list_[i] == kSPACE) {
             SPACE_ID_ = i;
         }
         // The initial state of FST is state 0, hence the index of chars in
diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 79a67634..702a0576 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -27,6 +27,7 @@ from paddle import inference
 from paddle.io import DataLoader
 from yacs.config import CfgNode
 
+from deepspeech.frontend.featurizer.text_featurizer import TextFeaturizer
 from deepspeech.io.collator import SpeechCollator
 from deepspeech.io.dataset import ManifestDataset
 from deepspeech.io.sampler import SortagradBatchSampler
@@ -271,6 +272,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
 
     def __init__(self, config, args):
         super().__init__(config, args)
+        self._text_featurizer = TextFeaturizer(
+            unit_type=config.collator.unit_type, vocab_filepath=None)
 
     def ordid2token(self, texts, texts_len):
         """ ord() id to chr() chr """
@@ -299,6 +302,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
 
         result_transcripts = self.compute_result_transcripts(audio, audio_len,
                                                              vocab_list, cfg)
+
         for utt, target, result in zip(utts, target_transcripts,
                                        result_transcripts):
             errors, len_ref = errors_func(target, result)
@@ -335,6 +339,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             cutoff_prob=cfg.cutoff_prob,
             cutoff_top_n=cfg.cutoff_top_n,
             num_processes=cfg.num_proc_bsearch)
+        # replace the <space> with ' '
+        result_transcripts = [
+            self._text_featurizer.detokenize(sentence)
+            for sentence in result_transcripts
+        ]
+
         self.autolog.times.stamp()
         self.autolog.times.stamp()
         self.autolog.times.end()
@@ -455,6 +465,11 @@ class DeepSpeech2ExportTester(DeepSpeech2Tester):
             output_probs, output_lens, vocab_list, cfg.decoding_method,
             cfg.lang_model_path, cfg.alpha, cfg.beta, cfg.beam_size,
             cfg.cutoff_prob, cfg.cutoff_top_n, cfg.num_proc_bsearch)
+        # replace the <space> with ' '
+        result_transcripts = [
+            self._text_featurizer.detokenize(sentence)
+            for sentence in result_transcripts
+        ]
 
         return result_transcripts
 
-- 
GitLab