Merge pull request #753 from kuke/kaldi_decoder

Add decoder for deep asr model

Merge pull request #753 from kuke/kaldi_decoder
Add decoder for deep asr model
4d9d1413 · Yibing Liu · GitHub · 745591ac · 7473ec05 · 745591ac
7 changed file
--- a/fluid/DeepASR/decoder/decoder.cc
+++ b/fluid/DeepASR/decoder/decoder.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include "decoder.h"
-std::string decode(std::vector<std::vector<float>> probs_mat) {
-  // Add decoding logic here
-  return "example decoding result";
-}
--- a/fluid/DeepASR/decoder/post_decode_faster.cc
+++ b/fluid/DeepASR/decoder/post_decode_faster.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "post_decode_faster.h"
+typedef kaldi::int32 int32;
+using fst::SymbolTable;
+using fst::VectorFst;
+using fst::StdArc;
+Decoder::Decoder(std::string word_syms_filename,
+                 std::string fst_in_filename,
+                 std::string logprior_rxfilename) {
+  const char* usage =
+      "Decode, reading log-likelihoods (of transition-ids or whatever symbol "
+      "is on the graph) as matrices.";
+  kaldi::ParseOptions po(usage);
+  binary = true;
+  acoustic_scale = 1.5;
+  allow_partial = true;
+  kaldi::FasterDecoderOptions decoder_opts;
+  decoder_opts.Register(&po, true);  // true == include obscure settings.
+  po.Register("binary", &binary, "Write output in binary mode");
+  po.Register("allow-partial",
+              &allow_partial,
+              "Produce output even when final state was not reached");
+  po.Register("acoustic-scale",
+              &acoustic_scale,
+              "Scaling factor for acoustic likelihoods");
+  word_syms = NULL;
+  if (word_syms_filename != "") {
+    word_syms = fst::SymbolTable::ReadText(word_syms_filename);
+    if (!word_syms)
+      KALDI_ERR << "Could not read symbol table from file "
+                << word_syms_filename;
+  }
+  std::ifstream is_logprior(logprior_rxfilename);
+  logprior.Read(is_logprior, false);
+  // It's important that we initialize decode_fst after loglikes_reader, as it
+  // can prevent crashes on systems installed without enough virtual memory.
+  // It has to do with what happens on UNIX systems if you call fork() on a
+  // large process: the page-table entries are duplicated, which requires a
+  // lot of virtual memory.
+  decode_fst = fst::ReadFstKaldi(fst_in_filename);
+  decoder = new kaldi::FasterDecoder(*decode_fst, decoder_opts);
+}
+Decoder::~Decoder() {
+  if (!word_syms) delete word_syms;
+  delete decode_fst;
+  delete decoder;
+}
+std::string Decoder::decode(
+    std::string key,
+    const std::vector<std::vector<kaldi::BaseFloat>>& log_probs) {
+  size_t num_frames = log_probs.size();
+  size_t dim_label = log_probs[0].size();
+  kaldi::Matrix<kaldi::BaseFloat> loglikes(
+      num_frames, dim_label, kaldi::kSetZero, kaldi::kStrideEqualNumCols);
+  for (size_t i = 0; i < num_frames; ++i) {
+    memcpy(loglikes.Data() + i * dim_label,
+           log_probs[i].data(),
+           sizeof(kaldi::BaseFloat) * dim_label);
+  }
+  return decode(key, loglikes);
+}
+std::vector<std::string> Decoder::decode(std::string posterior_rspecifier) {
+  kaldi::SequentialBaseFloatMatrixReader posterior_reader(posterior_rspecifier);
+  std::vector<std::string> decoding_results;
+  for (; !posterior_reader.Done(); posterior_reader.Next()) {
+    std::string key = posterior_reader.Key();
+    kaldi::Matrix<kaldi::BaseFloat> loglikes(posterior_reader.Value());
+    decoding_results.push_back(decode(key, loglikes));
+  }
+  return decoding_results;
+}
+std::string Decoder::decode(std::string key,
+                            kaldi::Matrix<kaldi::BaseFloat>& loglikes) {
+  std::string decoding_result;
+  if (loglikes.NumRows() == 0) {
+    KALDI_WARN << "Zero-length utterance: " << key;
+  }
+  KALDI_ASSERT(loglikes.NumCols() == logprior.Dim());
+  loglikes.ApplyLog();
+  loglikes.AddVecToRows(-1.0, logprior);
+  kaldi::DecodableMatrixScaled decodable(loglikes, acoustic_scale);
+  decoder->Decode(&decodable);
+  VectorFst<kaldi::LatticeArc> decoded;  // linear FST.
+  if ((allow_partial || decoder->ReachedFinal()) &&
+      decoder->GetBestPath(&decoded)) {
+    if (!decoder->ReachedFinal())
+      KALDI_WARN << "Decoder did not reach end-state, outputting partial "
+                    "traceback.";
+    std::vector<int32> alignment;
+    std::vector<int32> words;
+    kaldi::LatticeWeight weight;
+    GetLinearSymbolSequence(decoded, &alignment, &words, &weight);
+    if (word_syms != NULL) {
+      for (size_t i = 0; i < words.size(); i++) {
+        std::string s = word_syms->Find(words[i]);
+        decoding_result += s;
+        if (s == "")
+          KALDI_ERR << "Word-id " << words[i] << " not in symbol table.";
+      }
+    }
+  }
+  return decoding_result;
+}
--- a/fluid/DeepASR/decoder/decoder.h
+++ b/fluid/DeepASR/decoder/decoder.h
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -14,5 +14,44 @@ limitations under the License. */
 #include <string>
 #include <vector>
+#include "base/kaldi-common.h"
+#include "base/timer.h"
+#include "decoder/decodable-matrix.h"
+#include "decoder/faster-decoder.h"
+#include "fstext/fstext-lib.h"
+#include "hmm/transition-model.h"
+#include "lat/kaldi-lattice.h"  // for {Compact}LatticeArc
+#include "tree/context-dep.h"
+#include "util/common-utils.h"
-std::string decode(std::vector<std::vector<float>> probs_mat);
+class Decoder {
+public:
+  Decoder(std::string word_syms_filename,
+          std::string fst_in_filename,
+          std::string logprior_rxfilename);
+  ~Decoder();
+  // Interface to accept the scores read from specifier and return
+  // the batch decoding results
+  std::vector<std::string> decode(std::string posterior_rspecifier);
+  // Accept the scores of one utterance and return the decoding result
+  std::string decode(
+      std::string key,
+      const std::vector<std::vector<kaldi::BaseFloat>> &log_probs);
+private:
+  // For decoding one utterance
+  std::string decode(std::string key,
+                     kaldi::Matrix<kaldi::BaseFloat> &loglikes);
+  fst::SymbolTable *word_syms;
+  fst::VectorFst<fst::StdArc> *decode_fst;
+  kaldi::FasterDecoder *decoder;
+  kaldi::Vector<kaldi::BaseFloat> logprior;
+  bool binary;
+  kaldi::BaseFloat acoustic_scale;
+  bool allow_partial;
+};
--- a/fluid/DeepASR/decoder/pybind.cc
+++ b/fluid/DeepASR/decoder/pybind.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -15,15 +15,25 @@ limitations under the License. */
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
-#include "decoder.h"
+#include "post_decode_faster.h"
 namespace py = pybind11;
-PYBIND11_MODULE(decoder, m) {
+PYBIND11_MODULE(post_decode_faster, m) {
-  m.doc() = "Decode function for Deep ASR model";
+  m.doc() = "Decoder for Deep ASR model";
-  m.def("decode",
+  py::class_<Decoder>(m, "Decoder")
-        &decode,
+      .def(py::init<std::string, std::string, std::string>())
-        "Decode one input probability matrix "
+      .def("decode",
-        "and return the transcription");
+           (std::vector<std::string> (Decoder::*)(std::string)) &
+               Decoder::decode,
+           "Decode for the probability matrices in specifier "
+           "and return the transcriptions.")
+      .def(
+          "decode",
+          (std::string (Decoder::*)(
+              std::string, const std::vector<std::vector<kaldi::BaseFloat>>&)) &
+              Decoder::decode,
+          "Decode one input probability matrix "
+          "and return the transcription.");
 }
--- a/fluid/DeepASR/decoder/setup.py
+++ b/fluid/DeepASR/decoder/setup.py
-#  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+#  Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -13,27 +13,57 @@
 # limitations under the License.
 import os
+import glob
 from distutils.core import setup, Extension
 from distutils.sysconfig import get_config_vars
-args = ['-std=c++11']
+try:
+    kaldi_root = os.environ['KALDI_ROOT']
+except:
+    raise ValueError("Enviroment variable 'KALDI_ROOT' is not defined. Please "
+                     "install kaldi and export KALDI_ROOT=<kaldi's root dir> .")
+args = [
+    '-std=c++11', '-Wno-sign-compare', '-Wno-unused-variable',
+    '-Wno-unused-local-typedefs', '-Wno-unused-but-set-variable',
+    '-Wno-deprecated-declarations', '-Wno-unused-function'
+]
 # remove warning about -Wstrict-prototypes
 (opt, ) = get_config_vars('OPT')
 os.environ['OPT'] = " ".join(flag for flag in opt.split()
                             if flag != '-Wstrict-prototypes')
+os.environ['CC'] = 'g++'
+LIBS = [
+    'fst', 'kaldi-base', 'kaldi-util', 'kaldi-matrix', 'kaldi-tree',
+    'kaldi-hmm', 'kaldi-fstext', 'kaldi-decoder', 'kaldi-lat'
+]
+LIB_DIRS = [
+    'tools/openfst/lib', 'src/base', 'src/matrix', 'src/util', 'src/tree',
+    'src/hmm', 'src/fstext', 'src/decoder', 'src/lat'
+]
+LIB_DIRS = [os.path.join(kaldi_root, path) for path in LIB_DIRS]
+LIB_DIRS = [os.path.abspath(path) for path in LIB_DIRS]
 ext_modules = [
    Extension(
-        'decoder',
+        'post_decode_faster',
-        ['pybind.cc', 'decoder.cc'],
+        ['pybind.cc', 'post_decode_faster.cc'],
-        include_dirs=['pybind11/include', '.'],
+        include_dirs=[
+            'pybind11/include', '.', os.path.join(kaldi_root, 'src'),
+            os.path.join(kaldi_root, 'tools/openfst/src/include')
+        ],
        language='c++',
+        libraries=LIBS,
+        library_dirs=LIB_DIRS,
+        runtime_library_dirs=LIB_DIRS,
        extra_compile_args=args, ),
 ]
 setup(
-    name='decoder',
+    name='post_decode_faster',
    version='0.0.1',
    author='Paddle',
    author_email='',

--- a/fluid/DeepASR/decoder/setup.sh
+++ b/fluid/DeepASR/decoder/setup.sh
+set -e
 if [ ! -d pybind11 ]; then
    git clone https://github.com/pybind/pybind11.git

--- a/fluid/DeepASR/infer_by_ckpt.py
+++ b/fluid/DeepASR/infer_by_ckpt.py
@@ -13,7 +13,7 @@ import data_utils.augmentor.trans_mean_variance_norm as trans_mean_variance_norm
 import data_utils.augmentor.trans_add_delta as trans_add_delta
 import data_utils.augmentor.trans_splice as trans_splice
 import data_utils.async_data_reader as reader
-import decoder.decoder as decoder
+from decoder.post_decode_faster import Decoder
 from data_utils.util import lodtensor_to_ndarray
 from model_utils.model import stacked_lstmp_model
 from data_utils.util import split_infer_result
@@ -91,6 +91,21 @@ def parse_args():
        type=str,
        default='./checkpoint',
        help="The checkpoint path to init model. (default: %(default)s)")
+    parser.add_argument(
+        '--vocabulary',
+        type=str,
+        default='./decoder/graph/words.txt',
+        help="The path to vocabulary. (default: %(default)s)")
+    parser.add_argument(
+        '--graphs',
+        type=str,
+        default='./decoder/graph/TLG.fst',
+        help="The path to TLG graphs for decoding. (default: %(default)s)")
+    parser.add_argument(
+        '--log_prior',
+        type=str,
+        default="./decoder/logprior",
+        help="The log prior probs for training data. (default: %(default)s)")
    args = parser.parse_args()
    return args
@@ -165,8 +180,9 @@ def infer_from_ckpt(args):
        probs, lod = lodtensor_to_ndarray(results[0])
        infer_batch = split_infer_result(probs, lod)
        for index, sample in enumerate(infer_batch):
-            print("Decoding %d: " % (batch_id * args.batch_size + index),
+            key = "utter#%d" % (batch_id * args.batch_size + index)
-                  decoder.decode(sample))
+            print(key, ": ", decoder.decode(key, sample), "\n")
    print(np.mean(infer_costs), np.mean(infer_accs))