diff --git a/.clang_format.hook b/.clang_format.hook
index 40d70f56cf97f7b7f18bb255dae73ab1d542f12a..4cbc972bbd200d0dcb6d8ba404bb1286ee81736c 100755
--- a/.clang_format.hook
+++ b/.clang_format.hook
@@ -1,7 +1,7 @@
#!/usr/bin/env bash
set -e
-readonly VERSION="3.8"
+readonly VERSION="3.9"
version=$(clang-format -version)
diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index 4080476b4b95be154187ab46116c5736a40bbdbf..9e9113d842905f2fb700a640db676b45fc258a50 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -14,8 +14,8 @@
- [Hyper-parameters Tuning](#hyper-parameters-tuning)
- [Training for Mandarin Language](#training-for-mandarin-language)
- [Trying Live Demo with Your Own Voice](#trying-live-demo-with-your-own-voice)
-- [Experiments and Benchmarks](#experiments-and-benchmarks)
- [Released Models](#released-models)
+- [Experiments and Benchmarks](#experiments-and-benchmarks)
- [Questions and Help](#questions-and-help)
## Prerequisites
@@ -466,9 +466,21 @@ Test Set | Aishell Model | Internal Mandarin Model
Aishell-Test | X.X | X.X
Baidu-Mandarin-Test | X.X | X.X
-#### Multiple GPU Efficiency
+#### Acceleration with Multi-GPUs
+
+We compare the training time with 1, 2, 4, 8, 16 Tesla K40m GPUs (with a subset of LibriSpeech samples whose audio durations are between 6.0 and 7.0 seconds). The results show that **near-linear** acceleration is achieved with multiple GPUs. In the following figure, the time (in seconds) used for training is plotted on the blue bars.
+
+
+
+| # of GPU | Acceleration Rate |
+| -------- | --------------: |
+| 1 | 1.00 X |
+| 2 | 1.97 X |
+| 4 | 3.74 X |
+| 8 | 6.21 X |
+| 16 | 10.70 X |
-TODO: To Be Added
+A tool for running such profiling measurements is provided in `tools/profile.sh`.
## Questions and Help
diff --git a/deep_speech_2/decoders/__init__.py b/deep_speech_2/decoders/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/deep_speech_2/model_utils/decoder.py b/deep_speech_2/decoders/decoders_deprecated.py
similarity index 95%
rename from deep_speech_2/model_utils/decoder.py
rename to deep_speech_2/decoders/decoders_deprecated.py
index ffba2731a06b49105f74ab2c47831105c4c68428..17b28b0d02a22a2e59856156ccd663324e886aed 100644
--- a/deep_speech_2/model_utils/decoder.py
+++ b/deep_speech_2/decoders/decoders_deprecated.py
@@ -42,8 +42,8 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
def ctc_beam_search_decoder(probs_seq,
beam_size,
vocabulary,
- blank_id,
cutoff_prob=1.0,
+ cutoff_top_n=40,
ext_scoring_func=None,
nproc=False):
"""CTC Beam search decoder.
@@ -66,8 +66,6 @@ def ctc_beam_search_decoder(probs_seq,
:type beam_size: int
:param vocabulary: Vocabulary list.
:type vocabulary: list
- :param blank_id: ID of blank.
- :type blank_id: int
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
@@ -87,9 +85,8 @@ def ctc_beam_search_decoder(probs_seq,
raise ValueError("The shape of prob_seq does not match with the "
"shape of the vocabulary.")
- # blank_id check
- if not blank_id < len(probs_seq[0]):
- raise ValueError("blank_id shouldn't be greater than probs dimension")
+ # blank_id assign
+ blank_id = len(vocabulary)
# If the decoder called in the multiprocesses, then use the global scorer
# instantiated in ctc_beam_search_decoder_batch().
@@ -114,7 +111,7 @@ def ctc_beam_search_decoder(probs_seq,
prob_idx = list(enumerate(probs_seq[time_step]))
cutoff_len = len(prob_idx)
#If pruning is enabled
- if cutoff_prob < 1.0:
+ if cutoff_prob < 1.0 or cutoff_top_n < cutoff_len:
prob_idx = sorted(prob_idx, key=lambda asd: asd[1], reverse=True)
cutoff_len, cum_prob = 0, 0.0
for i in xrange(len(prob_idx)):
@@ -122,6 +119,7 @@ def ctc_beam_search_decoder(probs_seq,
cutoff_len += 1
if cum_prob >= cutoff_prob:
break
+ cutoff_len = min(cutoff_len, cutoff_top_n)
prob_idx = prob_idx[0:cutoff_len]
for l in prefix_set_prev:
@@ -191,9 +189,9 @@ def ctc_beam_search_decoder(probs_seq,
def ctc_beam_search_decoder_batch(probs_split,
beam_size,
vocabulary,
- blank_id,
num_processes,
cutoff_prob=1.0,
+ cutoff_top_n=40,
ext_scoring_func=None):
"""CTC beam search decoder using multiple processes.
@@ -204,8 +202,6 @@ def ctc_beam_search_decoder_batch(probs_split,
:type beam_size: int
:param vocabulary: Vocabulary list.
:type vocabulary: list
- :param blank_id: ID of blank.
- :type blank_id: int
:param num_processes: Number of parallel processes.
:type num_processes: int
:param cutoff_prob: Cutoff probability in pruning,
@@ -232,8 +228,8 @@ def ctc_beam_search_decoder_batch(probs_split,
pool = multiprocessing.Pool(processes=num_processes)
results = []
for i, probs_list in enumerate(probs_split):
- args = (probs_list, beam_size, vocabulary, blank_id, cutoff_prob, None,
- nproc)
+ args = (probs_list, beam_size, vocabulary, cutoff_prob, cutoff_top_n,
+ None, nproc)
results.append(pool.apply_async(ctc_beam_search_decoder, args))
pool.close()
diff --git a/deep_speech_2/model_utils/lm_scorer.py b/deep_speech_2/decoders/scorer_deprecated.py
similarity index 98%
rename from deep_speech_2/model_utils/lm_scorer.py
rename to deep_speech_2/decoders/scorer_deprecated.py
index 463e96d6653b29207fb6105527a1f79c41c7fb84..c6a661030d4363727e259da9c7949e59705d55c8 100644
--- a/deep_speech_2/model_utils/lm_scorer.py
+++ b/deep_speech_2/decoders/scorer_deprecated.py
@@ -8,7 +8,7 @@ import kenlm
import numpy as np
-class LmScorer(object):
+class Scorer(object):
"""External scorer to evaluate a prefix or whole sentence in
beam search decoding, including the score from n-gram language
model and word count.
diff --git a/deep_speech_2/decoders/swig/__init__.py b/deep_speech_2/decoders/swig/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/deep_speech_2/deploy/_init_paths.py b/deep_speech_2/decoders/swig/_init_paths.py
similarity index 100%
rename from deep_speech_2/deploy/_init_paths.py
rename to deep_speech_2/decoders/swig/_init_paths.py
diff --git a/deep_speech_2/decoders/swig/ctc_beam_search_decoder.cpp b/deep_speech_2/decoders/swig/ctc_beam_search_decoder.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..624784b05e215782f2264cc6ae4db7eed5b28cae
--- /dev/null
+++ b/deep_speech_2/decoders/swig/ctc_beam_search_decoder.cpp
@@ -0,0 +1,204 @@
+#include "ctc_beam_search_decoder.h"
+
+#include
+#include
+#include
+#include
+#include