提交 a18e6a7e 编写于 作者: Y Yibing Liu

refine by following review comments

上级 e0ab51f4
......@@ -24,8 +24,6 @@
## Installation
### Basic setup
Please make sure the above [prerequisites](#prerequisites) have been satisfied before moving on.
```bash
......@@ -34,16 +32,6 @@ cd models/deep_speech_2
sh setup.sh
```
### Decoders setup
```bash
cd decoders/swig
sh setup.sh
cd ../..
```
These commands will install the decoders that translate the ouptut probability vectors of DS2 model to text data, incuding CTC greedy decoder, CTC beam search decoder and its batch version. And a detailed usuage about them will be given in the following sections.
## Getting Started
Several shell scripts provided in `./examples` will help us to quickly give it a try, for most major modules, including data preparation, model training, case inference and model evaluation, with a few public dataset (e.g. [LibriSpeech](http://www.openslr.org/12/), [Aishell](http://www.openslr.org/33)). Reading these examples will also help you to understand how to make it work with your own data.
......@@ -189,7 +177,6 @@ Data augmentation has often been a highly effective technique to boost the deep
Six optional augmentation components are provided to be selected, configured and inserted into the processing pipeline.
### Inference
- Volume Perturbation
- Speed Perturbation
- Shifting Perturbation
......
......@@ -22,6 +22,8 @@ class TextFeaturizer(object):
def __init__(self, vocab_filepath):
self._vocab_dict, self._vocab_list = self._load_vocabulary_from_file(
vocab_filepath)
# from unicode to string
self._vocab_list = [chars.encode("utf-8") for chars in self._vocab_list]
def featurize(self, text):
"""Convert text string to a list of token indices in char-level.Note
......
......@@ -17,41 +17,38 @@ std::string ctc_greedy_decoder(
const std::vector<std::vector<double>> &probs_seq,
const std::vector<std::string> &vocabulary) {
// dimension check
int num_time_steps = probs_seq.size();
for (int i = 0; i < num_time_steps; i++) {
if (probs_seq[i].size() != vocabulary.size() + 1) {
std::cout << "The shape of probs_seq does not match"
<< " with the shape of the vocabulary!" << std::endl;
exit(1);
}
size_t num_time_steps = probs_seq.size();
for (size_t i = 0; i < num_time_steps; i++) {
VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1,
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
int blank_id = vocabulary.size();
size_t blank_id = vocabulary.size();
std::vector<int> max_idx_vec;
double max_prob = 0.0;
int max_idx = 0;
for (int i = 0; i < num_time_steps; i++) {
for (int j = 0; j < probs_seq[i].size(); j++) {
std::vector<size_t> max_idx_vec;
for (size_t i = 0; i < num_time_steps; i++) {
double max_prob = 0.0;
size_t max_idx = 0;
for (size_t j = 0; j < probs_seq[i].size(); j++) {
if (max_prob < probs_seq[i][j]) {
max_idx = j;
max_prob = probs_seq[i][j];
}
}
max_idx_vec.push_back(max_idx);
max_prob = 0.0;
max_idx = 0;
}
std::vector<int> idx_vec;
for (int i = 0; i < max_idx_vec.size(); i++) {
std::vector<size_t> idx_vec;
for (size_t i = 0; i < max_idx_vec.size(); i++) {
if ((i == 0) || ((i > 0) && max_idx_vec[i] != max_idx_vec[i - 1])) {
idx_vec.push_back(max_idx_vec[i]);
}
}
std::string best_path_result;
for (int i = 0; i < idx_vec.size(); i++) {
for (size_t i = 0; i < idx_vec.size(); i++) {
if (idx_vec[i] != blank_id) {
best_path_result += vocabulary[idx_vec[i]];
}
......@@ -61,29 +58,24 @@ std::string ctc_greedy_decoder(
std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq,
int beam_size,
const size_t beam_size,
std::vector<std::string> vocabulary,
int blank_id,
double cutoff_prob,
int cutoff_top_n,
Scorer *extscorer) {
const double cutoff_prob,
const size_t cutoff_top_n,
Scorer *ext_scorer) {
// dimension check
size_t num_time_steps = probs_seq.size();
for (int i = 0; i < num_time_steps; i++) {
if (probs_seq[i].size() != vocabulary.size() + 1) {
std::cout << " The shape of probs_seq does not match"
<< " with the shape of the vocabulary!" << std::endl;
exit(1);
}
for (size_t i = 0; i < num_time_steps; i++) {
VALID_CHECK_EQ(probs_seq[i].size(),
vocabulary.size() + 1,
"The shape of probs_seq does not match with "
"the shape of the vocabulary");
}
// blank_id check
if (blank_id > vocabulary.size()) {
std::cout << " Invalid blank_id! " << std::endl;
exit(1);
}
// assign blank id
size_t blank_id = vocabulary.size();
// assign space ID
// assign space id
std::vector<std::string>::iterator it =
std::find(vocabulary.begin(), vocabulary.end(), " ");
int space_id = it - vocabulary.begin();
......@@ -98,16 +90,16 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
std::vector<PathTrie *> prefixes;
prefixes.push_back(&root);
if (extscorer != nullptr) {
if (extscorer->is_char_map_empty()) {
extscorer->set_char_map(vocabulary);
if (ext_scorer != nullptr) {
if (ext_scorer->is_char_map_empty()) {
ext_scorer->set_char_map(vocabulary);
}
if (!extscorer->is_character_based()) {
if (extscorer->dictionary == nullptr) {
if (!ext_scorer->is_character_based()) {
if (ext_scorer->dictionary == nullptr) {
// fill dictionary for fst with space
extscorer->fill_dictionary(true);
ext_scorer->fill_dictionary(true);
}
auto fst_dict = static_cast<fst::StdVectorFst *>(extscorer->dictionary);
auto fst_dict = static_cast<fst::StdVectorFst *>(ext_scorer->dictionary);
fst::StdVectorFst *dict_ptr = fst_dict->Copy(true);
root.set_dictionary(dict_ptr);
auto matcher = std::make_shared<FSTMATCH>(*dict_ptr, fst::MATCH_INPUT);
......@@ -116,33 +108,33 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
}
// prefix search over time
for (int time_step = 0; time_step < num_time_steps; time_step++) {
for (size_t time_step = 0; time_step < num_time_steps; time_step++) {
std::vector<double> prob = probs_seq[time_step];
std::vector<std::pair<int, double>> prob_idx;
for (int i = 0; i < prob.size(); i++) {
for (size_t i = 0; i < prob.size(); i++) {
prob_idx.push_back(std::pair<int, double>(i, prob[i]));
}
float min_cutoff = -NUM_FLT_INF;
bool full_beam = false;
if (extscorer != nullptr) {
int num_prefixes = std::min((int)prefixes.size(), beam_size);
if (ext_scorer != nullptr) {
size_t num_prefixes = std::min(prefixes.size(), beam_size);
std::sort(
prefixes.begin(), prefixes.begin() + num_prefixes, prefix_compare);
min_cutoff = prefixes[num_prefixes - 1]->score + log(prob[blank_id]) -
std::max(0.0, extscorer->beta);
std::max(0.0, ext_scorer->beta);
full_beam = (num_prefixes == beam_size);
}
// pruning of vacobulary
int cutoff_len = prob.size();
size_t cutoff_len = prob.size();
if (cutoff_prob < 1.0 || cutoff_top_n < prob.size()) {
std::sort(
prob_idx.begin(), prob_idx.end(), pair_comp_second_rev<int, double>);
if (cutoff_prob < 1.0) {
double cum_prob = 0.0;
cutoff_len = 0;
for (int i = 0; i < prob_idx.size(); i++) {
for (size_t i = 0; i < prob_idx.size(); i++) {
cum_prob += prob_idx[i].second;
cutoff_len += 1;
if (cum_prob >= cutoff_prob) break;
......@@ -152,18 +144,18 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
prob_idx = std::vector<std::pair<int, double>>(
prob_idx.begin(), prob_idx.begin() + cutoff_len);
}
std::vector<std::pair<int, float>> log_prob_idx;
for (int i = 0; i < cutoff_len; i++) {
std::vector<std::pair<size_t, float>> log_prob_idx;
for (size_t i = 0; i < cutoff_len; i++) {
log_prob_idx.push_back(std::pair<int, float>(
prob_idx[i].first, log(prob_idx[i].second + NUM_FLT_MIN)));
}
// loop over chars
for (int index = 0; index < log_prob_idx.size(); index++) {
for (size_t index = 0; index < log_prob_idx.size(); index++) {
auto c = log_prob_idx[index].first;
float log_prob_c = log_prob_idx[index].second;
for (int i = 0; i < prefixes.size() && i < beam_size; i++) {
for (size_t i = 0; i < prefixes.size() && i < beam_size; i++) {
auto prefix = prefixes[i];
if (full_beam && log_prob_c + prefix->score < min_cutoff) {
......@@ -194,12 +186,12 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
}
// language model scoring
if (extscorer != nullptr &&
(c == space_id || extscorer->is_character_based())) {
if (ext_scorer != nullptr &&
(c == space_id || ext_scorer->is_character_based())) {
PathTrie *prefix_toscore = nullptr;
// skip scoring the space
if (extscorer->is_character_based()) {
if (ext_scorer->is_character_based()) {
prefix_toscore = prefix_new;
} else {
prefix_toscore = prefix;
......@@ -207,11 +199,11 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
double score = 0.0;
std::vector<std::string> ngram;
ngram = extscorer->make_ngram(prefix_toscore);
score = extscorer->get_log_cond_prob(ngram) * extscorer->alpha;
ngram = ext_scorer->make_ngram(prefix_toscore);
score = ext_scorer->get_log_cond_prob(ngram) * ext_scorer->alpha;
log_p += score;
log_p += extscorer->beta;
log_p += ext_scorer->beta;
}
prefix_new->log_prob_nb_cur =
log_sum_exp(prefix_new->log_prob_nb_cur, log_p);
......@@ -240,15 +232,15 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
for (size_t i = 0; i < beam_size && i < prefixes.size(); i++) {
double approx_ctc = prefixes[i]->score;
if (extscorer != nullptr) {
if (ext_scorer != nullptr) {
std::vector<int> output;
prefixes[i]->get_path_vec(output);
size_t prefix_length = output.size();
auto words = extscorer->split_labels(output);
auto words = ext_scorer->split_labels(output);
// remove word insert
approx_ctc = approx_ctc - prefix_length * extscorer->beta;
approx_ctc = approx_ctc - prefix_length * ext_scorer->beta;
// remove language model weight:
approx_ctc -= (extscorer->get_sent_log_prob(words)) * extscorer->alpha;
approx_ctc -= (ext_scorer->get_sent_log_prob(words)) * ext_scorer->alpha;
}
prefixes[i]->approx_ctc = approx_ctc;
......@@ -269,7 +261,7 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
space_prefixes[i]->get_path_vec(output);
// convert index to string
std::string output_str;
for (int j = 0; j < output.size(); j++) {
for (size_t j = 0; j < output.size(); j++) {
output_str += vocabulary[output[j]];
}
std::pair<double, std::string> output_pair(-space_prefixes[i]->approx_ctc,
......@@ -283,49 +275,45 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
std::vector<std::vector<std::pair<double, std::string>>>
ctc_beam_search_decoder_batch(
const std::vector<std::vector<std::vector<double>>> &probs_split,
int beam_size,
const size_t beam_size,
const std::vector<std::string> &vocabulary,
int blank_id,
int num_processes,
double cutoff_prob,
int cutoff_top_n,
Scorer *extscorer) {
if (num_processes <= 0) {
std::cout << "num_processes must be nonnegative!" << std::endl;
exit(1);
}
const size_t num_processes,
const double cutoff_prob,
const size_t cutoff_top_n,
Scorer *ext_scorer) {
VALID_CHECK_GT(num_processes, 0, "num_processes must be nonnegative!");
// thread pool
ThreadPool pool(num_processes);
// number of samples
int batch_size = probs_split.size();
size_t batch_size = probs_split.size();
// scorer filling up
if (extscorer != nullptr) {
if (extscorer->is_char_map_empty()) {
extscorer->set_char_map(vocabulary);
if (ext_scorer != nullptr) {
if (ext_scorer->is_char_map_empty()) {
ext_scorer->set_char_map(vocabulary);
}
if (!extscorer->is_character_based() && extscorer->dictionary == nullptr) {
if (!ext_scorer->is_character_based() &&
ext_scorer->dictionary == nullptr) {
// init dictionary
extscorer->fill_dictionary(true);
ext_scorer->fill_dictionary(true);
}
}
// enqueue the tasks of decoding
std::vector<std::future<std::vector<std::pair<double, std::string>>>> res;
for (int i = 0; i < batch_size; i++) {
for (size_t i = 0; i < batch_size; i++) {
res.emplace_back(pool.enqueue(ctc_beam_search_decoder,
probs_split[i],
beam_size,
vocabulary,
blank_id,
cutoff_prob,
cutoff_top_n,
extscorer));
ext_scorer));
}
// get decoding results
std::vector<std::vector<std::pair<double, std::string>>> batch_results;
for (int i = 0; i < batch_size; i++) {
for (size_t i = 0; i < batch_size; i++) {
batch_results.emplace_back(res[i].get());
}
return batch_results;
......
......@@ -27,21 +27,21 @@ std::string ctc_greedy_decoder(
* over vocabulary of one time step.
* beam_size: The width of beam search.
* vocabulary: A vector of vocabulary.
* blank_id: ID of blank.
* cutoff_prob: Cutoff probability for pruning.
* cutoff_top_n: Cutoff number for pruning.
* ext_scorer: External scorer to evaluate a prefix.
* ext_scorer: External scorer to evaluate a prefix, which consists of
* n-gram language model scoring and word insertion term.
* Default null, decoding the input sample without scorer.
* Return:
* A vector that each element is a pair of score and decoding result,
* in desending order.
*/
std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
const std::vector<std::vector<double>> &probs_seq,
int beam_size,
const size_t beam_size,
std::vector<std::string> vocabulary,
int blank_id,
double cutoff_prob = 1.0,
int cutoff_top_n = 40,
const double cutoff_prob = 1.0,
const size_t cutoff_top_n = 40,
Scorer *ext_scorer = NULL);
/* CTC Beam Search Decoder for batch data
......@@ -52,11 +52,12 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
* .
* beam_size: The width of beam search.
* vocabulary: A vector of vocabulary.
* blank_id: ID of blank.
* num_processes: Number of threads for beam search.
* cutoff_prob: Cutoff probability for pruning.
* cutoff_top_n: Cutoff number for pruning.
* ext_scorer: External scorer to evaluate a prefix.
* ext_scorer: External scorer to evaluate a prefix, which consists of
* n-gram language model scoring and word insertion term.
* Default null, decoding the input sample without scorer.
* Return:
* A 2-D vector that each element is a vector of beam search decoding
* result for one audio sample.
......@@ -64,12 +65,11 @@ std::vector<std::pair<double, std::string>> ctc_beam_search_decoder(
std::vector<std::vector<std::pair<double, std::string>>>
ctc_beam_search_decoder_batch(
const std::vector<std::vector<std::vector<double>>> &probs_split,
int beam_size,
const size_t beam_size,
const std::vector<std::string> &vocabulary,
int blank_id,
int num_processes,
const size_t num_processes,
double cutoff_prob = 1.0,
int cutoff_top_n = 40,
const size_t cutoff_top_n = 40,
Scorer *ext_scorer = NULL);
#endif // CTC_BEAM_SEARCH_DECODER_H_
......@@ -7,6 +7,22 @@
const float NUM_FLT_INF = std::numeric_limits<float>::max();
const float NUM_FLT_MIN = std::numeric_limits<float>::min();
// check if __A == _B
#define VALID_CHECK_EQ(__A, __B, __ERR) \
if ((__A) != (__B)) { \
std::ostringstream str; \
str << (__A) << " != " << (__B) << ", "; \
throw std::runtime_error(str.str() + __ERR); \
}
// check if __A > __B
#define VALID_CHECK_GT(__A, __B, __ERR) \
if ((__A) <= (__B)) { \
std::ostringstream str; \
str << (__A) << " <= " << (__B) << ", "; \
throw std::runtime_error(str.str() + __ERR); \
}
// Function template for comparing two pairs
template <typename T1, typename T2>
bool pair_comp_first_rev(const std::pair<T1, T2> &a,
......
......@@ -41,7 +41,6 @@ def ctc_greedy_decoder(probs_seq, vocabulary):
def ctc_beam_search_decoder(probs_seq,
beam_size,
vocabulary,
blank_id,
cutoff_prob=1.0,
cutoff_top_n=40,
ext_scoring_func=None):
......@@ -55,8 +54,6 @@ def ctc_beam_search_decoder(probs_seq,
:type beam_size: int
:param vocabulary: Vocabulary list.
:type vocabulary: list
:param blank_id: ID of blank.
:type blank_id: int
:param cutoff_prob: Cutoff probability in pruning,
default 1.0, no pruning.
:type cutoff_prob: float
......@@ -72,15 +69,14 @@ def ctc_beam_search_decoder(probs_seq,
results, in descending order of the probability.
:rtype: list
"""
return swig_decoders.ctc_beam_search_decoder(
probs_seq.tolist(), beam_size, vocabulary, blank_id, cutoff_prob,
cutoff_top_n, ext_scoring_func)
return swig_decoders.ctc_beam_search_decoder(probs_seq.tolist(), beam_size,
vocabulary, cutoff_prob,
cutoff_top_n, ext_scoring_func)
def ctc_beam_search_decoder_batch(probs_split,
beam_size,
vocabulary,
blank_id,
num_processes,
cutoff_prob=1.0,
cutoff_top_n=40,
......@@ -94,8 +90,6 @@ def ctc_beam_search_decoder_batch(probs_split,
:type beam_size: int
:param vocabulary: Vocabulary list.
:type vocabulary: list
:param blank_id: ID of blank.
:type blank_id: int
:param num_processes: Number of parallel processes.
:type num_processes: int
:param cutoff_prob: Cutoff probability in vocabulary pruning,
......@@ -118,5 +112,5 @@ def ctc_beam_search_decoder_batch(probs_split,
probs_split = [probs_seq.tolist() for probs_seq in probs_split]
return swig_decoders.ctc_beam_search_decoder_batch(
probs_split, beam_size, vocabulary, blank_id, num_processes,
cutoff_prob, cutoff_top_n, ext_scoring_func)
probs_split, beam_size, vocabulary, num_processes, cutoff_prob,
cutoff_top_n, ext_scoring_func)
......@@ -31,13 +31,13 @@ python -u test.py \
--num_conv_layers=2 \
--num_rnn_layers=3 \
--rnn_layer_size=2048 \
--alpha=0.36 \
--beta=0.25 \
--cutoff_prob=0.99 \
--alpha=2.15 \
--beta=0.35 \
--cutoff_prob=1.0 \
--use_gru=False \
--use_gpu=True \
--share_rnn_weights=True \
--test_manifest='data/tiny/manifest.test-clean' \
--test_manifest='data/librispeech/manifest.test-clean' \
--mean_std_path='models/librispeech/mean_std.npz' \
--vocab_path='models/librispeech/vocab.txt' \
--model_path='models/librispeech/params.tar.gz' \
......
......@@ -21,9 +21,9 @@ add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('alpha', float, 2.15, "Coef of LM for beam search.")
add_arg('beta', float, 0.35, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
......@@ -85,7 +85,6 @@ def infer():
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
result_transcripts = ds2_model.infer_batch(
infer_data=infer_data,
decoding_method=args.decoding_method,
......@@ -93,7 +92,7 @@ def infer():
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=vocab_list,
vocab_list=data_generator.vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
......
......@@ -214,7 +214,6 @@ class DeepSpeech2Model(object):
probs_split=probs_split,
vocabulary=vocab_list,
beam_size=beam_size,
blank_id=len(vocab_list),
num_processes=num_processes,
ext_scoring_func=self._ext_scorer,
cutoff_prob=cutoff_prob)
......
......@@ -26,4 +26,13 @@ if [ $? != 0 ]; then
rm libsndfile-1.0.28.tar.gz
fi
# install decoders
python -c "import swig_decoders"
if [ $? != 0 ]; then
pushd decoders/swig > /dev/null
sh setup.sh
popd > /dev/null
fi
echo "Install all dependencies successfully."
......@@ -22,9 +22,9 @@ add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.")
add_arg('num_conv_layers', int, 2, "# of convolution layers.")
add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
add_arg('beta', float, 0.25, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
add_arg('alpha', float, 2.15, "Coef of LM for beam search.")
add_arg('beta', float, 0.35, "Coef of WC for beam search.")
add_arg('cutoff_prob', float, 1.0, "Cutoff probability for pruning.")
add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
add_arg('use_gpu', bool, True, "Use GPU or not.")
add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
......@@ -85,7 +85,6 @@ def evaluate():
pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights)
vocab_list = [chars.encode("utf-8") for chars in data_generator.vocab_list]
error_rate_func = cer if args.error_rate_type == 'cer' else wer
error_sum, num_ins = 0.0, 0
for infer_data in batch_reader():
......@@ -96,7 +95,7 @@ def evaluate():
beam_beta=args.beta,
beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob,
vocab_list=vocab_list,
vocab_list=data_generator.vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch)
target_transcripts = [
......
......@@ -13,7 +13,7 @@ download() {
wget -c $URL -P `dirname "$TARGET"`
md5_result=`md5sum $TARGET | awk -F[' '] '{print $1}'`
if [ $MD5 -ne $md5_result ]; then
if [ ! $MD5 == $md5_result ]; then
echo "Fail to download the language model!"
return 1
fi
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册