diff --git a/deep_speech_2/README.md b/deep_speech_2/README.md
index 2912ff3143516ee21f21732f25992fadcd33c270..41acf1020c25e05896f29804acc44042225f4271 100644
--- a/deep_speech_2/README.md
+++ b/deep_speech_2/README.md
@@ -77,3 +77,29 @@ More help for arguments:
 ```
 python infer.py --help
 ```
+
+### Evaluating
+
+```
+CUDA_VISIBLE_DEVICES=0 python evaluate.py
+```
+
+More help for arguments:
+
+```
+python evaluate.py --help
+```
+
+### Parameter tuning
+
+Tune the parameters for the CTC beam search decoder:
+
+```
+CUDA_VISIBLE_DEVICES=0 python tune.py
+```
+
+More help for arguments:
+
+```
+python tune.py --help
+```
diff --git a/deep_speech_2/decoder.py b/deep_speech_2/decoder.py
index 77d950b8db072d539788fd1b2bc7ac0525ffa0f9..a1fadc2c81ac5036f5082e1a60b018106ab90277 100644
--- a/deep_speech_2/decoder.py
+++ b/deep_speech_2/decoder.py
@@ -1,14 +1,16 @@
-"""Contains various CTC decoder."""
+"""Contains various CTC decoders."""
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
 
-import numpy as np
 from itertools import groupby
+import numpy as np
+from math import log
+import multiprocessing
 
 
-def ctc_best_path_decode(probs_seq, vocabulary):
-    """Best path decoding, also called argmax decoding or greedy decoding.
+def ctc_best_path_decoder(probs_seq, vocabulary):
+    """Best path decoder, also called argmax decoder or greedy decoder.
 
     Path consisting of the most probable tokens are further post-processed to
     remove consecutive repetitions and all blanks.
@@ -36,24 +38,198 @@ def ctc_best_path_decode(probs_seq, vocabulary):
     return ''.join([vocabulary[index] for index in index_list])
 
 
-def ctc_decode(probs_seq, vocabulary, method):
-    """CTC-like sequence decoding from a sequence of likelihood probablilites.
+def ctc_beam_search_decoder(probs_seq,
+                            beam_size,
+                            vocabulary,
+                            blank_id,
+                            cutoff_prob=1.0,
+                            ext_scoring_func=None,
+                            nproc=False):
+    """Beam search decoder for a CTC-trained network. It utilizes beam search
+    to approximately select the most likely decoding labels and returns
+    results in descending order of probability. The implementation is based
+    on Prefix Beam Search (https://arxiv.org/abs/1408.2873), with the unclear
+    parts redesigned. Two important modifications: 1) in the iterative
+    computation of probabilities, the assignment operation is changed to
+    accumulation, since one prefix may come from different paths; 2) the
+    condition "if l^+ not in A_prev then" after the probability computation
+    is removed, since it is hard to understand and seems unnecessary.
 
-    :param probs_seq: 2-D list of probabilities over the vocabulary for each
-                      character. Each element is a list of float probabilities
-                      for one character.
-    :type probs_seq: list
+    :param probs_seq: 2-D list of probability distributions over each time
+                      step, with each element being a list of normalized
+                      probabilities over vocabulary and blank.
+    :type probs_seq: 2-D list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
     :param vocabulary: Vocabulary list.
     :type vocabulary: list
-    :param method: Decoding method name, with options: "best_path".
-    :type method: basestring
-    :return: Decoding result string.
-    :rtype: baseline
+    :param blank_id: ID of blank.
+    :type blank_id: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type ext_scoring_func: callable
+    :param nproc: Whether the decoder is used in multiple processes.
+    :type nproc: bool
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
     """
+    # dimension check
     for prob_list in probs_seq:
         if not len(prob_list) == len(vocabulary) + 1:
-            raise ValueError("probs dimension mismatchedd with vocabulary")
-    if method == "best_path":
-        return ctc_best_path_decode(probs_seq, vocabulary)
-    else:
-        raise ValueError("Decoding method [%s] is not supported.")
+            raise ValueError("The shape of prob_seq does not match with the "
+                             "shape of the vocabulary.")
+
+    # blank_id check
+    if not blank_id < len(probs_seq[0]):
+        raise ValueError("blank_id shouldn't be greater than probs dimension")
+
+    # If the decoder is called by multiple processes, use the global scorer
+    # instantiated in ctc_beam_search_decoder_batch().
+    if nproc is True:
+        global ext_nproc_scorer
+        ext_scoring_func = ext_nproc_scorer
+
+    ## initialize
+    # prefix_set_prev: the set containing selected prefixes
+    # probs_b_prev: prefixes' probability ending with blank in previous step
+    # probs_nb_prev: prefixes' probability ending with non-blank in previous step
+    prefix_set_prev = {'\t': 1.0}
+    probs_b_prev, probs_nb_prev = {'\t': 1.0}, {'\t': 0.0}
+
+    ## extend prefix in loop
+    for time_step in xrange(len(probs_seq)):
+        # prefix_set_next: the set containing candidate prefixes
+        # probs_b_cur: prefixes' probability ending with blank in current step
+        # probs_nb_cur: prefixes' probability ending with non-blank in current step
+        prefix_set_next, probs_b_cur, probs_nb_cur = {}, {}, {}
+
+        prob_idx = list(enumerate(probs_seq[time_step]))
+        cutoff_len = len(prob_idx)
+        # if pruning is enabled
+        if cutoff_prob < 1.0:
+            prob_idx = sorted(prob_idx, key=lambda x: x[1], reverse=True)
+            cutoff_len, cum_prob = 0, 0.0
+            for i in xrange(len(prob_idx)):
+                cum_prob += prob_idx[i][1]
+                cutoff_len += 1
+                if cum_prob >= cutoff_prob:
+                    break
+            prob_idx = prob_idx[0:cutoff_len]
+
+        for l in prefix_set_prev:
+            if l not in prefix_set_next:
+                probs_b_cur[l], probs_nb_cur[l] = 0.0, 0.0
+
+            # extend prefix by traversing prob_idx
+            for index in xrange(cutoff_len):
+                c, prob_c = prob_idx[index][0], prob_idx[index][1]
+
+                if c == blank_id:
+                    probs_b_cur[l] += prob_c * (
+                        probs_b_prev[l] + probs_nb_prev[l])
+                else:
+                    last_char = l[-1]
+                    new_char = vocabulary[c]
+                    l_plus = l + new_char
+                    if l_plus not in prefix_set_next:
+                        probs_b_cur[l_plus], probs_nb_cur[l_plus] = 0.0, 0.0
+
+                    if new_char == last_char:
+                        probs_nb_cur[l_plus] += prob_c * probs_b_prev[l]
+                        probs_nb_cur[l] += prob_c * probs_nb_prev[l]
+                    elif new_char == ' ':
+                        if (ext_scoring_func is None) or (len(l) == 1):
+                            score = 1.0
+                        else:
+                            prefix = l[1:]
+                            score = ext_scoring_func(prefix)
+                        probs_nb_cur[l_plus] += score * prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    else:
+                        probs_nb_cur[l_plus] += prob_c * (
+                            probs_b_prev[l] + probs_nb_prev[l])
+                    # add l_plus into prefix_set_next
+                    prefix_set_next[l_plus] = probs_nb_cur[
+                        l_plus] + probs_b_cur[l_plus]
+            # add l into prefix_set_next
+            prefix_set_next[l] = probs_b_cur[l] + probs_nb_cur[l]
+        # update probs
+        probs_b_prev, probs_nb_prev = probs_b_cur, probs_nb_cur
+
+        ## store top beam_size prefixes
+        prefix_set_prev = sorted(
+            prefix_set_next.iteritems(), key=lambda x: x[1], reverse=True)
+        if beam_size < len(prefix_set_prev):
+            prefix_set_prev = prefix_set_prev[:beam_size]
+        prefix_set_prev = dict(prefix_set_prev)
+
+    beam_result = []
+    for seq, prob in prefix_set_prev.items():
+        if prob > 0.0 and len(seq) > 1:
+            result = seq[1:]
+            # score last word by external scorer
+            if (ext_scoring_func is not None) and (result[-1] != ' '):
+                prob = prob * ext_scoring_func(result)
+            log_prob = log(prob)
+            beam_result.append((log_prob, result))
+
+    ## output top beam_size decoding results
+    beam_result = sorted(beam_result, key=lambda x: x[0], reverse=True)
+    return beam_result
+
+
+def ctc_beam_search_decoder_batch(probs_split,
+                                  beam_size,
+                                  vocabulary,
+                                  blank_id,
+                                  num_processes,
+                                  cutoff_prob=1.0,
+                                  ext_scoring_func=None):
+    """CTC beam search decoder using multiple processes.
+
+    :param probs_split: 3-D list with each element as an instance of 2-D list
+                        of probabilities used by ctc_beam_search_decoder().
+    :type probs_split: 3-D list
+    :param beam_size: Width for beam search.
+    :type beam_size: int
+    :param vocabulary: Vocabulary list.
+    :type vocabulary: list
+    :param blank_id: ID of blank.
+    :type blank_id: int
+    :param num_processes: Number of parallel processes.
+    :type num_processes: int
+    :param cutoff_prob: Cutoff probability in pruning,
+                        default 1.0, no pruning.
+    :type cutoff_prob: float
+    :param ext_scoring_func: External scoring function for
+                             partially decoded sentence, e.g. word count
+                             or language model.
+    :type ext_scoring_func: callable
+    :return: List of tuples of log probability and sentence as decoding
+             results, in descending order of the probability.
+    :rtype: list
+    """
+    if not num_processes > 0:
+        raise ValueError("Number of processes must be positive!")
+
+    # use global variable to pass the external scorer to beam search decoder
+    global ext_nproc_scorer
+    ext_nproc_scorer = ext_scoring_func
+    nproc = True
+
+    pool = multiprocessing.Pool(processes=num_processes)
+    results = []
+    for probs_list in probs_split:
+        args = (probs_list, beam_size, vocabulary, blank_id, cutoff_prob, None,
+                nproc)
+        results.append(pool.apply_async(ctc_beam_search_decoder, args))
+
+    pool.close()
+    pool.join()
+    beam_search_results = [result.get() for result in results]
+    return beam_search_results
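For reference, a minimal, self-contained usage sketch of the new decoder interface; the vocabulary and probabilities below are toy values, not real network output, and it should run under the same Python 2 interpreter this module targets:

```
from decoder import ctc_beam_search_decoder

vocab = ['a', 'b', 'c']        # blank takes the last id: len(vocab)
probs_seq = [                  # one row per time step:
    [0.1, 0.2, 0.3, 0.4],      # P(a), P(b), P(c), P(blank)
    [0.5, 0.1, 0.1, 0.3],
]
results = ctc_beam_search_decoder(
    probs_seq=probs_seq,
    beam_size=10,
    vocabulary=vocab,
    blank_id=len(vocab))
print(results[0])              # best (log_prob, sentence) tuple
```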
diff --git a/deep_speech_2/evaluate.py b/deep_speech_2/evaluate.py
new file mode 100644
index 0000000000000000000000000000000000000000..a4f2a690ac18a5d9f0c8ec06e5a3c7dc8a4a0548
--- /dev/null
+++ b/deep_speech_2/evaluate.py
@@ -0,0 +1,205 @@
+"""Evaluation for DeepSpeech2 model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import distutils.util
+import argparse
+import gzip
+import multiprocessing
+import paddle.v2 as paddle
+from data_utils.data import DataGenerator
+from model import deep_speech2
+from decoder import *
+from lm.lm_scorer import LmScorer
+from error_rate import wer
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--batch_size",
+    default=100,
+    type=int,
+    help="Minibatch size for evaluation. (default: %(default)s)")
+parser.add_argument(
+    "--num_conv_layers",
+    default=2,
+    type=int,
+    help="Convolution layer number. (default: %(default)s)")
+parser.add_argument(
+    "--num_rnn_layers",
+    default=3,
+    type=int,
+    help="RNN layer number. (default: %(default)s)")
+parser.add_argument(
+    "--rnn_layer_size",
+    default=512,
+    type=int,
+    help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--use_gpu",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Use gpu or not. (default: %(default)s)")
+parser.add_argument(
+    "--num_threads_data",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
+parser.add_argument(
+    "--num_processes_beam_search",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu processes for beam search. (default: %(default)s)")
+parser.add_argument(
+    "--mean_std_filepath",
+    default='mean_std.npz',
+    type=str,
+    help="Manifest path for normalizer. (default: %(default)s)")
+parser.add_argument(
+    "--decode_method",
+    default='beam_search',
+    type=str,
+    help="Method for ctc decoding: best_path or beam_search. (default: %(default)s)"
+)
+parser.add_argument(
+    "--language_model_path",
+    default="lm/data/1Billion.klm",
+    type=str,
+    help="Path for language model. (default: %(default)s)")
+parser.add_argument(
+    "--alpha",
+    default=0.26,
+    type=float,
+    help="Parameter associated with language model. (default: %(default)f)")
+parser.add_argument(
+    "--beta",
+    default=0.1,
+    type=float,
+    help="Parameter associated with word count. (default: %(default)f)")
+parser.add_argument(
+    "--cutoff_prob",
+    default=0.99,
+    type=float,
+    help="The cutoff probability of pruning "
+    "in beam search. (default: %(default)f)")
+parser.add_argument(
+    "--beam_size",
+    default=500,
+    type=int,
+    help="Width for beam search decoding. (default: %(default)d)")
+parser.add_argument(
+    "--decode_manifest_path",
+    default='datasets/manifest.test',
+    type=str,
+    help="Manifest path for decoding. (default: %(default)s)")
+parser.add_argument(
+    "--model_filepath",
+    default='checkpoints/params.latest.tar.gz',
+    type=str,
+    help="Model filepath. (default: %(default)s)")
+parser.add_argument(
+    "--vocab_filepath",
+    default='datasets/vocab/eng_vocab.txt',
+    type=str,
+    help="Vocabulary filepath. (default: %(default)s)")
+args = parser.parse_args()
+
+
+def evaluate():
+    """Evaluate on whole test data for DeepSpeech2."""
+    # initialize data generator
+    data_generator = DataGenerator(
+        vocab_filepath=args.vocab_filepath,
+        mean_std_filepath=args.mean_std_filepath,
+        augmentation_config='{}',
+        num_threads=args.num_threads_data)
+
+    # create network config
+    # paddle.data_type.dense_array is used for variable batch input.
+    # The size 161 * 161 is only a placeholder value and the real shape
+    # of the input batch data will be induced at run time.
+    audio_data = paddle.layer.data(
+        name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
+    text_data = paddle.layer.data(
+        name="transcript_text",
+        type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
+    output_probs = deep_speech2(
+        audio_data=audio_data,
+        text_data=text_data,
+        dict_size=data_generator.vocab_size,
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size,
+        is_inference=True)
+
+    # load parameters
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open(args.model_filepath))
+
+    # prepare infer data
+    batch_reader = data_generator.batch_reader_creator(
+        manifest_path=args.decode_manifest_path,
+        batch_size=args.batch_size,
+        sortagrad=False,
+        shuffle_method=None)
+
+    # define inferer
+    inferer = paddle.inference.Inference(
+        output_layer=output_probs, parameters=parameters)
+
+    # initialize external scorer for beam search decoding
+    if args.decode_method == 'beam_search':
+        ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path)
+
+    wer_counter, wer_sum = 0, 0.0
+    for infer_data in batch_reader():
+        # run inference
+        infer_results = inferer.infer(input=infer_data)
+        num_steps = len(infer_results) // len(infer_data)
+        probs_split = [
+            infer_results[i * num_steps:(i + 1) * num_steps]
+            for i in xrange(0, len(infer_data))
+        ]
+        # target transcription
+        target_transcription = [
+            ''.join([
+                data_generator.vocab_list[index] for index in infer_data[i][1]
+            ]) for i, probs in enumerate(probs_split)
+        ]
+        # decode and accumulate WER
+        # best path decode
+        if args.decode_method == "best_path":
+            for i, probs in enumerate(probs_split):
+                output_transcription = ctc_best_path_decoder(
+                    probs_seq=probs, vocabulary=data_generator.vocab_list)
+                wer_sum += wer(target_transcription[i], output_transcription)
+                wer_counter += 1
+        # beam search decode
+        elif args.decode_method == "beam_search":
+            # beam search using multiple processes
+            beam_search_results = ctc_beam_search_decoder_batch(
+                probs_split=probs_split,
+                vocabulary=data_generator.vocab_list,
+                beam_size=args.beam_size,
+                blank_id=len(data_generator.vocab_list),
+                num_processes=args.num_processes_beam_search,
+                ext_scoring_func=ext_scorer,
+                cutoff_prob=args.cutoff_prob, )
+            for i, beam_search_result in enumerate(beam_search_results):
+                wer_sum += wer(target_transcription[i],
+                               beam_search_result[0][1])
+                wer_counter += 1
+        else:
+            raise ValueError("Decoding method [%s] is not supported." %
+                             args.decode_method)
+
+    print("Final WER = %f" % (wer_sum / wer_counter))
+
+
+def main():
+    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
+    evaluate()
+
+
+if __name__ == '__main__':
+    main()
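The `probs_split` slicing in evaluate() assumes the flat `infer_results` holds exactly `num_steps` probability rows per utterance, in batch order. A standalone sketch of that reshaping with made-up values:

```
# Toy illustration of the probs_split slicing used in evaluate() above:
# infer_results is a flat list of per-time-step probability rows for the
# whole batch, with num_steps consecutive rows belonging to each utterance.
infer_results = ['row0', 'row1', 'row2', 'row3', 'row4', 'row5']
num_utterances = 2
num_steps = len(infer_results) // num_utterances   # 3 rows per utterance
probs_split = [
    infer_results[i * num_steps:(i + 1) * num_steps]
    for i in range(num_utterances)
]
assert probs_split == [['row0', 'row1', 'row2'], ['row3', 'row4', 'row5']]
```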
diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py
index 9037a108e2c5cbf8f5d8544b6fa07057067c9340..dc1430804bfc17da1958baf561e701a4f1e7558e 100644
--- a/deep_speech_2/infer.py
+++ b/deep_speech_2/infer.py
@@ -10,7 +10,9 @@ import multiprocessing
 import paddle.v2 as paddle
 from data_utils.data import DataGenerator
 from model import deep_speech2
-from decoder import ctc_decode
+from decoder import *
+from lm.lm_scorer import LmScorer
+from error_rate import wer
 import utils
 
 parser = argparse.ArgumentParser(description=__doc__)
@@ -44,6 +46,11 @@ parser.add_argument(
     default=multiprocessing.cpu_count(),
     type=int,
     help="Number of cpu threads for preprocessing data. (default: %(default)s)")
+parser.add_argument(
+    "--num_processes_beam_search",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu processes for beam search. (default: %(default)s)")
 parser.add_argument(
     "--mean_std_filepath",
     default='mean_std.npz',
@@ -64,11 +71,48 @@ parser.add_argument(
     default='datasets/vocab/eng_vocab.txt',
     type=str,
     help="Vocabulary filepath. (default: %(default)s)")
+parser.add_argument(
+    "--decode_method",
+    default='beam_search',
+    type=str,
+    help="Method for ctc decoding: best_path or beam_search. (default: %(default)s)"
+)
+parser.add_argument(
+    "--beam_size",
+    default=500,
+    type=int,
+    help="Width for beam search decoding. (default: %(default)d)")
+parser.add_argument(
+    "--num_results_per_sample",
+    default=1,
+    type=int,
+    help="Number of outputs per sample in beam search. (default: %(default)d)")
+parser.add_argument(
+    "--language_model_path",
+    default="lm/data/1Billion.klm",
+    type=str,
+    help="Path for language model. (default: %(default)s)")
+parser.add_argument(
+    "--alpha",
+    default=0.26,
+    type=float,
+    help="Parameter associated with language model. (default: %(default)f)")
+parser.add_argument(
+    "--beta",
+    default=0.1,
+    type=float,
+    help="Parameter associated with word count. (default: %(default)f)")
+parser.add_argument(
+    "--cutoff_prob",
+    default=0.99,
+    type=float,
+    help="The cutoff probability of pruning "
+    "in beam search. (default: %(default)f)")
 args = parser.parse_args()
 
 
 def infer():
-    """Max-ctc-decoding for DeepSpeech2."""
+    """Inference for DeepSpeech2."""
     # initialize data generator
     data_generator = DataGenerator(
         vocab_filepath=args.vocab_filepath,
@@ -102,6 +146,7 @@ def infer():
     batch_reader = data_generator.batch_reader_creator(
         manifest_path=args.decode_manifest_path,
         batch_size=args.num_samples,
+        min_batch_size=1,
         sortagrad=False,
         shuffle_method=None)
     infer_data = batch_reader().next()
@@ -115,16 +160,52 @@ def infer():
         for i in xrange(len(infer_data))
     ]
 
-    # decode and print
-    for i, probs in enumerate(probs_split):
-        output_transcription = ctc_decode(
-            probs_seq=probs,
-            vocabulary=data_generator.vocab_list,
-            method="best_path")
-        target_transcription = ''.join(
+    # target transcription
+    target_transcription = [
+        ''.join(
             [data_generator.vocab_list[index] for index in infer_data[i][1]])
-        print("Target Transcription: %s \nOutput Transcription: %s \n" %
-              (target_transcription, output_transcription))
+        for i, probs in enumerate(probs_split)
+    ]
+
+    ## decode and print
+    # best path decode
+    wer_sum, wer_counter = 0, 0
+    if args.decode_method == "best_path":
+        for i, probs in enumerate(probs_split):
+            best_path_transcription = ctc_best_path_decoder(
+                probs_seq=probs, vocabulary=data_generator.vocab_list)
+            print("\nTarget Transcription: %s\nOutput Transcription: %s" %
+                  (target_transcription[i], best_path_transcription))
+            wer_cur = wer(target_transcription[i], best_path_transcription)
+            wer_sum += wer_cur
+            wer_counter += 1
+            print("cur wer = %f, average wer = %f" %
+                  (wer_cur, wer_sum / wer_counter))
+    # beam search decode
+    elif args.decode_method == "beam_search":
+        ext_scorer = LmScorer(args.alpha, args.beta, args.language_model_path)
+        beam_search_batch_results = ctc_beam_search_decoder_batch(
+            probs_split=probs_split,
+            vocabulary=data_generator.vocab_list,
+            beam_size=args.beam_size,
+            blank_id=len(data_generator.vocab_list),
+            num_processes=args.num_processes_beam_search,
+            cutoff_prob=args.cutoff_prob,
+            ext_scoring_func=ext_scorer, )
+        for i, beam_search_result in enumerate(beam_search_batch_results):
+            print("\nTarget Transcription:\t%s" % target_transcription[i])
+            for index in xrange(args.num_results_per_sample):
+                result = beam_search_result[index]
+                # output: index, log prob, beam result
+                print("Beam %d: %f \t%s" % (index, result[0], result[1]))
+            wer_cur = wer(target_transcription[i], beam_search_result[0][1])
+            wer_sum += wer_cur
+            wer_counter += 1
+            print("cur wer = %f, average wer = %f" %
+                  (wer_cur, wer_sum / wer_counter))
+    else:
+        raise ValueError("Decoding method [%s] is not supported." %
+                         args.decode_method)
 
 
 def main():
diff --git a/deep_speech_2/lm/__init__.py b/deep_speech_2/lm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/deep_speech_2/lm/lm_scorer.py b/deep_speech_2/lm/lm_scorer.py
new file mode 100644
index 0000000000000000000000000000000000000000..463e96d6653b29207fb6105527a1f79c41c7fb84
--- /dev/null
+++ b/deep_speech_2/lm/lm_scorer.py
@@ -0,0 +1,68 @@
+"""External Scorer for Beam Search Decoder."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import os
+import kenlm
+import numpy as np
+
+
+class LmScorer(object):
+    """External scorer to evaluate a prefix or whole sentence in
+    beam search decoding, including the score from n-gram language
+    model and word count.
+
+    :param alpha: Parameter associated with language model. Don't use
+                  language model when alpha = 0.
+    :type alpha: float
+    :param beta: Parameter associated with word count. Don't use word
+                 count when beta = 0.
+    :type beta: float
+    :param model_path: Path to load language model.
+    :type model_path: basestring
+    """
+
+    def __init__(self, alpha, beta, model_path):
+        self._alpha = alpha
+        self._beta = beta
+        if not os.path.isfile(model_path):
+            raise IOError("Invalid language model path: %s" % model_path)
+        self._language_model = kenlm.LanguageModel(model_path)
+
+    # n-gram language model scoring
+    def _language_model_score(self, sentence):
+        # log10 prob of last word
+        log_cond_prob = list(
+            self._language_model.full_scores(sentence, eos=False))[-1][0]
+        return np.power(10, log_cond_prob)
+
+    # word insertion term
+    def _word_count(self, sentence):
+        words = sentence.strip().split(' ')
+        return len(words)
+
+    # reset alpha and beta
+    def reset_params(self, alpha, beta):
+        self._alpha = alpha
+        self._beta = beta
+
+    # execute evaluation
+    def __call__(self, sentence, log=False):
+        """Evaluation function, gathering all the different scores
+        and returning the final one.
+
+        :param sentence: The input sentence for evaluation
+        :type sentence: basestring
+        :param log: Whether to return the score in log representation.
+        :type log: bool
+        :return: Evaluation score, in decimal or log representation.
+        :rtype: float
+        """
+        lm = self._language_model_score(sentence)
+        word_cnt = self._word_count(sentence)
+        if not log:
+            score = np.power(lm, self._alpha) * np.power(word_cnt, self._beta)
+        else:
+            score = self._alpha * np.log(lm) + self._beta * np.log(word_cnt)
+        return score
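To make the combination rule in `__call__` concrete, here is the same arithmetic on made-up numbers, showing that the decimal and log forms agree:

```
import numpy as np

# LmScorer's combination rule on made-up numbers:
# score = lm_prob**alpha * word_count**beta (decimal form)
alpha, beta = 0.26, 0.1
lm_prob = 1e-3     # pretend conditional probability from the n-gram LM
word_count = 5     # words in the decoded prefix
score = np.power(lm_prob, alpha) * np.power(word_count, beta)

# equivalent log-space form, matching __call__(..., log=True)
log_score = alpha * np.log(lm_prob) + beta * np.log(word_count)
assert np.isclose(np.log(score), log_score)
```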
diff --git a/deep_speech_2/lm/run.sh b/deep_speech_2/lm/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..bf523740ce9c78f83fa3946ab52fe5c9ff69cb45
--- /dev/null
+++ b/deep_speech_2/lm/run.sh
@@ -0,0 +1,3 @@
+echo "Downloading language model."
+
+wget -c ftp://xxx/xxx/en.00.UNKNOWN.klm -P ./data
diff --git a/deep_speech_2/requirements.txt b/deep_speech_2/requirements.txt
index 967b4f8c3148c62cd5b7a511567848af6c5c8f93..2ae7d0895a3594059e995e20d106f7c30ef92568 100755
--- a/deep_speech_2/requirements.txt
+++ b/deep_speech_2/requirements.txt
@@ -1,3 +1,4 @@
 wget==3.2
 scipy==0.13.1
-resampy==0.1.5
\ No newline at end of file
+resampy==0.1.5
+https://github.com/kpu/kenlm/archive/master.zip
diff --git a/deep_speech_2/tests/test_decoders.py b/deep_speech_2/tests/test_decoders.py
new file mode 100644
index 0000000000000000000000000000000000000000..99d8a8289d93574c58ced50923716c39cfb96558
--- /dev/null
+++ b/deep_speech_2/tests/test_decoders.py
@@ -0,0 +1,91 @@
+"""Test decoders."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import unittest
+from decoder import *
+
+
+class TestDecoders(unittest.TestCase):
+    def setUp(self):
+        self.vocab_list = ["\'", ' ', 'a', 'b', 'c', 'd']
+        self.beam_size = 20
+        self.probs_seq1 = [[
+            0.06390443, 0.21124858, 0.27323887, 0.06870235, 0.0361254,
+            0.18184413, 0.16493624
+        ], [
+            0.03309247, 0.22866108, 0.24390638, 0.09699597, 0.31895462,
+            0.0094893, 0.06890021
+        ], [
+            0.218104, 0.19992557, 0.18245131, 0.08503348, 0.14903535,
+            0.08424043, 0.08120984
+        ], [
+            0.12094152, 0.19162472, 0.01473646, 0.28045061, 0.24246305,
+            0.05206269, 0.09772094
+        ], [
+            0.1333387, 0.00550838, 0.00301669, 0.21745861, 0.20803985,
+            0.41317442, 0.01946335
+        ], [
+            0.16468227, 0.1980699, 0.1906545, 0.18963251, 0.19860937,
+            0.04377724, 0.01457421
+        ]]
+        self.probs_seq2 = [[
+            0.08034842, 0.22671944, 0.05799633, 0.36814645, 0.11307441,
+            0.04468023, 0.10903471
+        ], [
+            0.09742457, 0.12959763, 0.09435383, 0.21889204, 0.15113123,
+            0.10219457, 0.20640612
+        ], [
+            0.45033529, 0.09091417, 0.15333208, 0.07939558, 0.08649316,
+            0.12298585, 0.01654384
+        ], [
+            0.02512238, 0.22079203, 0.19664364, 0.11906379, 0.07816055,
+            0.22538587, 0.13483174
+        ], [
+            0.17928453, 0.06065261, 0.41153005, 0.1172041, 0.11880313,
+            0.07113197, 0.04139363
+        ], [
+            0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306,
+            0.05294827, 0.22298418
+        ]]
+        self.best_path_result = ["ac'bdc", "b'da"]
+        self.beam_search_result = ['acdc', "b'a"]
+
+    def test_best_path_decoder_1(self):
+        bst_result = ctc_best_path_decoder(self.probs_seq1, self.vocab_list)
+        self.assertEqual(bst_result, self.best_path_result[0])
+
+    def test_best_path_decoder_2(self):
+        bst_result = ctc_best_path_decoder(self.probs_seq2, self.vocab_list)
+        self.assertEqual(bst_result, self.best_path_result[1])
+
+    def test_beam_search_decoder_1(self):
+        beam_result = ctc_beam_search_decoder(
+            probs_seq=self.probs_seq1,
+            beam_size=self.beam_size,
+            vocabulary=self.vocab_list,
+            blank_id=len(self.vocab_list))
+        self.assertEqual(beam_result[0][1], self.beam_search_result[0])
+
+    def test_beam_search_decoder_2(self):
+        beam_result = ctc_beam_search_decoder(
+            probs_seq=self.probs_seq2,
+            beam_size=self.beam_size,
+            vocabulary=self.vocab_list,
+            blank_id=len(self.vocab_list))
+        self.assertEqual(beam_result[0][1], self.beam_search_result[1])
+
+    def test_beam_search_decoder_batch(self):
+        beam_results = ctc_beam_search_decoder_batch(
+            probs_split=[self.probs_seq1, self.probs_seq2],
+            beam_size=self.beam_size,
+            vocabulary=self.vocab_list,
+            blank_id=len(self.vocab_list),
+            num_processes=24)
+        self.assertEqual(beam_results[0][0][1], self.beam_search_result[0])
+        self.assertEqual(beam_results[1][0][1], self.beam_search_result[1])
+
+
+if __name__ == '__main__':
+    unittest.main()
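The nested indexing in the batch assertions is easy to misread; each element of `beam_results` is itself a ranked list of (log_prob, sentence) tuples. A sketch with made-up candidates (the sentences echo the expected test results):

```
# Shape of ctc_beam_search_decoder_batch()'s output, as exercised above:
#   beam_results[i]       -> ranked candidate list for utterance i
#   beam_results[i][0]    -> best (log_prob, sentence) tuple
#   beam_results[i][0][1] -> best decoded sentence
beam_results = [
    [(-0.5, 'acdc'), (-1.2, 'acdd')],  # utterance 0 candidates (made up)
    [(-0.7, "b'a"), (-1.5, "b'aa")],   # utterance 1 candidates (made up)
]
best_sentences = [candidates[0][1] for candidates in beam_results]
assert best_sentences == ['acdc', "b'a"]
```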
diff --git a/deep_speech_2/tune.py b/deep_speech_2/tune.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e9e268fb5036aee36f8f3d6a0a0525b342232bb
--- /dev/null
+++ b/deep_speech_2/tune.py
@@ -0,0 +1,219 @@
+"""Parameter tuning for DeepSpeech2 model."""
+from __future__ import absolute_import
+from __future__ import division
+from __future__ import print_function
+
+import distutils.util
+import argparse
+import gzip
+import multiprocessing
+import numpy as np
+import paddle.v2 as paddle
+from data_utils.data import DataGenerator
+from model import deep_speech2
+from decoder import *
+from lm.lm_scorer import LmScorer
+from error_rate import wer
+import utils
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+    "--num_samples",
+    default=100,
+    type=int,
+    help="Number of samples for parameter tuning. (default: %(default)s)")
+parser.add_argument(
+    "--num_conv_layers",
+    default=2,
+    type=int,
+    help="Convolution layer number. (default: %(default)s)")
+parser.add_argument(
+    "--num_rnn_layers",
+    default=3,
+    type=int,
+    help="RNN layer number. (default: %(default)s)")
+parser.add_argument(
+    "--rnn_layer_size",
+    default=512,
+    type=int,
+    help="RNN layer cell number. (default: %(default)s)")
+parser.add_argument(
+    "--use_gpu",
+    default=True,
+    type=distutils.util.strtobool,
+    help="Use gpu or not. (default: %(default)s)")
+parser.add_argument(
+    "--num_threads_data",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu threads for preprocessing data. (default: %(default)s)")
+parser.add_argument(
+    "--num_processes_beam_search",
+    default=multiprocessing.cpu_count(),
+    type=int,
+    help="Number of cpu processes for beam search. (default: %(default)s)")
+parser.add_argument(
+    "--mean_std_filepath",
+    default='mean_std.npz',
+    type=str,
+    help="Manifest path for normalizer. (default: %(default)s)")
+parser.add_argument(
+    "--decode_manifest_path",
+    default='datasets/manifest.test',
+    type=str,
+    help="Manifest path for decoding. (default: %(default)s)")
+parser.add_argument(
+    "--model_filepath",
+    default='checkpoints/params.latest.tar.gz',
+    type=str,
+    help="Model filepath. (default: %(default)s)")
+parser.add_argument(
+    "--vocab_filepath",
+    default='datasets/vocab/eng_vocab.txt',
+    type=str,
+    help="Vocabulary filepath. (default: %(default)s)")
+parser.add_argument(
+    "--beam_size",
+    default=500,
+    type=int,
+    help="Width for beam search decoding. (default: %(default)d)")
+parser.add_argument(
+    "--language_model_path",
+    default="lm/data/1Billion.klm",
+    type=str,
+    help="Path for language model. (default: %(default)s)")
+parser.add_argument(
+    "--alpha_from",
+    default=0.1,
+    type=float,
+    help="Where alpha starts from. (default: %(default)f)")
+parser.add_argument(
+    "--num_alphas",
+    default=14,
+    type=int,
+    help="Number of candidate alphas. (default: %(default)d)")
+parser.add_argument(
+    "--alpha_to",
+    default=0.36,
+    type=float,
+    help="Where alpha ends at. (default: %(default)f)")
+parser.add_argument(
+    "--beta_from",
+    default=0.05,
+    type=float,
+    help="Where beta starts from. (default: %(default)f)")
+parser.add_argument(
+    "--num_betas",
+    default=20,
+    type=int,
+    help="Number of candidate betas. (default: %(default)d)")
+parser.add_argument(
+    "--beta_to",
+    default=1.0,
+    type=float,
+    help="Where beta ends at. (default: %(default)f)")
+parser.add_argument(
+    "--cutoff_prob",
+    default=0.99,
+    type=float,
+    help="The cutoff probability of pruning "
+    "in beam search. (default: %(default)f)")
+args = parser.parse_args()
+
+
+def tune():
+    """Tune parameters alpha and beta on one minibatch."""
+
+    if not args.num_alphas >= 0:
+        raise ValueError("num_alphas must be non-negative!")
+
+    if not args.num_betas >= 0:
+        raise ValueError("num_betas must be non-negative!")
+
+    # initialize data generator
+    data_generator = DataGenerator(
+        vocab_filepath=args.vocab_filepath,
+        mean_std_filepath=args.mean_std_filepath,
+        augmentation_config='{}',
+        num_threads=args.num_threads_data)
+
+    # create network config
+    # paddle.data_type.dense_array is used for variable batch input.
+    # The size 161 * 161 is only a placeholder value and the real shape
+    # of the input batch data will be induced at run time.
+    audio_data = paddle.layer.data(
+        name="audio_spectrogram", type=paddle.data_type.dense_array(161 * 161))
+    text_data = paddle.layer.data(
+        name="transcript_text",
+        type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
+    output_probs = deep_speech2(
+        audio_data=audio_data,
+        text_data=text_data,
+        dict_size=data_generator.vocab_size,
+        num_conv_layers=args.num_conv_layers,
+        num_rnn_layers=args.num_rnn_layers,
+        rnn_size=args.rnn_layer_size,
+        is_inference=True)
+
+    # load parameters
+    parameters = paddle.parameters.Parameters.from_tar(
+        gzip.open(args.model_filepath))
+
+    # prepare infer data
+    batch_reader = data_generator.batch_reader_creator(
+        manifest_path=args.decode_manifest_path,
+        batch_size=args.num_samples,
+        sortagrad=False,
+        shuffle_method=None)
+    # get one batch data for tuning
+    infer_data = batch_reader().next()
+
+    # run inference
+    infer_results = paddle.infer(
+        output_layer=output_probs, parameters=parameters, input=infer_data)
+    num_steps = len(infer_results) // len(infer_data)
+    probs_split = [
+        infer_results[i * num_steps:(i + 1) * num_steps]
+        for i in xrange(0, len(infer_data))
+    ]
+
+    # create grid for search
+    cand_alphas = np.linspace(args.alpha_from, args.alpha_to, args.num_alphas)
+    cand_betas = np.linspace(args.beta_from, args.beta_to, args.num_betas)
+    params_grid = [(alpha, beta) for alpha in cand_alphas
+                   for beta in cand_betas]
+
+    ext_scorer = LmScorer(args.alpha_from, args.beta_from,
+                          args.language_model_path)
+    ## tune parameters in loop
+    for alpha, beta in params_grid:
+        wer_sum, wer_counter = 0, 0
+        # reset scorer
+        ext_scorer.reset_params(alpha, beta)
+        # beam search using multiple processes
+        beam_search_results = ctc_beam_search_decoder_batch(
+            probs_split=probs_split,
+            vocabulary=data_generator.vocab_list,
+            beam_size=args.beam_size,
+            cutoff_prob=args.cutoff_prob,
+            blank_id=len(data_generator.vocab_list),
+            num_processes=args.num_processes_beam_search,
+            ext_scoring_func=ext_scorer, )
+        for i, beam_search_result in enumerate(beam_search_results):
+            target_transcription = ''.join([
+                data_generator.vocab_list[index] for index in infer_data[i][1]
+            ])
+            wer_sum += wer(target_transcription, beam_search_result[0][1])
+            wer_counter += 1
+
+        print("alpha = %f\tbeta = %f\tWER = %f" %
+              (alpha, beta, wer_sum / wer_counter))
+
+
+def main():
+    paddle.init(use_gpu=args.use_gpu, trainer_count=1)
+    tune()
+
+
+if __name__ == '__main__':
+    main()
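For reference, the search grid that tune.py sweeps is just the Cartesian product of two linspaces; a standalone sketch with the defaults above:

```
import numpy as np

# Rebuild tune.py's default search grid: 14 alphas in [0.1, 0.36] crossed
# with 20 betas in [0.05, 1.0], i.e. 280 (alpha, beta) candidates. Each
# candidate re-weights the shared scorer, the held-out minibatch is then
# re-decoded, and the pair with the lowest average WER wins.
cand_alphas = np.linspace(0.1, 0.36, 14)
cand_betas = np.linspace(0.05, 1.0, 20)
params_grid = [(alpha, beta) for alpha in cand_alphas for beta in cand_betas]
assert len(params_grid) == 14 * 20
```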