Commit b56a548e authored by Xinghai Sun, committed by GitHub

Merge pull request #227 from xinghai-sun/refactor_config

Reduce the config parsing code (-360 lines) for DS2 and make it look cleaner.
...@@ -9,8 +9,9 @@ from math import log ...@@ -9,8 +9,9 @@ from math import log
import multiprocessing import multiprocessing
def ctc_best_path_decoder(probs_seq, vocabulary): def ctc_greedy_decoder(probs_seq, vocabulary):
"""Best path decoder, also called argmax decoder or greedy decoder. """CTC greedy (best path) decoder.
Path consisting of the most probable tokens are further post-processed to Path consisting of the most probable tokens are further post-processed to
remove consecutive repetitions and all blanks. remove consecutive repetitions and all blanks.
...@@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq, ...@@ -45,10 +46,12 @@ def ctc_beam_search_decoder(probs_seq,
cutoff_prob=1.0, cutoff_prob=1.0,
ext_scoring_func=None, ext_scoring_func=None,
nproc=False): nproc=False):
"""Beam search decoder for CTC-trained network. It utilizes beam search """CTC Beam search decoder.
to approximately select top best decoding labels and returning results
in the descending order. The implementation is based on Prefix It utilizes beam search to approximately select top best decoding
Beam Search (https://arxiv.org/abs/1408.2873), and the unclear part is labels and returning results in the descending order.
The implementation is based on Prefix Beam Search
(https://arxiv.org/abs/1408.2873), and the unclear part is
redesigned. Two important modifications: 1) in the iterative computation redesigned. Two important modifications: 1) in the iterative computation
of probabilities, the assignment operation is changed to accumulation for of probabilities, the assignment operation is changed to accumulation for
one prefix may comes from different paths; 2) the if condition "if l^+ not one prefix may comes from different paths; 2) the if condition "if l^+ not
......
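The hunk above only renames `ctc_best_path_decoder` to `ctc_greedy_decoder` and tightens the docstring; the behaviour is unchanged. For readers new to CTC, here is a minimal sketch of what greedy (best-path) decoding does. It is not the repository's exact implementation, just the three steps the docstring describes: take the most probable token per step, collapse repeats, drop blanks.

```python
import numpy as np

def ctc_greedy_decoder_sketch(probs_seq, vocabulary):
    """probs_seq: matrix of shape (time_steps, len(vocabulary) + 1),
    where the last class is the CTC blank."""
    blank_index = len(vocabulary)
    # 1) best path: the most probable token at every time step
    best_path = np.argmax(np.asarray(probs_seq), axis=1)
    # 2) collapse consecutive repetitions
    collapsed = [int(t) for i, t in enumerate(best_path)
                 if i == 0 or t != best_path[i - 1]]
    # 3) drop blanks and map indices back to characters
    return ''.join(vocabulary[t] for t in collapsed if t != blank_index)

# e.g. vocabulary = ['a', 'b'], blank is index 2
print(ctc_greedy_decoder_sketch(
    [[0.1, 0.2, 0.7], [0.6, 0.3, 0.1], [0.5, 0.4, 0.1], [0.1, 0.8, 0.1]],
    ['a', 'b']))  # -> "ab"
```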
...@@ -3,123 +3,63 @@ import os ...@@ -3,123 +3,63 @@ import os
import time import time
import random import random
import argparse import argparse
import distutils.util import functools
from time import gmtime, strftime from time import gmtime, strftime
import SocketServer import SocketServer
import struct import struct
import wave import wave
import paddle.v2 as paddle import paddle.v2 as paddle
from utils import print_arguments
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import DeepSpeech2Model from model import DeepSpeech2Model
from data_utils.utils import read_manifest from data_utils.utils import read_manifest
from utils import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( add_arg = functools.partial(add_arguments, argparser=parser)
"--host_ip", # yapf: disable
default="localhost", add_arg('host_port', int, 8086, "Server's IP port.")
type=str, add_arg('beam_size', int, 500, "Beam search width.")
help="Server IP address. (default: %(default)s)") add_arg('num_conv_layers', int, 2, "# of convolution layers.")
parser.add_argument( add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
"--host_port", add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
default=8086, add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
type=int, add_arg('beta', float, 0.25, "Coef of WC for beam search.")
help="Server Port. (default: %(default)s)") add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
parser.add_argument( add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
"--speech_save_dir", add_arg('use_gpu', bool, True, "Use GPU or not.")
default="demo_cache", add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
type=str, "bi-directional RNNs. Not for GRU.")
help="Directory for saving demo speech. (default: %(default)s)") add_arg('host_ip', str,
parser.add_argument( 'localhost',
"--vocab_filepath", "Server's IP address.")
default='datasets/vocab/eng_vocab.txt', add_arg('speech_save_dir', str,
type=str, 'demo_cache',
help="Vocabulary filepath. (default: %(default)s)") "Directory to save demo audios.")
parser.add_argument( add_arg('warmup_manifest', str,
"--mean_std_filepath", 'datasets/manifest.test',
default='mean_std.npz', "Filepath of manifest to warm up.")
type=str, add_arg('mean_std_path', str,
help="Manifest path for normalizer. (default: %(default)s)") 'mean_std.npz',
parser.add_argument( "Filepath of normalizer's mean & std.")
"--warmup_manifest_path", add_arg('vocab_path', str,
default='datasets/manifest.test', 'datasets/vocab/eng_vocab.txt',
type=str, "Filepath of vocabulary.")
help="Manifest path for warmup test. (default: %(default)s)") add_arg('model_path', str,
parser.add_argument( './checkpoints/params.latest.tar.gz',
"--specgram_type", "If None, the training starts from scratch, "
default='linear', "otherwise, it resumes from the pre-trained model.")
type=str, add_arg('lang_model_path', str,
help="Feature type of audio data: 'linear' (power spectrum)" 'lm/data/common_crawl_00.prune01111.trie.klm',
" or 'mfcc'. (default: %(default)s)") "Filepath for language model.")
parser.add_argument( add_arg('decoding_method', str,
"--num_conv_layers", 'ctc_beam_search',
default=2, "Decoding method. Options: ctc_beam_search, ctc_greedy",
type=int, choices = ['ctc_beam_search', 'ctc_greedy'])
help="Convolution layer number. (default: %(default)s)") add_arg('specgram_type', str,
parser.add_argument( 'linear',
"--num_rnn_layers", "Audio feature type. Options: linear, mfcc.",
default=3, choices=['linear', 'mfcc'])
type=int, # yapf: disable
help="RNN layer number. (default: %(default)s)")
parser.add_argument(
"--rnn_layer_size",
default=2048,
type=int,
help="RNN layer cell number. (default: %(default)s)")
parser.add_argument(
"--share_rnn_weights",
default=True,
type=distutils.util.strtobool,
help="Whether to share input-hidden weights between forword and backward "
"directional simple RNNs. Only available when use_gru=False. "
"(default: %(default)s)")
parser.add_argument(
"--use_gru",
default=False,
type=distutils.util.strtobool,
help="Use GRU or simple RNN. (default: %(default)s)")
parser.add_argument(
"--use_gpu",
default=True,
type=distutils.util.strtobool,
help="Use gpu or not. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--decode_method",
default='beam_search',
type=str,
help="Method for ctc decoding: best_path or beam_search. "
"(default: %(default)s)")
parser.add_argument(
"--beam_size",
default=100,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha",
default=0.36,
type=float,
help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
"--beta",
default=0.25,
type=float,
help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
args = parser.parse_args() args = parser.parse_args()
...@@ -200,8 +140,8 @@ def start_server(): ...@@ -200,8 +140,8 @@ def start_server():
"""Start the ASR server""" """Start the ASR server"""
# prepare data generator # prepare data generator
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_path,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=1) num_threads=1)
...@@ -212,7 +152,7 @@ def start_server(): ...@@ -212,7 +152,7 @@ def start_server():
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru, use_gru=args.use_gru,
pretrained_model_path=args.model_filepath, pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights) share_rnn_weights=args.share_rnn_weights)
# prepare ASR inference handler # prepare ASR inference handler
...@@ -220,13 +160,13 @@ def start_server(): ...@@ -220,13 +160,13 @@ def start_server():
feature = data_generator.process_utterance(filename, "") feature = data_generator.process_utterance(filename, "")
result_transcript = ds2_model.infer_batch( result_transcript = ds2_model.infer_batch(
infer_data=[feature], infer_data=[feature],
decode_method=args.decode_method, decoding_method=args.decoding_method,
beam_alpha=args.alpha, beam_alpha=args.alpha,
beam_beta=args.beta, beam_beta=args.beta,
beam_size=args.beam_size, beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob, cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list, vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path, language_model_path=args.lang_model_path,
num_processes=1) num_processes=1)
return result_transcript[0] return result_transcript[0]
...@@ -235,7 +175,7 @@ def start_server(): ...@@ -235,7 +175,7 @@ def start_server():
print('Warming up ...') print('Warming up ...')
warm_up_test( warm_up_test(
audio_process_handler=file_to_transcript, audio_process_handler=file_to_transcript,
manifest_path=args.warmup_manifest_path, manifest_path=args.warmup_manifest,
num_test_cases=3) num_test_cases=3)
print('-----------------------------------------------------------') print('-----------------------------------------------------------')
......
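`demo_server.py` still warms the model up on a few utterances from `--warmup_manifest` before serving. The helper itself is not shown in this diff; a rough sketch of what such a warm-up routine typically looks like follows (the one-JSON-object-per-line manifest format with an `audio_filepath` field is an assumption here, not taken from the diff):

```python
import json
import random
import time

def warm_up_test(audio_process_handler, manifest_path, num_test_cases,
                 random_seed=0):
    """Run the recognition handler over a few manifest entries so that the
    model parameters and the language model are loaded before serving."""
    with open(manifest_path) as f:
        manifest = [json.loads(line) for line in f if line.strip()]
    samples = random.Random(random_seed).sample(manifest, num_test_cases)
    for idx, sample in enumerate(samples):
        print("Warm-up Test Case %d: %s" % (idx, sample['audio_filepath']))
        start = time.time()
        transcript = audio_process_handler(sample['audio_filepath'])
        print("Response Time: %f, Transcript: %s" %
              (time.time() - start, transcript))
```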
...@@ -3,147 +3,74 @@ from __future__ import absolute_import ...@@ -3,147 +3,74 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import distutils.util
import argparse import argparse
import multiprocessing import functools
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import DeepSpeech2Model from model import DeepSpeech2Model
from error_rate import wer, cer from error_rate import wer, cer
import utils from utils import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( add_arg = functools.partial(add_arguments, argparser=parser)
"--batch_size", # yapf: disable
default=128, add_arg('batch_size', int, 128, "Minibatch size.")
type=int, add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
help="Minibatch size for evaluation. (default: %(default)s)") add_arg('beam_size', int, 500, "Beam search width.")
parser.add_argument( add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.")
"--trainer_count", add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.")
default=8, add_arg('num_conv_layers', int, 2, "# of convolution layers.")
type=int, add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
help="Trainer number. (default: %(default)s)") add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
parser.add_argument( add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
"--num_conv_layers", add_arg('beta', float, 0.25, "Coef of WC for beam search.")
default=2, add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
type=int, add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
help="Convolution layer number. (default: %(default)s)") add_arg('use_gpu', bool, True, "Use GPU or not.")
parser.add_argument( add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"--num_rnn_layers", "bi-directional RNNs. Not for GRU.")
default=3, add_arg('test_manifest', str,
type=int, 'datasets/manifest.test',
help="RNN layer number. (default: %(default)s)") "Filepath of manifest to evaluate.")
parser.add_argument( add_arg('mean_std_path', str,
"--rnn_layer_size", 'mean_std.npz',
default=2048, "Filepath of normalizer's mean & std.")
type=int, add_arg('vocab_path', str,
help="RNN layer cell number. (default: %(default)s)") 'datasets/vocab/eng_vocab.txt',
parser.add_argument( "Filepath of vocabulary.")
"--share_rnn_weights", add_arg('model_path', str,
default=True, './checkpoints/params.latest.tar.gz',
type=distutils.util.strtobool, "If None, the training starts from scratch, "
help="Whether to share input-hidden weights between forword and backward " "otherwise, it resumes from the pre-trained model.")
"directional simple RNNs. Only available when use_gru=False. " add_arg('lang_model_path', str,
"(default: %(default)s)") 'lm/data/common_crawl_00.prune01111.trie.klm',
parser.add_argument( "Filepath for language model.")
"--use_gru", add_arg('decoding_method', str,
default=False, 'ctc_beam_search',
type=distutils.util.strtobool, "Decoding method. Options: ctc_beam_search, ctc_greedy",
help="Use GRU or simple RNN. (default: %(default)s)") choices = ['ctc_beam_search', 'ctc_greedy'])
parser.add_argument( add_arg('error_rate_type', str,
"--use_gpu", 'wer',
default=True, "Error rate type for evaluation.",
type=distutils.util.strtobool, choices=['wer', 'cer'])
help="Use gpu or not. (default: %(default)s)") add_arg('specgram_type', str,
parser.add_argument( 'linear',
"--num_threads_data", "Audio feature type. Options: linear, mfcc.",
default=multiprocessing.cpu_count() // 2, choices=['linear', 'mfcc'])
type=int, # yapf: disable
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--num_processes_beam_search",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--decode_method",
default='beam_search',
type=str,
help="Method for ctc decoding, best_path or beam_search. "
"(default: %(default)s)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha",
default=0.36,
type=float,
help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
"--beta",
default=0.25,
type=float,
help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
parser.add_argument(
"--beam_size",
default=500,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--decode_manifest_path",
default='datasets/manifest.test',
type=str,
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--error_rate_type",
default='wer',
choices=['wer', 'cer'],
type=str,
help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
"for character error rate. "
"(default: %(default)s)")
args = parser.parse_args() args = parser.parse_args()
def evaluate(): def evaluate():
"""Evaluate on whole test data for DeepSpeech2.""" """Evaluate on whole test data for DeepSpeech2."""
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_path,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_proc_data)
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.test_manifest,
batch_size=args.batch_size, batch_size=args.batch_size,
min_batch_size=1, min_batch_size=1,
sortagrad=False, sortagrad=False,
...@@ -155,7 +82,7 @@ def evaluate(): ...@@ -155,7 +82,7 @@ def evaluate():
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru, use_gru=args.use_gru,
pretrained_model_path=args.model_filepath, pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights) share_rnn_weights=args.share_rnn_weights)
error_rate_func = cer if args.error_rate_type == 'cer' else wer error_rate_func = cer if args.error_rate_type == 'cer' else wer
...@@ -163,14 +90,14 @@ def evaluate(): ...@@ -163,14 +90,14 @@ def evaluate():
for infer_data in batch_reader(): for infer_data in batch_reader():
result_transcripts = ds2_model.infer_batch( result_transcripts = ds2_model.infer_batch(
infer_data=infer_data, infer_data=infer_data,
decode_method=args.decode_method, decoding_method=args.decoding_method,
beam_alpha=args.alpha, beam_alpha=args.alpha,
beam_beta=args.beta, beam_beta=args.beta,
beam_size=args.beam_size, beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob, cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list, vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path, language_model_path=args.lang_model_path,
num_processes=args.num_processes_beam_search) num_processes=args.num_proc_bsearch)
target_transcripts = [ target_transcripts = [
''.join([data_generator.vocab_list[token] for token in transcript]) ''.join([data_generator.vocab_list[token] for token in transcript])
for _, transcript in infer_data for _, transcript in infer_data
...@@ -185,7 +112,7 @@ def evaluate(): ...@@ -185,7 +112,7 @@ def evaluate():
def main(): def main():
utils.print_arguments(args) print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
evaluate() evaluate()
......
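`evaluate.py` averages `error_rate_func(target, result)` over the whole test set, with `error_rate_func` chosen by `--error_rate_type`. As a reference point, a word error rate in the spirit of `error_rate.wer` is the word-level edit distance divided by the reference length; a self-contained sketch (not the module's exact code):

```python
def wer_sketch(reference, hypothesis):
    """Word error rate: word-level edit distance / reference length."""
    ref, hyp = reference.split(), hypothesis.split()
    # dynamic-programming edit distance between the two word sequences
    dist = [[0] * (len(hyp) + 1) for _ in range(len(ref) + 1)]
    for i in range(len(ref) + 1):
        dist[i][0] = i
    for j in range(len(hyp) + 1):
        dist[0][j] = j
    for i in range(1, len(ref) + 1):
        for j in range(1, len(hyp) + 1):
            cost = 0 if ref[i - 1] == hyp[j - 1] else 1
            dist[i][j] = min(dist[i - 1][j] + 1,         # deletion
                             dist[i][j - 1] + 1,         # insertion
                             dist[i - 1][j - 1] + cost)  # substitution
    return float(dist[len(ref)][len(hyp)]) / len(ref)

print(wer_sketch("the quick brown fox", "the quick brown dog"))  # 0.25
```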
...@@ -4,146 +4,72 @@ from __future__ import division ...@@ -4,146 +4,72 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import distutils.util import functools
import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import DeepSpeech2Model from model import DeepSpeech2Model
from error_rate import wer, cer from error_rate import wer, cer
import utils from utils import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( add_arg = functools.partial(add_arguments, argparser=parser)
"--num_samples", # yapf: disable
default=10, add_arg('num_samples', int, 10, "# of samples to infer.")
type=int, add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
help="Number of samples for inference. (default: %(default)s)") add_arg('beam_size', int, 500, "Beam search width.")
parser.add_argument( add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.")
"--num_conv_layers", add_arg('num_conv_layers', int, 2, "# of convolution layers.")
default=2, add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
type=int, add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
help="Convolution layer number. (default: %(default)s)") add_arg('alpha', float, 0.36, "Coef of LM for beam search.")
parser.add_argument( add_arg('beta', float, 0.25, "Coef of WC for beam search.")
"--num_rnn_layers", add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
default=3, add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
type=int, add_arg('use_gpu', bool, True, "Use GPU or not.")
help="RNN layer number. (default: %(default)s)") add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
parser.add_argument( "bi-directional RNNs. Not for GRU.")
"--rnn_layer_size", add_arg('infer_manifest', str,
default=2048, 'datasets/manifest.dev',
type=int, "Filepath of manifest to infer.")
help="RNN layer cell number. (default: %(default)s)") add_arg('mean_std_path', str,
parser.add_argument( 'mean_std.npz',
"--share_rnn_weights", "Filepath of normalizer's mean & std.")
default=True, add_arg('vocab_path', str,
type=distutils.util.strtobool, 'datasets/vocab/eng_vocab.txt',
help="Whether to share input-hidden weights between forword and backward " "Filepath of vocabulary.")
"directional simple RNNs. Only available when use_gru=False. " add_arg('lang_model_path', str,
"(default: %(default)s)") 'lm/data/common_crawl_00.prune01111.trie.klm',
parser.add_argument( "Filepath for language model.")
"--use_gru", add_arg('model_path', str,
default=False, './checkpoints/params.latest.tar.gz',
type=distutils.util.strtobool, "If None, the training starts from scratch, "
help="Use GRU or simple RNN. (default: %(default)s)") "otherwise, it resumes from the pre-trained model.")
parser.add_argument( add_arg('decoding_method', str,
"--use_gpu", 'ctc_beam_search',
default=True, "Decoding method. Options: ctc_beam_search, ctc_greedy",
type=distutils.util.strtobool, choices = ['ctc_beam_search', 'ctc_greedy'])
help="Use gpu or not. (default: %(default)s)") add_arg('error_rate_type', str,
parser.add_argument( 'wer',
"--num_threads_data", "Error rate type for evaluation.",
default=1, choices=['wer', 'cer'])
type=int, add_arg('specgram_type', str,
help="Number of cpu threads for preprocessing data. (default: %(default)s)") 'linear',
parser.add_argument( "Audio feature type. Options: linear, mfcc.",
"--num_processes_beam_search", choices=['linear', 'mfcc'])
default=multiprocessing.cpu_count() // 2, # yapf: disable
type=int,
help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--decode_manifest_path",
default='datasets/manifest.test',
type=str,
help="Manifest path for decoding. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--decode_method",
default='beam_search',
type=str,
help="Method for ctc decoding: best_path or beam_search. "
"(default: %(default)s)")
parser.add_argument(
"--beam_size",
default=500,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha",
default=0.36,
type=float,
help="Parameter associated with language model. (default: %(default)f)")
parser.add_argument(
"--beta",
default=0.25,
type=float,
help="Parameter associated with word count. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
parser.add_argument(
"--error_rate_type",
default='wer',
choices=['wer', 'cer'],
type=str,
help="Error rate type for evaluation. 'wer' for word error rate and 'cer' "
"for character error rate. "
"(default: %(default)s)")
args = parser.parse_args() args = parser.parse_args()
def infer(): def infer():
"""Inference for DeepSpeech2.""" """Inference for DeepSpeech2."""
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_path,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=1)
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.decode_manifest_path, manifest_path=args.infer_manifest,
batch_size=args.num_samples, batch_size=args.num_samples,
min_batch_size=1, min_batch_size=1,
sortagrad=False, sortagrad=False,
...@@ -156,18 +82,18 @@ def infer(): ...@@ -156,18 +82,18 @@ def infer():
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru, use_gru=args.use_gru,
pretrained_model_path=args.model_filepath, pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights) share_rnn_weights=args.share_rnn_weights)
result_transcripts = ds2_model.infer_batch( result_transcripts = ds2_model.infer_batch(
infer_data=infer_data, infer_data=infer_data,
decode_method=args.decode_method, decoding_method=args.decoding_method,
beam_alpha=args.alpha, beam_alpha=args.alpha,
beam_beta=args.beta, beam_beta=args.beta,
beam_size=args.beam_size, beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob, cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list, vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path, language_model_path=args.lang_model_path,
num_processes=args.num_processes_beam_search) num_processes=args.num_proc_bsearch)
error_rate_func = cer if args.error_rate_type == 'cer' else wer error_rate_func = cer if args.error_rate_type == 'cer' else wer
target_transcripts = [ target_transcripts = [
...@@ -182,7 +108,7 @@ def infer(): ...@@ -182,7 +108,7 @@ def infer():
def main(): def main():
utils.print_arguments(args) print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
infer() infer()
......
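Both `infer.py` and `evaluate.py` rebuild reference transcripts from token ids with the list comprehension shown above. A toy illustration with a made-up vocabulary:

```python
vocab_list = ['a', 'b', 'c', ' ']          # hypothetical vocabulary
infer_data = [(None, [0, 1, 3, 2]),        # (audio features, transcript ids)
              (None, [2, 0, 1])]
target_transcripts = [
    ''.join([vocab_list[token] for token in transcript])
    for _, transcript in infer_data
]
print(target_transcripts)  # ['ab c', 'cab']
```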
...@@ -146,7 +146,7 @@ class DeepSpeech2Model(object): ...@@ -146,7 +146,7 @@ class DeepSpeech2Model(object):
# run inference # run inference
return self._loss_inferer.infer(input=infer_data) return self._loss_inferer.infer(input=infer_data)
def infer_batch(self, infer_data, decode_method, beam_alpha, beam_beta, def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
beam_size, cutoff_prob, vocab_list, language_model_path, beam_size, cutoff_prob, vocab_list, language_model_path,
num_processes): num_processes):
"""Model inference. Infer the transcription for a batch of speech """Model inference. Infer the transcription for a batch of speech
...@@ -156,9 +156,9 @@ class DeepSpeech2Model(object): ...@@ -156,9 +156,9 @@ class DeepSpeech2Model(object):
consisting of a tuple of audio features and consisting of a tuple of audio features and
transcription text (empty string). transcription text (empty string).
:type infer_data: list :type infer_data: list
:param decode_method: Decoding method name, 'best_path' or :param decoding_method: Decoding method name, 'ctc_greedy' or
'beam search'. 'ctc_beam_search'.
:param decode_method: string :param decoding_method: string
:param beam_alpha: Parameter associated with language model. :param beam_alpha: Parameter associated with language model.
:type beam_alpha: float :type beam_alpha: float
:param beam_beta: Parameter associated with word count. :param beam_beta: Parameter associated with word count.
...@@ -190,13 +190,13 @@ class DeepSpeech2Model(object): ...@@ -190,13 +190,13 @@ class DeepSpeech2Model(object):
] ]
# run decoder # run decoder
results = [] results = []
if decode_method == "best_path": if decoding_method == "ctc_greedy":
# best path decode # best path decode
for i, probs in enumerate(probs_split): for i, probs in enumerate(probs_split):
output_transcription = ctc_best_path_decoder( output_transcription = ctc_greedy_decoder(
probs_seq=probs, vocabulary=vocab_list) probs_seq=probs, vocabulary=vocab_list)
results.append(output_transcription) results.append(output_transcription)
elif decode_method == "beam_search": elif decoding_method == "ctc_beam_search":
# initialize external scorer # initialize external scorer
if self._ext_scorer == None: if self._ext_scorer == None:
self._ext_scorer = LmScorer(beam_alpha, beam_beta, self._ext_scorer = LmScorer(beam_alpha, beam_beta,
...@@ -205,7 +205,6 @@ class DeepSpeech2Model(object): ...@@ -205,7 +205,6 @@ class DeepSpeech2Model(object):
else: else:
self._ext_scorer.reset_params(beam_alpha, beam_beta) self._ext_scorer.reset_params(beam_alpha, beam_beta)
assert self._loaded_lm_path == language_model_path assert self._loaded_lm_path == language_model_path
# beam search decode # beam search decode
beam_search_results = ctc_beam_search_decoder_batch( beam_search_results = ctc_beam_search_decoder_batch(
probs_split=probs_split, probs_split=probs_split,
...@@ -219,7 +218,7 @@ class DeepSpeech2Model(object): ...@@ -219,7 +218,7 @@ class DeepSpeech2Model(object):
results = [result[0][1] for result in beam_search_results] results = [result[0][1] for result in beam_search_results]
else: else:
raise ValueError("Decoding method [%s] is not supported." % raise ValueError("Decoding method [%s] is not supported." %
decode_method) decoding_method)
return results return results
def _create_parameters(self, model_path=None): def _create_parameters(self, model_path=None):
......
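Note that `infer_batch` builds the external LM scorer only on the first call and afterwards just resets alpha and beta, asserting that the language model path has not changed; this is what keeps repeated calls (for example from the tuning grid search) cheap. The caching pattern in isolation, with a stub standing in for the real `LmScorer`:

```python
class _StubScorer(object):
    """Stand-in for LmScorer, only to illustrate the caching pattern."""
    def __init__(self, alpha, beta, lm_path):
        self.alpha, self.beta, self.lm_path = alpha, beta, lm_path

    def reset_params(self, alpha, beta):
        self.alpha, self.beta = alpha, beta

class DecoderWithCachedScorer(object):
    def __init__(self):
        self._ext_scorer = None
        self._loaded_lm_path = None

    def infer_batch(self, beam_alpha, beam_beta, language_model_path):
        if self._ext_scorer is None:
            # expensive: load the language model exactly once
            self._ext_scorer = _StubScorer(beam_alpha, beam_beta,
                                           language_model_path)
            self._loaded_lm_path = language_model_path
        else:
            # cheap: only the decoding weights change between calls
            self._ext_scorer.reset_params(beam_alpha, beam_beta)
            assert self._loaded_lm_path == language_model_path
```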
...@@ -49,16 +49,16 @@ class TestDecoders(unittest.TestCase): ...@@ -49,16 +49,16 @@ class TestDecoders(unittest.TestCase):
0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306, 0.15882358, 0.1235788, 0.23376776, 0.20510435, 0.00279306,
0.05294827, 0.22298418 0.05294827, 0.22298418
]] ]]
self.best_path_result = ["ac'bdc", "b'da"] self.greedy_result = ["ac'bdc", "b'da"]
self.beam_search_result = ['acdc', "b'a"] self.beam_search_result = ['acdc', "b'a"]
def test_best_path_decoder_1(self): def test_greedy_decoder_1(self):
bst_result = ctc_best_path_decoder(self.probs_seq1, self.vocab_list) bst_result = ctc_greedy_decoder(self.probs_seq1, self.vocab_list)
self.assertEqual(bst_result, self.best_path_result[0]) self.assertEqual(bst_result, self.greedy_result[0])
def test_best_path_decoder_2(self): def test_greedy_decoder_2(self):
bst_result = ctc_best_path_decoder(self.probs_seq2, self.vocab_list) bst_result = ctc_greedy_decoder(self.probs_seq2, self.vocab_list)
self.assertEqual(bst_result, self.best_path_result[1]) self.assertEqual(bst_result, self.greedy_result[1])
def test_beam_search_decoder_1(self): def test_beam_search_decoder_1(self):
beam_result = ctc_beam_search_decoder( beam_result = ctc_beam_search_decoder(
......
...@@ -7,32 +7,29 @@ from __future__ import division ...@@ -7,32 +7,29 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import functools
import codecs import codecs
import json import json
from collections import Counter from collections import Counter
import os.path import os.path
import _init_paths import _init_paths
from data_utils import utils from data_utils import utils
from utils import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( add_arg = functools.partial(add_arguments, argparser=parser)
"--manifest_paths", # yapf: disable
type=str, add_arg('count_threshold', int, 0, "Truncation threshold for char counts.")
help="Manifest paths for building vocabulary." add_arg('vocab_path', str,
'datasets/vocab/zh_vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.", "You can provide multiple manifest files.",
nargs='+', nargs='+',
required=True) required=True)
parser.add_argument( # yapf: disable
"--count_threshold",
default=0,
type=int,
help="Characters whose counts are below the threshold will be truncated. "
"(default: %(default)i)")
parser.add_argument(
"--vocab_path",
default='datasets/vocab/zh_vocab.txt',
type=str,
help="File path to write the vocabulary. (default: %(default)s)")
args = parser.parse_args() args = parser.parse_args()
...@@ -44,6 +41,8 @@ def count_manifest(counter, manifest_path): ...@@ -44,6 +41,8 @@ def count_manifest(counter, manifest_path):
def main(): def main():
print_arguments(args)
counter = Counter() counter = Counter()
for manifest_path in args.manifest_paths: for manifest_path in args.manifest_paths:
count_manifest(counter, manifest_path) count_manifest(counter, manifest_path)
......
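`build_vocab.py` counts characters over all `--manifest_paths` and writes those whose count clears `--count_threshold` to `--vocab_path`. A condensed sketch of that flow (assuming, as elsewhere in the repo, one JSON object per manifest line with a `text` field):

```python
import codecs
import json
from collections import Counter

def count_manifest(counter, manifest_path):
    with codecs.open(manifest_path, 'r', 'utf-8') as f:
        for line in f:
            if line.strip():
                counter.update(json.loads(line)['text'])

def build_vocab(manifest_paths, vocab_path, count_threshold=0):
    counter = Counter()
    for manifest_path in manifest_paths:
        count_manifest(counter, manifest_path)
    count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
    with codecs.open(vocab_path, 'w', 'utf-8') as fout:
        for char, count in count_sorted:
            if count >= count_threshold:  # chars below the threshold are truncated
                fout.write(char + '\n')
```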
...@@ -4,48 +4,35 @@ from __future__ import division ...@@ -4,48 +4,35 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import functools
import _init_paths import _init_paths
from data_utils.normalizer import FeatureNormalizer from data_utils.normalizer import FeatureNormalizer
from data_utils.augmentor.augmentation import AugmentationPipeline from data_utils.augmentor.augmentation import AugmentationPipeline
from data_utils.featurizer.audio_featurizer import AudioFeaturizer from data_utils.featurizer.audio_featurizer import AudioFeaturizer
from utils import add_arguments, print_arguments
parser = argparse.ArgumentParser(
description='Computing mean and stddev for feature normalizer.') parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( add_arg = functools.partial(add_arguments, argparser=parser)
"--specgram_type", # yapf: disable
default='linear', add_arg('num_samples', int, 2000, "# of samples to for statistics.")
type=str, add_arg('specgram_type', str,
help="Feature type of audio data: 'linear' (power spectrum)" 'linear',
" or 'mfcc'. (default: %(default)s)") "Audio feature type. Options: linear, mfcc.",
parser.add_argument( choices=['linear', 'mfcc'])
"--manifest_path", add_arg('manifest_path', str,
default='datasets/manifest.train', 'datasets/manifest.train',
type=str, "Filepath of manifest to compute normalizer's mean and stddev.")
help="Manifest path for computing normalizer's mean and stddev." add_arg('output_path', str,
"(default: %(default)s)") 'mean_std.npz',
parser.add_argument( "Filepath of write mean and stddev to (.npz).")
"--num_samples", # yapf: disable
default=2000,
type=int,
help="Number of samples for computing mean and stddev. "
"(default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default='{}',
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
parser.add_argument(
"--output_file",
default='mean_std.npz',
type=str,
help="Filepath to write mean and std to (.npz)."
"(default: %(default)s)")
args = parser.parse_args() args = parser.parse_args()
def main(): def main():
augmentation_pipeline = AugmentationPipeline(args.augmentation_config) print_arguments(args)
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type) audio_featurizer = AudioFeaturizer(specgram_type=args.specgram_type)
def augment_and_featurize(audio_segment): def augment_and_featurize(audio_segment):
...@@ -57,7 +44,7 @@ def main(): ...@@ -57,7 +44,7 @@ def main():
manifest_path=args.manifest_path, manifest_path=args.manifest_path,
featurize_func=augment_and_featurize, featurize_func=augment_and_featurize,
num_samples=args.num_samples) num_samples=args.num_samples)
normalizer.write_to_file(args.output_file) normalizer.write_to_file(args.output_path)
if __name__ == '__main__': if __name__ == '__main__':
......
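`compute_mean_std.py` featurizes `--num_samples` sampled utterances and stores the per-dimension mean and standard deviation in an `.npz` file, which the other scripts later load through `--mean_std_path`. The underlying computation is roughly the following numpy sketch (the `(feature_dim, time)` layout is an assumption, not a guarantee about `FeatureNormalizer`):

```python
import numpy as np

def compute_mean_std(features, output_path):
    """features: list of 2-D arrays, each of shape (feature_dim, time)."""
    stacked = np.hstack(features)               # concatenate along time
    mean = np.mean(stacked, axis=1).reshape(-1, 1)
    std = np.std(stacked, axis=1).reshape(-1, 1)
    np.savez(output_path, mean=mean, std=std)
    return mean, std

# later, normalization of a new utterance's features:
# normalized = (feature - mean) / (std + 1e-20)
```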
...@@ -4,174 +4,91 @@ from __future__ import division ...@@ -4,174 +4,91 @@ from __future__ import division
from __future__ import print_function from __future__ import print_function
import argparse import argparse
import distutils.util import functools
import multiprocessing
import paddle.v2 as paddle import paddle.v2 as paddle
from model import DeepSpeech2Model from model import DeepSpeech2Model
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
import utils from utils import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( add_arg = functools.partial(add_arguments, argparser=parser)
"--batch_size", default=256, type=int, help="Minibatch size.") # yapf: disable
parser.add_argument( add_arg('batch_size', int, 256, "Minibatch size.")
"--num_passes", add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
default=200, add_arg('num_passes', int, 200, "# of training epochs.")
type=int, add_arg('num_proc_data', int, 12, "# of CPUs for data preprocessing.")
help="Training pass number. (default: %(default)s)") add_arg('num_conv_layers', int, 2, "# of convolution layers.")
parser.add_argument( add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
"--num_iterations_print", add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
default=100, add_arg('num_iter_print', int, 100, "Every # iterations for printing "
type=int, "train cost.")
help="Number of iterations for every train cost printing. " add_arg('learning_rate', float, 5e-4, "Learning rate.")
"(default: %(default)s)") add_arg('max_duration', float, 27.0, "Longest audio duration allowed.")
parser.add_argument( add_arg('min_duration', float, 0.0, "Shortest audio duration allowed.")
"--num_conv_layers", add_arg('use_sortagrad', bool, True, "Use SortaGrad or not.")
default=2, add_arg('use_gpu', bool, True, "Use GPU or not.")
type=int, add_arg('is_local', bool, True, "Use pserver or not.")
help="Convolution layer number. (default: %(default)s)") add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
parser.add_argument( add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
"--num_rnn_layers", "bi-directional RNNs. Not for GRU.")
default=3, add_arg('train_manifest', str,
type=int, 'datasets/manifest.train',
help="RNN layer number. (default: %(default)s)") "Filepath of train manifest.")
parser.add_argument( add_arg('dev_manifest', str,
"--rnn_layer_size", 'datasets/manifest.dev',
default=2048, "Filepath of validation manifest.")
type=int, add_arg('mean_std_path', str,
help="RNN layer cell number. (default: %(default)s)") 'mean_std.npz',
parser.add_argument( "Filepath of normalizer's mean & std.")
"--share_rnn_weights", add_arg('vocab_path', str,
default=True, 'datasets/vocab/eng_vocab.txt',
type=distutils.util.strtobool, "Filepath of vocabulary.")
help="Whether to share input-hidden weights between forword and backward " add_arg('init_model_path', str,
"directional simple RNNs. Only available when use_gru=False. " None,
"(default: %(default)s)") "If None, the training starts from scratch, "
parser.add_argument( "otherwise, it resumes from the pre-trained model.")
"--use_gru", add_arg('output_model_dir', str,
default=False, "./checkpoints",
type=distutils.util.strtobool, "Directory for saving checkpoints.")
help="Use GRU or simple RNN. (default: %(default)s)") add_arg('augment_conf_path',str,
parser.add_argument( 'conf/augmentation.config',
"--adam_learning_rate", "Filepath of augmentation configuration file (json-format).")
default=5e-4, add_arg('specgram_type', str,
type=float, 'linear',
help="Learning rate for ADAM Optimizer. (default: %(default)s)") "Audio feature type. Options: linear, mfcc.",
parser.add_argument( choices=['linear', 'mfcc'])
"--use_gpu", add_arg('shuffle_method', str,
default=True, 'batch_shuffle_clipped',
type=distutils.util.strtobool, "Shuffle method.",
help="Use gpu or not. (default: %(default)s)") choices=['instance_shuffle', 'batch_shuffle', 'batch_shuffle_clipped'])
parser.add_argument( # yapf: disable
"--use_sortagrad",
default=True,
type=distutils.util.strtobool,
help="Use sortagrad or not. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--max_duration",
default=27.0,
type=float,
help="Audios with duration larger than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--min_duration",
default=0.0,
type=float,
help="Audios with duration smaller than this will be discarded. "
"(default: %(default)s)")
parser.add_argument(
"--shuffle_method",
default='batch_shuffle_clipped',
type=str,
help="Shuffle method: 'instance_shuffle', 'batch_shuffle', "
"'batch_shuffle_batch'. (default: %(default)s)")
parser.add_argument(
"--trainer_count",
default=8,
type=int,
help="Trainer number. (default: %(default)s)")
parser.add_argument(
"--num_threads_data",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--train_manifest_path",
default='datasets/manifest.train',
type=str,
help="Manifest path for training. (default: %(default)s)")
parser.add_argument(
"--dev_manifest_path",
default='datasets/manifest.dev',
type=str,
help="Manifest path for validation. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--init_model_path",
default=None,
type=str,
help="If set None, the training will start from scratch. "
"Otherwise, the training will resume from "
"the existing model of this path. (default: %(default)s)")
parser.add_argument(
"--output_model_dir",
default="./checkpoints",
type=str,
help="Directory for saving models. (default: %(default)s)")
parser.add_argument(
"--augmentation_config",
default=open('conf/augmentation.config', 'r').read(),
type=str,
help="Augmentation configuration in json-format. "
"(default: %(default)s)")
parser.add_argument(
"--is_local",
default=True,
type=distutils.util.strtobool,
help="Set to false if running with pserver in paddlecloud. "
"(default: %(default)s)")
args = parser.parse_args() args = parser.parse_args()
def train(): def train():
"""DeepSpeech2 training.""" """DeepSpeech2 training."""
train_generator = DataGenerator( train_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_path,
augmentation_config=args.augmentation_config, augmentation_config=open(args.augment_conf_path, 'r').read(),
max_duration=args.max_duration, max_duration=args.max_duration,
min_duration=args.min_duration, min_duration=args.min_duration,
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_proc_data)
dev_generator = DataGenerator( dev_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_path,
augmentation_config="{}", augmentation_config="{}",
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=args.num_proc_data)
train_batch_reader = train_generator.batch_reader_creator( train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest_path, manifest_path=args.train_manifest,
batch_size=args.batch_size, batch_size=args.batch_size,
min_batch_size=args.trainer_count, min_batch_size=args.trainer_count,
sortagrad=args.use_sortagrad if args.init_model_path is None else False, sortagrad=args.use_sortagrad if args.init_model_path is None else False,
shuffle_method=args.shuffle_method) shuffle_method=args.shuffle_method)
dev_batch_reader = dev_generator.batch_reader_creator( dev_batch_reader = dev_generator.batch_reader_creator(
manifest_path=args.dev_manifest_path, manifest_path=args.dev_manifest,
batch_size=args.batch_size, batch_size=args.batch_size,
min_batch_size=1, # must be 1, but will have errors. min_batch_size=1, # must be 1, but will have errors.
sortagrad=False, sortagrad=False,
...@@ -184,21 +101,21 @@ def train(): ...@@ -184,21 +101,21 @@ def train():
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru, use_gru=args.use_gru,
pretrained_model_path=args.init_model_path, pretrained_model_path=args.init_model_path,
share_rnn_weights=args.share_rnn_weights) share_rnn_weights=args.share_weights)
ds2_model.train( ds2_model.train(
train_batch_reader=train_batch_reader, train_batch_reader=train_batch_reader,
dev_batch_reader=dev_batch_reader, dev_batch_reader=dev_batch_reader,
feeding_dict=train_generator.feeding, feeding_dict=train_generator.feeding,
learning_rate=args.adam_learning_rate, learning_rate=args.learning_rate,
gradient_clipping=400, gradient_clipping=400,
num_passes=args.num_passes, num_passes=args.num_passes,
num_iterations_print=args.num_iterations_print, num_iterations_print=args.num_iter_print,
output_model_dir=args.output_model_dir, output_model_dir=args.output_model_dir,
is_local=args.is_local) is_local=args.is_local)
def main(): def main():
utils.print_arguments(args) print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
train() train()
......
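`train.py` keeps SortaGrad on only when training starts from scratch (`sortagrad=args.use_sortagrad if args.init_model_path is None else False`). SortaGrad sorts the first epoch by utterance duration so early minibatches are short and gradients stay stable, then shuffles normally afterwards; a toy sketch of that reader behaviour (the `duration` field in manifest entries is an assumption here):

```python
import random

def sortagrad_batches(manifest, batch_size, sortagrad=True):
    """manifest: list of dicts, each with a 'duration' field in seconds."""
    epoch = 0
    while True:
        items = list(manifest)
        if sortagrad and epoch == 0:
            items.sort(key=lambda x: x['duration'])  # shortest first in epoch 0
        else:
            random.shuffle(items)                    # normal shuffling afterwards
        for i in range(0, len(items), batch_size):
            yield items[i:i + batch_size]
        epoch += 1
```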
"""Parameters tuning for DeepSpeech2 model.""" """Beam search parameters tuning for DeepSpeech2 model."""
from __future__ import absolute_import from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import numpy as np import numpy as np
import distutils.util
import argparse import argparse
import multiprocessing import functools
import paddle.v2 as paddle import paddle.v2 as paddle
from data_utils.data import DataGenerator from data_utils.data import DataGenerator
from model import DeepSpeech2Model from model import DeepSpeech2Model
from error_rate import wer from error_rate import wer
import utils from utils import add_arguments, print_arguments
parser = argparse.ArgumentParser(description=__doc__) parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument( add_arg = functools.partial(add_arguments, argparser=parser)
"--num_samples", # yapf: disable
default=100, add_arg('num_samples', int, 100, "# of samples to infer.")
type=int, add_arg('trainer_count', int, 8, "# of Trainers (CPUs or GPUs).")
help="Number of samples for parameters tuning. (default: %(default)s)") add_arg('beam_size', int, 500, "Beam search width.")
parser.add_argument( add_arg('num_proc_bsearch', int, 12, "# of CPUs for beam search.")
"--num_conv_layers", add_arg('num_conv_layers', int, 2, "# of convolution layers.")
default=2, add_arg('num_rnn_layers', int, 3, "# of recurrent layers.")
type=int, add_arg('rnn_layer_size', int, 2048, "# of recurrent cells per layer.")
help="Convolution layer number. (default: %(default)s)") add_arg('num_alphas', int, 14, "# of alpha candidates for tuning.")
parser.add_argument( add_arg('num_betas', int, 20, "# of beta candidates for tuning.")
"--num_rnn_layers", add_arg('alpha_from', float, 0.1, "Where alpha starts tuning from.")
default=3, add_arg('alpha_to', float, 0.36, "Where alpha ends tuning with.")
type=int, add_arg('beta_from', float, 0.05, "Where beta starts tuning from.")
help="RNN layer number. (default: %(default)s)") add_arg('beta_to', float, 0.36, "Where beta ends tuning with.")
parser.add_argument( add_arg('cutoff_prob', float, 0.99, "Cutoff probability for pruning.")
"--rnn_layer_size", add_arg('use_gru', bool, False, "Use GRUs instead of simple RNNs.")
default=2048, add_arg('use_gpu', bool, True, "Use GPU or not.")
type=int, add_arg('share_rnn_weights',bool, True, "Share input-hidden weights across "
help="RNN layer cell number. (default: %(default)s)") "bi-directional RNNs. Not for GRU.")
parser.add_argument( add_arg('tune_manifest', str,
"--share_rnn_weights", 'datasets/manifest.test',
default=True, "Filepath of manifest to tune.")
type=distutils.util.strtobool, add_arg('mean_std_path', str,
help="Whether to share input-hidden weights between forword and backward " 'mean_std.npz',
"directional simple RNNs. Only available when use_gru=False. " "Filepath of normalizer's mean & std.")
"(default: %(default)s)") add_arg('vocab_path', str,
parser.add_argument( 'datasets/vocab/eng_vocab.txt',
"--use_gru", "Filepath of vocabulary.")
default=False, add_arg('lang_model_path', str,
type=distutils.util.strtobool, 'lm/data/common_crawl_00.prune01111.trie.klm',
help="Use GRU or simple RNN. (default: %(default)s)") "Filepath for language model.")
parser.add_argument( add_arg('model_path', str,
"--use_gpu", './checkpoints/params.latest.tar.gz',
default=True, "If None, the training starts from scratch, "
type=distutils.util.strtobool, "otherwise, it resumes from the pre-trained model.")
help="Use gpu or not. (default: %(default)s)") add_arg('error_rate_type', str,
parser.add_argument( 'wer',
"--trainer_count", "Error rate type for evaluation.",
default=8, choices=['wer', 'cer'])
type=int, add_arg('specgram_type', str,
help="Trainer number. (default: %(default)s)") 'linear',
parser.add_argument( "Audio feature type. Options: linear, mfcc.",
"--num_threads_data", choices=['linear', 'mfcc'])
default=1, # yapf: disable
type=int,
help="Number of cpu threads for preprocessing data. (default: %(default)s)")
parser.add_argument(
"--num_processes_beam_search",
default=multiprocessing.cpu_count() // 2,
type=int,
help="Number of cpu processes for beam search. (default: %(default)s)")
parser.add_argument(
"--specgram_type",
default='linear',
type=str,
help="Feature type of audio data: 'linear' (power spectrum)"
" or 'mfcc'. (default: %(default)s)")
parser.add_argument(
"--mean_std_filepath",
default='mean_std.npz',
type=str,
help="Manifest path for normalizer. (default: %(default)s)")
parser.add_argument(
"--tune_manifest_path",
default='datasets/manifest.dev',
type=str,
help="Manifest path for tuning. (default: %(default)s)")
parser.add_argument(
"--model_filepath",
default='checkpoints/params.latest.tar.gz',
type=str,
help="Model filepath. (default: %(default)s)")
parser.add_argument(
"--vocab_filepath",
default='datasets/vocab/eng_vocab.txt',
type=str,
help="Vocabulary filepath. (default: %(default)s)")
parser.add_argument(
"--beam_size",
default=500,
type=int,
help="Width for beam search decoding. (default: %(default)d)")
parser.add_argument(
"--language_model_path",
default="lm/data/common_crawl_00.prune01111.trie.klm",
type=str,
help="Path for language model. (default: %(default)s)")
parser.add_argument(
"--alpha_from",
default=0.1,
type=float,
help="Where alpha starts from. (default: %(default)f)")
parser.add_argument(
"--num_alphas",
default=14,
type=int,
help="Number of candidate alphas. (default: %(default)d)")
parser.add_argument(
"--alpha_to",
default=0.36,
type=float,
help="Where alpha ends with. (default: %(default)f)")
parser.add_argument(
"--beta_from",
default=0.05,
type=float,
help="Where beta starts from. (default: %(default)f)")
parser.add_argument(
"--num_betas",
default=20,
type=float,
help="Number of candidate betas. (default: %(default)d)")
parser.add_argument(
"--beta_to",
default=1.0,
type=float,
help="Where beta ends with. (default: %(default)f)")
parser.add_argument(
"--cutoff_prob",
default=0.99,
type=float,
help="The cutoff probability of pruning"
"in beam search. (default: %(default)f)")
args = parser.parse_args() args = parser.parse_args()
...@@ -149,13 +69,13 @@ def tune(): ...@@ -149,13 +69,13 @@ def tune():
raise ValueError("num_betas must be non-negative!") raise ValueError("num_betas must be non-negative!")
data_generator = DataGenerator( data_generator = DataGenerator(
vocab_filepath=args.vocab_filepath, vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_filepath, mean_std_filepath=args.mean_std_path,
augmentation_config='{}', augmentation_config='{}',
specgram_type=args.specgram_type, specgram_type=args.specgram_type,
num_threads=args.num_threads_data) num_threads=1)
batch_reader = data_generator.batch_reader_creator( batch_reader = data_generator.batch_reader_creator(
manifest_path=args.tune_manifest_path, manifest_path=args.tune_manifest,
batch_size=args.num_samples, batch_size=args.num_samples,
sortagrad=False, sortagrad=False,
shuffle_method=None) shuffle_method=None)
...@@ -171,7 +91,7 @@ def tune(): ...@@ -171,7 +91,7 @@ def tune():
num_rnn_layers=args.num_rnn_layers, num_rnn_layers=args.num_rnn_layers,
rnn_layer_size=args.rnn_layer_size, rnn_layer_size=args.rnn_layer_size,
use_gru=args.use_gru, use_gru=args.use_gru,
pretrained_model_path=args.model_filepath, pretrained_model_path=args.model_path,
share_rnn_weights=args.share_rnn_weights) share_rnn_weights=args.share_rnn_weights)
# create grid for search # create grid for search
...@@ -184,14 +104,14 @@ def tune(): ...@@ -184,14 +104,14 @@ def tune():
for alpha, beta in params_grid: for alpha, beta in params_grid:
result_transcripts = ds2_model.infer_batch( result_transcripts = ds2_model.infer_batch(
infer_data=tune_data, infer_data=tune_data,
decode_method='beam_search', decoding_method='ctc_beam_search',
beam_alpha=alpha, beam_alpha=alpha,
beam_beta=beta, beam_beta=beta,
beam_size=args.beam_size, beam_size=args.beam_size,
cutoff_prob=args.cutoff_prob, cutoff_prob=args.cutoff_prob,
vocab_list=data_generator.vocab_list, vocab_list=data_generator.vocab_list,
language_model_path=args.language_model_path, language_model_path=args.lang_model_path,
num_processes=args.num_processes_beam_search) num_processes=args.num_proc_bsearch)
wer_sum, num_ins = 0.0, 0 wer_sum, num_ins = 0.0, 0
for target, result in zip(target_transcripts, result_transcripts): for target, result in zip(target_transcripts, result_transcripts):
wer_sum += wer(target, result) wer_sum += wer(target, result)
...@@ -201,7 +121,7 @@ def tune(): ...@@ -201,7 +121,7 @@ def tune():
def main(): def main():
utils.print_arguments(args) print_arguments(args)
paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count) paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
tune() tune()
......
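`tune.py` sweeps a `num_alphas` x `num_betas` grid over `[alpha_from, alpha_to]` x `[beta_from, beta_to]` and reports the WER for every pair. The grid construction and selection loop boil down to roughly this, with `run_decode_and_score` standing in for the `infer_batch` plus WER computation:

```python
import itertools
import numpy as np

def tune_sketch(run_decode_and_score, num_alphas, num_betas,
                alpha_from, alpha_to, beta_from, beta_to):
    cand_alphas = np.linspace(alpha_from, alpha_to, num_alphas)
    cand_betas = np.linspace(beta_from, beta_to, num_betas)
    params_grid = [(round(a, 3), round(b, 3))
                   for a, b in itertools.product(cand_alphas, cand_betas)]
    best = (None, None, float('inf'))
    for alpha, beta in params_grid:
        avg_wer = run_decode_and_score(alpha, beta)
        print("alpha = %f, beta = %f, WER = %f" % (alpha, beta, avg_wer))
        if avg_wer < best[2]:
            best = (alpha, beta, avg_wer)
    return best
```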
...@@ -3,6 +3,8 @@ from __future__ import absolute_import ...@@ -3,6 +3,8 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
import distutils.util
def print_arguments(args): def print_arguments(args):
"""Print argparse's arguments. """Print argparse's arguments.
...@@ -19,7 +21,27 @@ def print_arguments(args): ...@@ -19,7 +21,27 @@ def print_arguments(args):
:param args: Input argparse.Namespace for printing. :param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace :type args: argparse.Namespace
""" """
print("----- Configuration Arguments -----") print("----------- Configuration Arguments -----------")
for arg, value in vars(args).iteritems(): for arg, value in sorted(vars(args).iteritems()):
print("%s: %s" % (arg, value)) print("%s: %s" % (arg, value))
print("------------------------------------") print("------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
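A small end-to-end example of the new helper pair in `utils.py`: `functools.partial` binds `argparser=parser`, so each later `add_arg(...)` call is a one-liner, and `bool` arguments are routed through `distutils.util.strtobool` so that `--use_gpu False` is parsed correctly on the command line (this sketch assumes `utils.py` is importable from the working directory):

```python
import argparse
import functools
from utils import add_arguments, print_arguments  # assumes repo root on path

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('batch_size', int,  256,  "Minibatch size.")
add_arg('use_gpu',    bool, True, "Use GPU or not.")
args = parser.parse_args(['--use_gpu', 'False'])

print_arguments(args)
# ----------- Configuration Arguments -----------
# batch_size: 256
# use_gpu: 0
# ------------------------------------------------
```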