diff --git a/deep_speech_2/demo_server.py b/deep_speech_2/demo_server.py index c7e7e94a450121ea3c5c12fbbf7df4dfa3a48262..60d972393bff185d98dcf8b67a714b51523442af 100644 --- a/deep_speech_2/demo_server.py +++ b/deep_speech_2/demo_server.py @@ -66,6 +66,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -199,6 +204,7 @@ def start_server(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) # prepare ASR inference handler diff --git a/deep_speech_2/evaluate.py b/deep_speech_2/evaluate.py index 82dcec3c24480d439f8a622964f0a1d90e948cd4..2f87abbde0685d07058e1f6d7796d615509e5bda 100644 --- a/deep_speech_2/evaluate.py +++ b/deep_speech_2/evaluate.py @@ -38,6 +38,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -142,6 +147,7 @@ def evaluate(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) error_rate_func = cer if args.error_rate_type == 'cer' else wer diff --git a/deep_speech_2/infer.py b/deep_speech_2/infer.py index 43643cde70f3421a9faf92e6177c103e4099c97d..91b08932c2438377bb54fb052d9d17e7a8170d39 100644 --- a/deep_speech_2/infer.py +++ b/deep_speech_2/infer.py @@ -33,6 +33,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. 
(default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -143,6 +148,7 @@ def infer(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) result_transcripts = ds2_model.infer_batch( infer_data=infer_data, diff --git a/deep_speech_2/layer.py b/deep_speech_2/layer.py index ef25c0a1b42a7268f99f805cafd36f97762d32dd..a91f694b8e92ff3e3b48e569ecc0a7751d26bee2 100644 --- a/deep_speech_2/layer.py +++ b/deep_speech_2/layer.py @@ -72,6 +72,45 @@ def bidirectional_simple_rnn_bn_layer(name, input, size, act): return paddle.layer.concat(input=[forward_simple_rnn, backward_simple_rnn]) +def bidirectional_gru_bn_layer(name, input, size, act): + """Bidirectional gru layer with sequence-wise batch normalization. + The batch normalization is only performed on input-state weights. + + :param name: Name of the layer. + :type name: string + :param input: Input layer. + :type input: LayerOutput + :param size: Number of RNN cells. + :type size: int + :param act: Activation type. + :type act: BaseActivation + :return: Bidirectional gru layer. + :rtype: LayerOutput + """ + # input-hidden weights shared across bi-directional rnn. 
+ input_proj_forward = paddle.layer.fc( + input=input, + size=size * 3, + act=paddle.activation.Linear(), + bias_attr=False) + input_proj_backward = paddle.layer.fc( + input=input, + size=size * 3, + act=paddle.activation.Linear(), + bias_attr=False) + # batch norm is only performed on input-state projection + input_proj_bn_forward = paddle.layer.batch_norm( + input=input_proj_forward, act=paddle.activation.Linear()) + input_proj_bn_backward = paddle.layer.batch_norm( + input=input_proj_backward, act=paddle.activation.Linear()) + # forward and backward in time + forward_gru = paddle.layer.grumemory( + input=input_proj_bn_forward, act=act, reverse=False) + backward_gru = paddle.layer.grumemory( + input=input_proj_bn_backward, act=act, reverse=True) + return paddle.layer.concat(input=[forward_gru, backward_gru]) + + def conv_group(input, num_stacks): """Convolution group with stacked convolution layers. @@ -87,9 +126,9 @@ def conv_group(input, num_stacks): filter_size=(11, 41), num_channels_in=1, num_channels_out=32, - stride=(3, 2), + stride=(2, 2), padding=(5, 20), - act=paddle.activation.BRelu()) + act=paddle.activation.Relu()) for i in xrange(num_stacks - 1): conv = conv_bn_layer( input=conv, @@ -98,13 +137,13 @@ def conv_group(input, num_stacks): num_channels_out=32, stride=(1, 2), padding=(5, 10), - act=paddle.activation.BRelu()) + act=paddle.activation.Relu()) output_num_channels = 32 output_height = 160 // pow(2, num_stacks) + 1 return conv, output_num_channels, output_height -def rnn_group(input, size, num_stacks): +def rnn_group(input, size, num_stacks, use_gru): """RNN group with stacked bidirectional simple RNN layers. :param input: Input layer. @@ -113,13 +152,25 @@ def rnn_group(input, size, num_stacks): :type size: int :param num_stacks: Number of stacked rnn layers. :type num_stacks: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool :return: Output layer of the RNN group. 
:rtype: LayerOutput """ output = input for i in xrange(num_stacks): - output = bidirectional_simple_rnn_bn_layer( - name=str(i), input=output, size=size, act=paddle.activation.BRelu()) + if use_gru: + output = bidirectional_gru_bn_layer( + name=str(i), + input=output, + size=size, + act=paddle.activation.Relu()) + else: + output = bidirectional_simple_rnn_bn_layer( + name=str(i), + input=output, + size=size, + act=paddle.activation.Relu()) return output @@ -128,7 +179,8 @@ def deep_speech2(audio_data, dict_size, num_conv_layers=2, num_rnn_layers=3, - rnn_size=256): + rnn_size=256, + use_gru=True): """ The whole DeepSpeech2 model structure (a simplified version). @@ -144,6 +196,8 @@ def deep_speech2(audio_data, :type num_rnn_layers: int :param rnn_size: RNN layer size (number of RNN cells). :type rnn_size: int + :param use_gru: Use gru if set True. Use simple rnn if set False. + :type use_gru: bool :return: A tuple of an output unnormalized log probability layer ( before softmax) and a ctc cost layer. 
:rtype: tuple of LayerOutput @@ -161,7 +215,10 @@ def deep_speech2(audio_data, block_y=conv_group_height) # rnn group rnn_group_output = rnn_group( - input=conv2seq, size=rnn_size, num_stacks=num_rnn_layers) + input=conv2seq, + size=rnn_size, + num_stacks=num_rnn_layers, + use_gru=use_gru) fc = paddle.layer.fc( input=rnn_group_output, size=dict_size + 1, diff --git a/deep_speech_2/model.py b/deep_speech_2/model.py index 99412e595af43fa3af16cb7d09274bf19c473eca..eec971c005f2ec7376bd23ed08002da7355af1c5 100644 --- a/deep_speech_2/model.py +++ b/deep_speech_2/model.py @@ -30,9 +30,9 @@ class DeepSpeech2Model(object): """ def __init__(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size, pretrained_model_path): + rnn_layer_size, use_gru, pretrained_model_path): self._create_network(vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size) + rnn_layer_size, use_gru) self._create_parameters(pretrained_model_path) self._inferer = None self._loss_inferer = None @@ -226,7 +226,7 @@ class DeepSpeech2Model(object): gzip.open(model_path)) def _create_network(self, vocab_size, num_conv_layers, num_rnn_layers, - rnn_layer_size): + rnn_layer_size, use_gru): """Create data layers and model network.""" # paddle.data_type.dense_array is used for variable batch input. # The size 161 * 161 is only an placeholder value and the real shape @@ -243,4 +243,5 @@ class DeepSpeech2Model(object): dict_size=vocab_size, num_conv_layers=num_conv_layers, num_rnn_layers=num_rnn_layers, - rnn_size=rnn_layer_size) + rnn_size=rnn_layer_size, + use_gru=use_gru) diff --git a/deep_speech_2/train.py b/deep_speech_2/train.py index 262d8bf0125bec3b225337c3f5de299be3f44ba1..8e95d7bc888316d0433fd331f4fb5c916a6195d3 100644 --- a/deep_speech_2/train.py +++ b/deep_speech_2/train.py @@ -37,9 +37,14 @@ parser.add_argument( help="RNN layer number. (default: %(default)s)") parser.add_argument( "--rnn_layer_size", - default=512, + default=1280, type=int, help="RNN layer cell number. 
(default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--adam_learning_rate", default=5e-4, @@ -170,6 +175,7 @@ def train(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.init_model_path) ds2_model.train( train_batch_reader=train_batch_reader, diff --git a/deep_speech_2/tune.py b/deep_speech_2/tune.py index 328d67a1197634e5f02ad0689056196a8904fc06..8a9b5b6109168da63bd04cad47b17898a4cea354 100644 --- a/deep_speech_2/tune.py +++ b/deep_speech_2/tune.py @@ -34,6 +34,11 @@ parser.add_argument( default=512, type=int, help="RNN layer cell number. (default: %(default)s)") +parser.add_argument( + "--use_gru", + default=True, + type=bool, + help="Use GRU or simple RNN. (default: %(default)s)") parser.add_argument( "--use_gpu", default=True, @@ -158,6 +163,7 @@ def tune(): num_conv_layers=args.num_conv_layers, num_rnn_layers=args.num_rnn_layers, rnn_layer_size=args.rnn_layer_size, + use_gru=args.use_gru, pretrained_model_path=args.model_filepath) # create grid for search