diff --git a/deepspeech/exps/deepspeech2/model.py b/deepspeech/exps/deepspeech2/model.py
index 09280cf9f880cb119d1f2c2c0050cbba363d4c25..46ef915c6fbed72c3db0adeb8bc35377cb0a9a4f 100644
--- a/deepspeech/exps/deepspeech2/model.py
+++ b/deepspeech/exps/deepspeech2/model.py
@@ -305,11 +305,12 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             exit(-1)
 
     def export(self):
-        self.infer_model.eval()
+        infer_model = DeepSpeech2InferModel.from_pretrained(
+            self.test_loader.dataset, self.config, self.args.checkpoint_path)
+        infer_model.eval()
         feat_dim = self.test_loader.dataset.feature_size
-        paddle.jit.save(
-            self.infer_model,
-            self.args.export_path,
+        static_model = paddle.jit.to_static(
+            infer_model,
             input_spec=[
                 paddle.static.InputSpec(
                     shape=[None, feat_dim, None],
@@ -317,6 +318,8 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
                 paddle.static.InputSpec(shape=[None],
                                         dtype='int64'),  # audio_length, [B]
             ])
+        logger.info(f"Export code: {static_model.forward.code}")
+        paddle.jit.save(static_model, self.args.export_path)
 
     def run_export(self):
         try:
@@ -349,12 +352,7 @@ class DeepSpeech2Tester(DeepSpeech2Trainer):
             rnn_size=config.model.rnn_layer_size,
             use_gru=config.model.use_gru,
             share_rnn_weights=config.model.share_rnn_weights)
-
-        infer_model = DeepSpeech2InferModel.from_pretrained(
-            self.test_loader.dataset, config, self.args.checkpoint_path)
-
         self.model = model
-        self.infer_model = infer_model
         self.logger.info("Setup model!")
 
     def setup_dataloader(self):
diff --git a/deepspeech/models/deepspeech2.py b/deepspeech/models/deepspeech2.py
index b58260749e872e74f90242e6569d1dbd888d554e..ffe678a692333e97b7a64c1277aee67428fa3777 100644
--- a/deepspeech/models/deepspeech2.py
+++ b/deepspeech/models/deepspeech2.py
@@ -24,17 +24,14 @@ from paddle import nn
 from paddle.nn import functional as F
 from paddle.nn import initializer as I
 
-from deepspeech.modules.conv import ConvStack
-from deepspeech.modules.rnn import RNNStack
 from deepspeech.modules.mask import sequence_mask
 from deepspeech.modules.activation import brelu
+from deepspeech.modules.conv import ConvStack
+from deepspeech.modules.rnn import RNNStack
+from deepspeech.modules.ctc import CTCDecoder
+
 from deepspeech.utils import checkpoint
 from deepspeech.utils import layer_tools
-from deepspeech.decoders.swig_wrapper import Scorer
-from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
-from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
-
-from deepspeech.modules.loss import CTCLoss
 
 logger = logging.getLogger(__name__)
 
@@ -105,178 +102,6 @@ class CRNNEncoder(nn.Layer):
         return x, x_lens
 
 
-class CTCDecoder(nn.Layer):
-    def __init__(self, enc_n_units, vocab_size):
-        super().__init__()
-        self.blank_id = vocab_size
-        self.output = nn.Linear(enc_n_units,
-                                vocab_size + 1)  # blank id is last id
-        self.criterion = CTCLoss(self.blank_id)
-
-        self._ext_scorer = None
-
-    def forward(self, eout, eout_lens, texts, texts_len):
-        """Compute CTC Loss
-
-        Args:
-            eout (Tensor):
-            eout_lens (Tensor):
-            texts (Tenosr):
-            texts_len (Tensor):
-        Returns:
-            loss (Tenosr): [1]
-        """
-        logits = self.output(eout)
-        loss = self.criterion(logits, texts, eout_lens, texts_len)
-        return loss
-
-    def probs(self, eouts, temperature=1.):
-        """Get CTC probabilities.
-        Args:
-            eouts (FloatTensor): `[B, T, enc_units]`
-        Returns:
-            probs (FloatTensor): `[B, T, vocab]`
-        """
-        return F.softmax(self.output(eouts) / temperature, axis=-1)
-
-    def scores(self, eouts, temperature=1.):
-        """Get log-scale CTC probabilities.
-        Args:
-            eouts (FloatTensor): `[B, T, enc_units]`
-        Returns:
-            log_probs (FloatTensor): `[B, T, vocab]`
-        """
-        return F.log_softmax(self.output(eouts) / temperature, axis=-1)
-
-    def _decode_batch_greedy(self, probs_split, vocab_list):
-        """Decode by best path for a batch of probs matrix input.
-        :param probs_split: List of 2-D probability matrix, and each consists
-                            of prob vectors for one speech utterancce.
-        :param probs_split: List of matrix
-        :param vocab_list: List of tokens in the vocabulary, for decoding.
-        :type vocab_list: list
-        :return: List of transcription texts.
-        :rtype: List of str
-        """
-        results = []
-        for i, probs in enumerate(probs_split):
-            output_transcription = ctc_greedy_decoder(
-                probs_seq=probs, vocabulary=vocab_list)
-            results.append(output_transcription)
-        return results
-
-    def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
-                         vocab_list):
-        """Initialize the external scorer.
-        :param beam_alpha: Parameter associated with language model.
-        :type beam_alpha: float
-        :param beam_beta: Parameter associated with word count.
-        :type beam_beta: float
-        :param language_model_path: Filepath for language model. If it is
-                                    empty, the external scorer will be set to
-                                    None, and the decoding method will be pure
-                                    beam search without scorer.
-        :type language_model_path: str|None
-        :param vocab_list: List of tokens in the vocabulary, for decoding.
-        :type vocab_list: list
-        """
-        # init once
-        if self._ext_scorer != None:
-            return
-
-        if language_model_path != '':
-            logger.info("begin to initialize the external scorer "
-                        "for decoding")
-            self._ext_scorer = Scorer(beam_alpha, beam_beta,
-                                      language_model_path, vocab_list)
-            lm_char_based = self._ext_scorer.is_character_based()
-            lm_max_order = self._ext_scorer.get_max_order()
-            lm_dict_size = self._ext_scorer.get_dict_size()
-            logger.info("language model: "
-                        "is_character_based = %d," % lm_char_based +
-                        " max_order = %d," % lm_max_order + " dict_size = %d" %
-                        lm_dict_size)
-            logger.info("end initializing scorer")
-        else:
-            self._ext_scorer = None
-            logger.info("no language model provided, "
-                        "decoding by pure beam search without scorer.")
-
-    def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
-                                  beam_size, cutoff_prob, cutoff_top_n,
-                                  vocab_list, num_processes):
-        """Decode by beam search for a batch of probs matrix input.
-        :param probs_split: List of 2-D probability matrix, and each consists
-                            of prob vectors for one speech utterancce.
-        :param probs_split: List of matrix
-        :param beam_alpha: Parameter associated with language model.
-        :type beam_alpha: float
-        :param beam_beta: Parameter associated with word count.
-        :type beam_beta: float
-        :param beam_size: Width for Beam search.
-        :type beam_size: int
-        :param cutoff_prob: Cutoff probability in pruning,
-                            default 1.0, no pruning.
-        :type cutoff_prob: float
-        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
-                             characters with highest probs in vocabulary will be
-                             used in beam search, default 40.
-        :type cutoff_top_n: int
-        :param vocab_list: List of tokens in the vocabulary, for decoding.
-        :type vocab_list: list
-        :param num_processes: Number of processes (CPU) for decoder.
-        :type num_processes: int
-        :return: List of transcription texts.
-        :rtype: List of str
-        """
-        if self._ext_scorer != None:
-            self._ext_scorer.reset_params(beam_alpha, beam_beta)
-
-        # beam search decode
-        num_processes = min(num_processes, len(probs_split))
-        beam_search_results = ctc_beam_search_decoder_batch(
-            probs_split=probs_split,
-            vocabulary=vocab_list,
-            beam_size=beam_size,
-            num_processes=num_processes,
-            ext_scoring_func=self._ext_scorer,
-            cutoff_prob=cutoff_prob,
-            cutoff_top_n=cutoff_top_n)
-
-        results = [result[0][1] for result in beam_search_results]
-        return results
-
-    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
-                    decoding_method):
-        if decoding_method == "ctc_beam_search":
-            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
-                                  vocab_list)
-
-    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
-                     lang_model_path, beam_alpha, beam_beta, beam_size,
-                     cutoff_prob, cutoff_top_n, num_processes):
-        """ probs: activation after softmax
-        logits_len: audio output lens
-        """
-        probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
-        if decoding_method == "ctc_greedy":
-            result_transcripts = self._decode_batch_greedy(
-                probs_split=probs_split, vocab_list=vocab_list)
-        elif decoding_method == "ctc_beam_search":
-            result_transcripts = self._decode_batch_beam_search(
-                probs_split=probs_split,
-                beam_alpha=beam_alpha,
-                beam_beta=beam_beta,
-                beam_size=beam_size,
-                cutoff_prob=cutoff_prob,
-                cutoff_top_n=cutoff_top_n,
-                vocab_list=vocab_list,
-                num_processes=num_processes)
-        else:
-            raise ValueError(f"Not support: {decoding_method}")
-        return result_transcripts
-
-
 class DeepSpeech2Model(nn.Layer):
     """The DeepSpeech2 network structure.
 
@@ -339,8 +164,13 @@ class DeepSpeech2Model(nn.Layer):
             use_gru=use_gru,
             share_rnn_weights=share_rnn_weights)
         assert (self.encoder.output_size == rnn_size * 2)
+
         self.decoder = CTCDecoder(
-            enc_n_units=self.encoder.output_size, vocab_size=dict_size)
+            enc_n_units=self.encoder.output_size,
+            odim=dict_size + 1,  # <blank> is appended after the vocab
+            blank_id=dict_size,  # the last token is <blank>
+            dropout_rate=0.0,
+            reduction=True)
 
     def forward(self, audio, text, audio_len, text_len):
         """Compute Model loss
diff --git a/deepspeech/modules/activation.py b/deepspeech/modules/activation.py
index 14861fcf730bf993f13e0b9019d8842fb67143c0..a42bd1e740a30689a73e69d11f9747efb2a8ee0c 100644
--- a/deepspeech/modules/activation.py
+++ b/deepspeech/modules/activation.py
@@ -14,6 +14,7 @@
 import logging
 
 import numpy as np
+import math
 
 import paddle
 from paddle import nn
@@ -22,7 +23,7 @@ from paddle.nn import initializer as I
 
 logger = logging.getLogger(__name__)
 
-__all__ = ['brelu']
+__all__ = ['brelu', "softplus", "gelu_accurate", "gelu", 'Swish']
 
 
 def brelu(x, t_min=0.0, t_max=24.0, name=None):
@@ -30,3 +31,38 @@ def brelu(x, t_min=0.0, t_max=24.0, name=None):
     t_min = paddle.full(shape=[1], fill_value=t_min, dtype='float32')
     t_max = paddle.full(shape=[1], fill_value=t_max, dtype='float32')
     return x.maximum(t_min).minimum(t_max)
+
+
+def softplus(x):
+    """Softplus function."""
+    if hasattr(paddle.nn.functional, 'softplus'):
+        #return paddle.nn.functional.softplus(x.float()).type_as(x)
+        return paddle.nn.functional.softplus(x)
+    else:
+        raise NotImplementedError
+
+
+def gelu_accurate(x):
+    """Gaussian Error Linear Units (GELU) activation."""
+    # [reference] https://github.com/pytorch/fairseq/blob/e75cff5f2c1d62f12dc911e0bf420025eb1a4e33/fairseq/modules/gelu.py
+    if not hasattr(gelu_accurate, "_a"):
+        gelu_accurate._a = math.sqrt(2 / math.pi)
+    return 0.5 * x * (1 + paddle.tanh(gelu_accurate._a *
+                          (x + 0.044715 * paddle.pow(x, 3))))
+
+
+def gelu(x):
+    """Gaussian Error Linear Units (GELU) activation."""
+    if hasattr(paddle.nn.functional, 'gelu'):
+        #return paddle.nn.functional.gelu(x.float()).type_as(x)
+        return paddle.nn.functional.gelu(x)
+    else:
+        return x * 0.5 * (1.0 + paddle.erf(x / math.sqrt(2.0)))
+
+
+class Swish(nn.Layer):
+    """Construct a Swish object."""
+
+    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
+        """Return Swish activation function."""
+        return x * F.sigmoid(x)
diff --git a/deepspeech/modules/ctc.py b/deepspeech/modules/ctc.py
new file mode 100644
index 0000000000000000000000000000000000000000..66737f599b6ea6923164b9da7bf9c3055f21fe28
--- /dev/null
+++ b/deepspeech/modules/ctc.py
@@ -0,0 +1,238 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import logging
+from typeguard import check_argument_types
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+from deepspeech.decoders.swig_wrapper import Scorer
+from deepspeech.decoders.swig_wrapper import ctc_greedy_decoder
+from deepspeech.decoders.swig_wrapper import ctc_beam_search_decoder_batch
+from deepspeech.modules.loss import CTCLoss
+
+logger = logging.getLogger(__name__)
+
+__all__ = ['CTCDecoder']
+
+
+class CTCDecoder(nn.Layer):
+    def __init__(self,
+                 enc_n_units,
+                 odim,
+                 blank_id=0,
+                 dropout_rate: float=0.0,
+                 reduction: bool=True):
+        """CTC decoder
+
+        Args:
+            enc_n_units ([int]): encoder output dimension
+            odim ([int]): output dimension, i.e. vocabulary size plus one for the blank token
+            dropout_rate (float): dropout rate (0.0 ~ 1.0)
+            reduction (bool): reduce the CTC loss into a scalar
+        """
+        assert check_argument_types()
+        super().__init__()
+
+        self.blank_id = blank_id
+        self.odim = odim
+        self.dropout_rate = dropout_rate
+        self.ctc_lo = nn.Linear(enc_n_units, self.odim)
+        reduction_type = "sum" if reduction else "none"
+        self.criterion = CTCLoss(blank=self.blank_id, reduction=reduction_type)
+
+        # CTCDecoder LM Score handle
+        self._ext_scorer = None
+
+    def forward(self, hs_pad, hlens, ys_pad, ys_lens):
+        """Calculate CTC loss.
+
+        Args:
+            hs_pad (Tensor): batch of padded hidden state sequences (B, Tmax, D)
+            hlens (Tensor): batch of lengths of hidden state sequences (B)
+            ys_pad (Tensor): batch of padded character id sequence tensor (B, Lmax)
+            ys_lens (Tensor): batch of lengths of character sequence (B)
+        Returns:
+            loss (Tensor): scalar.
+        """
+        logits = self.ctc_lo(F.dropout(hs_pad, p=self.dropout_rate))
+        loss = self.criterion(logits, ys_pad, hlens, ys_lens)
+        return loss
+
+    def probs(self, eouts: paddle.Tensor, temperature: float=1.0):
+        """Get CTC probabilities.
+        Args:
+            eouts (FloatTensor): `[B, T, enc_units]`
+        Returns:
+            probs (FloatTensor): `[B, T, odim]`
+        """
+        return F.softmax(self.ctc_lo(eouts) / temperature, axis=-1)
+
+    def scores(self, eouts: paddle.Tensor, temperature: float=1.0):
+        """Get log-scale CTC probabilities.
+        Args:
+            eouts (FloatTensor): `[B, T, enc_units]`
+        Returns:
+            log_probs (FloatTensor): `[B, T, odim]`
+        """
+        return F.log_softmax(self.ctc_lo(eouts) / temperature, axis=-1)
+
+    def log_softmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
+        """log_softmax of frame activations
+        Args:
+            Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
+        Returns:
+            paddle.Tensor: log softmax applied 3d tensor (B, Tmax, odim)
+        """
+        return self.scores(hs_pad)
+
+    def argmax(self, hs_pad: paddle.Tensor) -> paddle.Tensor:
+        """argmax of frame activations
+        Args:
+            paddle.Tensor hs_pad: 3d tensor (B, Tmax, eprojs)
+        Returns:
+            paddle.Tensor: argmax applied 2d tensor (B, Tmax)
+        """
+        return paddle.argmax(self.ctc_lo(hs_pad), axis=2)
+
+    def _decode_batch_greedy(self, probs_split, vocab_list):
+        """Decode by best path for a batch of probs matrix input.
+        :param probs_split: List of 2-D probability matrix, and each consists
+                            of prob vectors for one speech utterance.
+        :param probs_split: List of matrix
+        :param vocab_list: List of tokens in the vocabulary, for decoding.
+        :type vocab_list: list
+        :return: List of transcription texts.
+        :rtype: List of str
+        """
+        results = []
+        for i, probs in enumerate(probs_split):
+            output_transcription = ctc_greedy_decoder(
+                probs_seq=probs, vocabulary=vocab_list)
+            results.append(output_transcription)
+        return results
+
+    def _init_ext_scorer(self, beam_alpha, beam_beta, language_model_path,
+                         vocab_list):
+        """Initialize the external scorer.
+        :param beam_alpha: Parameter associated with language model.
+        :type beam_alpha: float
+        :param beam_beta: Parameter associated with word count.
+        :type beam_beta: float
+        :param language_model_path: Filepath for language model. If it is
+                                    empty, the external scorer will be set to
+                                    None, and the decoding method will be pure
+                                    beam search without scorer.
+        :type language_model_path: str|None
+        :param vocab_list: List of tokens in the vocabulary, for decoding.
+        :type vocab_list: list
+        """
+        # init once
+        if self._ext_scorer != None:
+            return
+
+        if language_model_path != '':
+            logger.info("begin to initialize the external scorer "
+                        "for decoding")
+            self._ext_scorer = Scorer(beam_alpha, beam_beta,
+                                      language_model_path, vocab_list)
+            lm_char_based = self._ext_scorer.is_character_based()
+            lm_max_order = self._ext_scorer.get_max_order()
+            lm_dict_size = self._ext_scorer.get_dict_size()
+            logger.info("language model: "
+                        "is_character_based = %d," % lm_char_based +
+                        " max_order = %d," % lm_max_order + " dict_size = %d" %
+                        lm_dict_size)
+            logger.info("end initializing scorer")
+        else:
+            self._ext_scorer = None
+            logger.info("no language model provided, "
+                        "decoding by pure beam search without scorer.")
+
+    def _decode_batch_beam_search(self, probs_split, beam_alpha, beam_beta,
+                                  beam_size, cutoff_prob, cutoff_top_n,
+                                  vocab_list, num_processes):
+        """Decode by beam search for a batch of probs matrix input.
+        :param probs_split: List of 2-D probability matrix, and each consists
+                            of prob vectors for one speech utterance.
+        :param probs_split: List of matrix
+        :param beam_alpha: Parameter associated with language model.
+        :type beam_alpha: float
+        :param beam_beta: Parameter associated with word count.
+        :type beam_beta: float
+        :param beam_size: Width for Beam search.
+        :type beam_size: int
+        :param cutoff_prob: Cutoff probability in pruning,
+                            default 1.0, no pruning.
+        :type cutoff_prob: float
+        :param cutoff_top_n: Cutoff number in pruning, only top cutoff_top_n
+                             characters with highest probs in vocabulary will be
+                             used in beam search, default 40.
+        :type cutoff_top_n: int
+        :param vocab_list: List of tokens in the vocabulary, for decoding.
+        :type vocab_list: list
+        :param num_processes: Number of processes (CPU) for decoder.
+        :type num_processes: int
+        :return: List of transcription texts.
+        :rtype: List of str
+        """
+        if self._ext_scorer != None:
+            self._ext_scorer.reset_params(beam_alpha, beam_beta)
+
+        # beam search decode
+        num_processes = min(num_processes, len(probs_split))
+        beam_search_results = ctc_beam_search_decoder_batch(
+            probs_split=probs_split,
+            vocabulary=vocab_list,
+            beam_size=beam_size,
+            num_processes=num_processes,
+            ext_scoring_func=self._ext_scorer,
+            cutoff_prob=cutoff_prob,
+            cutoff_top_n=cutoff_top_n)
+
+        results = [result[0][1] for result in beam_search_results]
+        return results
+
+    def init_decode(self, beam_alpha, beam_beta, lang_model_path, vocab_list,
+                    decoding_method):
+        if decoding_method == "ctc_beam_search":
+            self._init_ext_scorer(beam_alpha, beam_beta, lang_model_path,
+                                  vocab_list)
+
+    def decode_probs(self, probs, logits_lens, vocab_list, decoding_method,
+                     lang_model_path, beam_alpha, beam_beta, beam_size,
+                     cutoff_prob, cutoff_top_n, num_processes):
+        """ probs: activation after softmax
+        logits_len: audio output lens
+        """
+        probs_split = [probs[i, :l, :] for i, l in enumerate(logits_lens)]
+        if decoding_method == "ctc_greedy":
+            result_transcripts = self._decode_batch_greedy(
+                probs_split=probs_split, vocab_list=vocab_list)
+        elif decoding_method == "ctc_beam_search":
+            result_transcripts = self._decode_batch_beam_search(
+                probs_split=probs_split,
+                beam_alpha=beam_alpha,
+                beam_beta=beam_beta,
+                beam_size=beam_size,
+                cutoff_prob=cutoff_prob,
+                cutoff_top_n=cutoff_top_n,
+                vocab_list=vocab_list,
+                num_processes=num_processes)
+        else:
+            raise ValueError(f"Not supported: {decoding_method}")
+        return result_transcripts
diff --git a/deepspeech/modules/embedding.py b/deepspeech/modules/embedding.py
new file mode 100644
index 0000000000000000000000000000000000000000..114bcd25f420ec614e4914652a82a8f85e6c9805
--- /dev/null
+++ b/deepspeech/modules/embedding.py
@@ -0,0 +1,132 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Positional Encoding Module."""
+
+import math
+import logging
+import numpy as np
+from typing import Tuple
+
+import paddle
+from paddle import nn
+from paddle.nn import functional as F
+from paddle.nn import initializer as I
+
+logger = logging.getLogger(__name__)
+
+__all__ = ["PositionalEncoding", "RelPositionalEncoding"]
+
+# TODO(Hui Zhang): remove this hack
+paddle.float32 = 'float32'
+
+
+class PositionalEncoding(nn.Layer):
+    def __init__(self,
+                 d_model: int,
+                 dropout_rate: float,
+                 max_len: int=5000,
+                 reverse: bool=False):
+        """Positional encoding.
+            PE(pos, 2i)   = sin(pos/(10000^(2i/dmodel)))
+            PE(pos, 2i+1) = cos(pos/(10000^(2i/dmodel)))
+        Args:
+            d_model (int): embedding dim.
+            dropout_rate (float): dropout rate.
+            max_len (int, optional): maximum input length. Defaults to 5000.
+            reverse (bool, optional): Not used. Defaults to False.
+        """
+        super().__init__()
+        self.d_model = d_model
+        self.max_len = max_len
+        self.xscale = paddle.to_tensor(math.sqrt(self.d_model))
+        self.dropout = nn.Dropout(p=dropout_rate)
+        self.pe = paddle.zeros([self.max_len, self.d_model])  #[T,D]
+
+        position = paddle.arange(
+            0, self.max_len, dtype=paddle.float32).unsqueeze(1)
+        div_term = paddle.exp(
+            paddle.arange(0, self.d_model, 2, dtype=paddle.float32) *
+            -(math.log(10000.0) / self.d_model))
+
+        self.pe[:, 0::2] = paddle.sin(position * div_term)
+        self.pe[:, 1::2] = paddle.cos(position * div_term)
+        self.pe = self.pe.unsqueeze(0)  #[1, T, D]
+
+    def forward(self, x: paddle.Tensor,
+                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """Add positional encoding.
+        Args:
+            x (paddle.Tensor): Input. Its shape is (batch, time, ...)
+            offset (int): position offset
+        Returns:
+            paddle.Tensor: Encoded tensor. Its shape is (batch, time, ...)
+            paddle.Tensor: for compatibility to RelPositionalEncoding
+        """
+        T = paddle.shape(x)[1]
+        assert offset + T < self.max_len
+        #assert offset + x.size(1) < self.max_len
+        #self.pe = self.pe.to(x.device)
+        #pos_emb = self.pe[:, offset:offset + x.size(1)]
+        pos_emb = self.pe[:, offset:offset + T]
+        x = x * self.xscale + pos_emb
+        return self.dropout(x), self.dropout(pos_emb)
+
+    def position_encoding(self, offset: int, size: int) -> paddle.Tensor:
+        """ For getting encoding in a streaming fashion
+        Attention!!!!!
+        we apply dropout only once at the whole utterance level in a
+        non-streaming way, but will call this function several times with
+        increasing input size in a streaming scenario, so the dropout will
+        be applied several times.
+        Args:
+            offset (int): start offset
+            size (int): required size of position encoding
+        Returns:
+            paddle.Tensor: Corresponding encoding
+        """
+        assert offset + size < self.max_len
+        return self.dropout(self.pe[:, offset:offset + size])
+
+
+class RelPositionalEncoding(PositionalEncoding):
+    """Relative positional encoding module.
+    See : Appendix B in https://arxiv.org/abs/1901.02860
+    """
+
+    def __init__(self, d_model: int, dropout_rate: float, max_len: int=5000):
+        """
+        Args:
+            d_model (int): Embedding dimension.
+            dropout_rate (float): Dropout rate.
+            max_len (int, optional): Maximum input length. Defaults to 5000.
+        """
+        super().__init__(d_model, dropout_rate, max_len, reverse=True)
+
+    def forward(self, x: paddle.Tensor,
+                offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]:
+        """Compute positional encoding.
+        Args:
+            x (paddle.Tensor): Input tensor (batch, time, `*`).
+        Returns:
+            paddle.Tensor: Encoded tensor (batch, time, `*`).
+            paddle.Tensor: Positional embedding tensor (1, time, `*`).
+        """
+        T = paddle.shape(x)[1]
+        assert offset + T < self.max_len
+        #assert offset + x.size(1) < self.max_len
+        #self.pe = self.pe.to(x.device)
+        x = x * self.xscale
+        #pos_emb = self.pe[:, offset:offset + x.size(1)]
+        pos_emb = self.pe[:, offset:offset + T]
+        return self.dropout(x), self.dropout(pos_emb)
diff --git a/deepspeech/modules/loss.py b/deepspeech/modules/loss.py
index b0e021a59761a4658ea633210a79df7d0d062f6b..ce59ec86f09133fe830c06d7b79f8d2145142ede 100644
--- a/deepspeech/modules/loss.py
+++ b/deepspeech/modules/loss.py
@@ -24,6 +24,7 @@ logger = logging.getLogger(__name__)
 
 __all__ = ['CTCLoss']
 
 
+# TODO(Hui Zhang): remove this hack, when `norm_by_times=True` is added
 def ctc_loss(logits,
              labels,
             input_lengths,
@@ -47,19 +48,35 @@ def ctc_loss(logits,
     return loss_out
 
 
+# TODO(Hui Zhang): remove this hack
 F.ctc_loss = ctc_loss
 
 
 class CTCLoss(nn.Layer):
-    def __init__(self, blank_id):
+    def __init__(self, blank=0, reduction='sum'):
         super().__init__()
         # last token id as blank id
-        self.loss = nn.CTCLoss(blank=blank_id, reduction='sum')
+        self.loss = nn.CTCLoss(blank=blank, reduction=reduction)
 
-    def forward(self, logits, text, logits_len, text_len):
-        # warp-ctc do softmax on activations
+    def forward(self, logits, ys_pad, hlens, ys_lens):
+        """Compute CTC loss.
+
+        Args:
+            logits ([paddle.Tensor]): (B, L, D), activations before softmax
+            ys_pad ([paddle.Tensor]): (B, Lmax), padded target token ids
+            hlens ([paddle.Tensor]): (B,), lengths of the logits sequences
+            ys_lens ([paddle.Tensor]): (B,), lengths of the target sequences
+
+        Returns:
+            [paddle.Tensor]: scalar. If reduction is 'none', then (N), where N = \text{batch size}.
+        """
+        # warp-ctc need logits, and do softmax on logits by itself
         # warp-ctc need activation with shape [T, B, V + 1]
+        # logits: (B, L, D) -> (L, B, D)
         logits = logits.transpose([1, 0, 2])
+        loss = self.loss(logits, ys_pad, hlens, ys_lens)
-        ctc_loss = self.loss(logits, text, logits_len, text_len)
-        return ctc_loss
+        # wenet does batch-size averaging; deepspeech2 does not
+        # Batch-size average
+        # loss = loss / paddle.shape(logits)[1]
+        return loss
diff --git a/deepspeech/modules/mask.py b/deepspeech/modules/mask.py
index cb036c141fbc4a7141ce9c2cfb0717d7c807d487..0f136403ff18f3302de2b3d83e76306d3bb21187 100644
--- a/deepspeech/modules/mask.py
+++ b/deepspeech/modules/mask.py
@@ -28,6 +28,7 @@ def sequence_mask(x_len, max_len=None, dtype='float32'):
     max_len = max_len or x_len.max()
     x_len = paddle.unsqueeze(x_len, -1)
     row_vector = paddle.arange(max_len)
+    # TODO(Hui Zhang): fix this bug
     #mask = row_vector < x_len
     mask = row_vector > x_len  # a bug, broadcast 的时候出错了
     mask = paddle.cast(mask, dtype)
diff --git a/deepspeech/training/trainer.py b/deepspeech/training/trainer.py
index ebaed256bae842b57aec30ae13c1cbfaed45f216..f472200b8a45587d5a6cd57d9a60a53f49abf917 100644
--- a/deepspeech/training/trainer.py
+++ b/deepspeech/training/trainer.py
@@ -167,9 +167,17 @@ class Trainer():
             self.new_epoch()
             while self.epoch <= self.config.training.n_epoch:
                 try:
+                    data_start_time = time.time()
                     for batch in self.train_loader:
+                        dataload_time = time.time() - data_start_time
+                        msg = "Train: Rank: {}, ".format(dist.get_rank())
+                        msg += "epoch: {}, ".format(self.epoch)
+                        msg += "step: {}, ".format(self.iteration)
+                        msg += "dataloader time: {:>.3f}s, ".format(dataload_time)
+                        self.logger.info(msg)
                         self.iteration += 1
                         self.train_batch(batch)
+                        data_start_time = time.time()
                 except Exception as e:
                     self.logger.error(e)
                     pass
diff --git a/examples/aishell/local/infer.sh b/examples/aishell/local/infer.sh
index 4b4c9381b9685be155039b4cbe1e03b9913e1d65..41ccabf803f55975866f41a47dc410abb5edca9a 100644
--- a/examples/aishell/local/infer.sh
+++ b/examples/aishell/local/infer.sh
@@ -1,5 +1,9 @@
 #! /usr/bin/env bash
 
+if [[ $# != 1 ]]; then
+    echo "usage: $0 ckpt-path"
+    exit -1
+fi
 
 # download language model
 bash local/download_lm_ch.sh
diff --git a/examples/aishell/local/infer_golden.sh b/examples/aishell/local/infer_golden.sh
deleted file mode 100644
index 3fdcd1b5e5f93962d2222a730a2081b4008ad803..0000000000000000000000000000000000000000
--- a/examples/aishell/local/infer_golden.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#! /usr/bin/env bash
-
-# download language model
-bash local/download_lm_ch.sh
-if [ $? -ne 0 ]; then
-    exit 1
-fi
-
-# download well-trained model
-bash local/download_model.sh
-if [ $? -ne 0 ]; then
-    exit 1
-fi
-
-# infer
-CUDA_VISIBLE_DEVICES=0 \
-python3 -u ${BIN_DIR}/infer.py \
---device 'gpu' \
---nproc 1 \
---config conf/deepspeech2.yaml \
---checkpoint_path data/pretrain/params.pdparams \
---opts data.mean_std_filepath data/pretrain/mean_std.npz \
---opts data.vocab_filepath data/pretrain/vocab.txt
-
-if [ $? -ne 0 ]; then
-    echo "Failed in inference!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/aishell/local/server.sh b/examples/aishell/local/server.sh
index 37968407502c725690b928742f0d971fc666d767..1cf069ebd531c8ea62827e89c741964500108745 100644
--- a/examples/aishell/local/server.sh
+++ b/examples/aishell/local/server.sh
@@ -2,7 +2,7 @@
 # TODO: replace the model with a mandarin model
 
 if [[ $# != 1 ]];then
-    echo "usage: server.sh checkpoint_path"
+    echo "usage: $0 checkpoint_path"
     exit -1
 fi
 
diff --git a/examples/aishell/local/test.sh b/examples/aishell/local/test.sh
index 74015f5d5b012c3c52bcc59879a8920470e13e5f..0872ff21ec24be41eb55a28ebb94bfdf79f32ce2 100644
--- a/examples/aishell/local/test.sh
+++ b/examples/aishell/local/test.sh
@@ -10,7 +10,7 @@ python3 -u ${BIN_DIR}/test.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
---checkpoint_path ${1}
+--output ckpt
 
 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"
diff --git a/examples/aishell/local/test_golden.sh b/examples/aishell/local/test_golden.sh
deleted file mode 100644
index 86abd38cb183ba953a4d804ac7fb987af4a160a2..0000000000000000000000000000000000000000
--- a/examples/aishell/local/test_golden.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#! /usr/bin/env bash
-
-# download language model
-bash local/download_lm_ch.sh
-if [ $? -ne 0 ]; then
-    exit 1
-fi
-
-# download well-trained model
-bash local/download_model.sh
-if [ $? -ne 0 ]; then
-    exit 1
-fi
-
-# evaluate model
-CUDA_VISIBLE_DEVICES=0 \
-python3 -u ${BIN_DIR}/test.py \
---device 'gpu' \
---nproc 1 \
---config conf/deepspeech2.yaml \
---checkpoint_path data/pretrain/params.pdparams \
---opts data.mean_std_filepath data/pretrain/mean_std.npz \
---opts data.vocab_filepath data/pretrain/vocab.txt
-
-if [ $? -ne 0 ]; then
-    echo "Failed in evaluation!"
-    exit 1
-fi
-
-
-exit 0
diff --git a/examples/aishell/local/train.sh b/examples/aishell/local/train.sh
index 3e13a79e3075d6acb42ec0fa97c3814f39415fa5..c286566a8d1e5a68279916d8d182ef72531afddc 100644
--- a/examples/aishell/local/train.sh
+++ b/examples/aishell/local/train.sh
@@ -11,7 +11,7 @@ python3 -u ${BIN_DIR}/train.py \
 --device 'gpu' \
 --nproc ${ngpu} \
 --config conf/deepspeech2.yaml \
---output ckpt
+--output ckpt-${1}
 
 if [ $? -ne 0 ]; then
diff --git a/examples/aishell/run.sh b/examples/aishell/run.sh
index dc762df99f488a763ceeeadde13e870282dd73f8..8beb6bf0f5459d1f8a0a0eef1bde180e1564a06c 100644
--- a/examples/aishell/run.sh
+++ b/examples/aishell/run.sh
@@ -10,7 +10,10 @@ bash ./local/data.sh
 CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
 
 # test model
-CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ckpt/checkpoints/step-3284
+CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
 
 # infer model
 CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284
+
+# export model
+bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model
\ No newline at end of file
diff --git a/examples/librispeech/README.md b/examples/librispeech/README.md
index 536a0cd962a0c21894669208afe5b0634ddba2bb..1e694df1c709843da03b98989eb39d376a21373c 100644
--- a/examples/librispeech/README.md
+++ b/examples/librispeech/README.md
@@ -1,7 +1,7 @@
 # LibriSpeech
 
 ## CTC
-| Model | Config | Test set | CER |
+| Model | Config | Test set | WER |
 | --- | --- | --- | --- |
 | DeepSpeech2 | conf/deepspeech2.yaml | test-clean | 0.073973 |
 | DeepSpeech2 | release 1.8.5 | test-clean | 0.074939 |
diff --git a/examples/librispeech/local/export.sh b/examples/librispeech/local/export.sh
new file mode 100644
index 0000000000000000000000000000000000000000..1b553391672bad796062d5e6134fe88aba942e23
--- /dev/null
+++ b/examples/librispeech/local/export.sh
@@ -0,0 +1,20 @@
+#! /usr/bin/env bash
+
+if [ $# != 2 ];then
+    echo "usage: export ckpt_path jit_model_path"
+    exit -1
+fi
+
+python3 -u ${BIN_DIR}/export.py \
+--config conf/deepspeech2.yaml \
+--checkpoint_path ${1} \
+--export_path ${2}
+
+
+if [ $? -ne 0 ]; then
+    echo "Failed in export!"
+    exit 1
+fi
+
+
+exit 0
diff --git a/examples/librispeech/local/infer.sh b/examples/librispeech/local/infer.sh
index 9ea39901f45ff4a1b86908aae2d1ec350ad190a4..6fc8d39fc82c63b0fdcf4a5ae71a64d67ef13139 100644
--- a/examples/librispeech/local/infer.sh
+++ b/examples/librispeech/local/infer.sh
@@ -1,5 +1,10 @@
 #! /usr/bin/env bash
 
+if [[ $# != 1 ]]; then
+    echo "usage: $0 ckpt-path"
+    exit -1
+fi
+
 # download language model
 bash local/download_lm_en.sh
 if [ $? -ne 0 ]; then
diff --git a/examples/librispeech/run.sh b/examples/librispeech/run.sh
index ff87d38bf3c21faa3d97b0ca572a295a05866952..cf0f41edb2e5137620d11b438164a867d073f42e 100644
--- a/examples/librispeech/run.sh
+++ b/examples/librispeech/run.sh
@@ -7,10 +7,13 @@ source path.sh
 bash ./local/data.sh
 
 # train model
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 bash ./local/train.sh
+CUDA_VISIBLE_DEVICES=0,1,2,3 bash ./local/train.sh
 
 # test model
 CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh
 
 # infer model
-CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh
+CUDA_VISIBLE_DEVICES=0 bash ./local/infer.sh ckpt/checkpoints/step-3284
+
+# export model
+bash ./local/export.sh ckpt/checkpoints/step-3284 jit.model
\ No newline at end of file
diff --git a/examples/tiny/README.md b/examples/tiny/README.md
index c3bfdc9c4d09667d747e35920eaa91722cd8803e..0f96864c073c1f44206026504e2aca9e7752b80b 100644
--- a/examples/tiny/README.md
+++ b/examples/tiny/README.md
@@ -1,7 +1,8 @@
 # Tiny Example
 
 1. `source path.sh`
-2. `bash run.sh`
+2. set `CUDA_VISIBLE_DEVICES` as you need.
+3. The demo script is `bash run.sh`. You can run the commands separately as needed.
 
 ## Steps
 - Prepare the data
@@ -26,11 +27,7 @@
    bash local/infer.sh
    ```
 
-   `infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference:
-
-   ```bash
-   bash local/infer_golden.sh
-   ```
+   `infer.sh` will show us some speech-to-text decoding results for several (default: 10) samples with the trained model. The performance might not be good now as the current model is only trained with a toy subset of LibriSpeech. To see the results with a better model, you can download a well-trained (trained for several days, with the complete LibriSpeech) model and do the inference.
 
 - Evaluate an existing model
 
@@ -40,6 +37,15 @@
 
   `test.sh` will evaluate the model with Word Error Rate (or Character Error Rate) measurement. Similarly, you can also download a well-trained model and test its performance:
+
+- Export jit model
+
+  ```bash
+  bash local/export.sh ckpt_path saved_jit_model_path
+  ```
+
+- Tune hyperparameters
 
   ```bash
-  bash local/test_golden.sh
+  bash local/tune.sh
   ```
diff --git a/examples/tiny/local/infer.sh b/examples/tiny/local/infer.sh
index 3aff6b78bb51c159291f5a9b112583fedf5054c4..1243c0d082d23362c2a6146cf6185a70dcc8ac6c 100644
--- a/examples/tiny/local/infer.sh
+++ b/examples/tiny/local/infer.sh
@@ -1,17 +1,21 @@
 #! /usr/bin/env bash
 
+if [[ $# != 1 ]]; then
+    echo "usage: $0 ckpt-path"
+    exit -1
+fi
+
 # download language model
 bash local/download_lm_en.sh
 if [ $? -ne 0 ]; then
     exit 1
 fi
 
-CUDA_VISIBLE_DEVICES=0 \
 python3 -u ${BIN_DIR}/infer.py \
 --device 'gpu' \
 --nproc 1 \
 --config conf/deepspeech2.yaml \
---output ckpt
+--checkpoint_path ${1}
 
 
 if [ $? -ne 0 ]; then
diff --git a/examples/tiny/local/test.sh b/examples/tiny/local/test.sh
index fedebf96d2f492a2a14b2f932b3a7eb8fca75eda..a0f200799adc0803b5a59b4422500cb25b2f43b6 100644
--- a/examples/tiny/local/test.sh
+++ b/examples/tiny/local/test.sh
@@ -13,7 +13,6 @@ python3 -u ${BIN_DIR}/test.py \
 --config conf/deepspeech2.yaml \
 --output ckpt
 
-
 if [ $? -ne 0 ]; then
     echo "Failed in evaluation!"
     exit 1
diff --git a/requirements.txt b/requirements.txt
index 8ab09f626c99073d74ef9243073f65bf96f76a2e..14d7c032575c258db3f5f63ba307c5763ac68b91 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -4,3 +4,4 @@ SoundFile==0.9.0.post1
 python_speech_features
 tensorboardX
 yacs
+typeguard