diff --git a/deepspeech/exps/u2/model.py b/deepspeech/exps/u2/model.py
index dd62f537e85597949c6397975c3b22efbd0d7fbd..add225912ea0de0b27de7f13e42f04a5fe55c07b 100644
--- a/deepspeech/exps/u2/model.py
+++ b/deepspeech/exps/u2/model.py
@@ -532,7 +532,7 @@ class U2Tester(U2Trainer):
         # 1. Encoder
         encoder_out, encoder_mask = self.model._forward_encoder(
             feat, feats_length)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.model.ctc.log_softmax(
             encoder_out)  # (1, maxlen, vocab_size)
@@ -598,10 +598,20 @@ class U2Tester(U2Trainer):
     def export(self):
         infer_model, input_spec = self.load_inferspec()
-        assert isinstance(input_spec, list), type(input_spec)
+        # assert isinstance(input_spec, list), type(input_spec)
         infer_model.eval()
-        static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
-        logger.info(f"Export code: {static_model.forward.code}")
+        # static_model = paddle.jit.to_static(infer_model, input_spec=input_spec)
+
+        static_model = paddle.jit.to_static(
+            infer_model.forward_attention_decoder,
+            input_spec=[
+                paddle.static.InputSpec(shape=[1, None], dtype='int32'),
+                paddle.static.InputSpec(shape=[1], dtype='int32'),
+                paddle.static.InputSpec(shape=[1, None, 256], dtype='float32'),
+            ]
+        )
+        logger.info(f"Export code: {static_model}")
+        paddle.jit.save(static_model, self.args.export_path)

     def run_export(self):
diff --git a/deepspeech/models/u2.py b/deepspeech/models/u2.py
index 6b266bdb430cd2403dc447f22016a844a13afb76..a9f37833beca699ff65f9bed4fe493b27e62dcca 100644
--- a/deepspeech/models/u2.py
+++ b/deepspeech/models/u2.py
@@ -299,8 +299,8 @@ class U2BaseModel(nn.Module):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
-        encoder_dim = encoder_out.size(2)
+        maxlen = encoder_out.shape[1]
+        encoder_dim = encoder_out.shape[2]
         running_size = batch_size * beam_size
         encoder_out = encoder_out.unsqueeze(1).repeat(1, beam_size, 1, 1).view(
             running_size, maxlen, encoder_dim)  # (B*N, maxlen, encoder_dim)
@@ -405,7 +405,7 @@ class U2BaseModel(nn.Module):
         encoder_out, encoder_mask = self._forward_encoder(
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks, simulate_streaming)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         # (TODO Hui Zhang): bool no support reduce_sum
         # encoder_out_lens = encoder_mask.squeeze(1).sum(1)
         encoder_out_lens = encoder_mask.squeeze(1).astype(paddle.int).sum(1)
@@ -455,7 +455,7 @@ class U2BaseModel(nn.Module):
             speech, speech_lengths, decoding_chunk_size,
             num_decoding_left_chunks,
             simulate_streaming)  # (B, maxlen, encoder_dim)
-        maxlen = encoder_out.size(1)
+        maxlen = encoder_out.shape[1]
         ctc_probs = self.ctc.log_softmax(encoder_out)  # (1, maxlen, vocab_size)
         ctc_probs = ctc_probs.squeeze(0)
         # cur_hyps: (prefix, (blank_ending_score, none_blank_ending_score))
@@ -578,7 +578,7 @@ class U2BaseModel(nn.Module):
         hyps_lens = hyps_lens + 1  # Add <sos> at begining
         encoder_out = encoder_out.repeat(beam_size, 1, 1)
         encoder_mask = paddle.ones(
-            (beam_size, 1, encoder_out.size(1)), dtype=paddle.bool)
+            (beam_size, 1, encoder_out.shape[1]), dtype=paddle.bool)
         decoder_out, _ = self.decoder(
             encoder_out, encoder_mask, hyps_pad,
             hyps_lens)  # (beam_size, max_hyps_len, vocab_size)
@@ -624,7 +624,7 @@ class U2BaseModel(nn.Module):
         """
         return self.eos

-    @jit.export
+    # @jit.export
    def forward_encoder_chunk(
            self,
            xs: paddle.Tensor,
@@ -654,9 +654,7 @@ class U2BaseModel(nn.Module):
            xs, offset,
            required_cache_size, subsampling_cache,
            elayers_output_cache, conformer_cnn_cache)

-    # @jit.export([
-    #     paddle.static.InputSpec(shape=[1, None, feat_dim],dtype='float32'), # audio feat, [B,T,D]
-    # ])
+    # @jit.export
    def ctc_activation(self, xs: paddle.Tensor) -> paddle.Tensor:
        """ Export interface for c++ call, apply linear transform and log softmax before ctc
@@ -667,7 +665,7 @@ class U2BaseModel(nn.Module):
        """
        return self.ctc.log_softmax(xs)

-    @jit.export
+    # @jit.export
    def forward_attention_decoder(
            self,
            hyps: paddle.Tensor,
@@ -683,13 +681,14 @@ class U2BaseModel(nn.Module):
        Returns:
            paddle.Tensor: decoder output, (B, L)
        """
-        assert encoder_out.size(0) == 1
-        num_hyps = hyps.size(0)
-        assert hyps_lens.size(0) == num_hyps
-        encoder_out = encoder_out.repeat(num_hyps, 1, 1)
+        assert encoder_out.shape[0] == 1
+        num_hyps = hyps.shape[0]
+        assert hyps_lens.shape[0] == num_hyps
+        # encoder_out = encoder_out.repeat(num_hyps, 1, 1)
+        encoder_out = encoder_out.tile([num_hyps, 1, 1])
        # (B, 1, T)
        encoder_mask = paddle.ones(
-            [num_hyps, 1, encoder_out.size(1)], dtype=paddle.bool)
+            [num_hyps, 1, encoder_out.shape[1]], dtype=paddle.bool)
        # (num_hyps, max_hyps_len, vocab_size)
        decoder_out, _ = self.decoder(encoder_out, encoder_mask, hyps,
                                      hyps_lens)
@@ -744,7 +743,7 @@ class U2BaseModel(nn.Module):
        Returns:
            List[List[int]]: transcripts.
        """
-        batch_size = feats.size(0)
+        batch_size = feats.shape[0]
        if decoding_method in ['ctc_prefix_beam_search',
                               'attention_rescoring'] and batch_size > 1:
            logger.fatal(
@@ -772,7 +771,7 @@ class U2BaseModel(nn.Module):
            # result in List[int], change it to List[List[int]] for compatible
            # with other batch decoding mode
        elif decoding_method == 'ctc_prefix_beam_search':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
            hyp = self.ctc_prefix_beam_search(
                feats,
                feats_lengths,
@@ -782,7 +781,7 @@ class U2BaseModel(nn.Module):
                simulate_streaming=simulate_streaming)
            hyps = [hyp]
        elif decoding_method == 'attention_rescoring':
-            assert feats.size(0) == 1
+            assert feats.shape[0] == 1
            hyp = self.attention_rescoring(
                feats,
                feats_lengths,
@@ -922,7 +921,7 @@ class U2InferModel(U2Model):
        Returns:
            List[List[int]]: best path result
        """
-        return self.ctc_greedy_search(
+        return self.attention_rescoring(
            feats,
            feats_lengths,
            decoding_chunk_size=decoding_chunk_size,
diff --git a/deepspeech/modules/attention.py b/deepspeech/modules/attention.py
index 4401a4a5552fa26e8141ef07c4fef1da095ec483..ac1e9b75b38118f566bdce0e4c4a70c222ac09f4 100644
--- a/deepspeech/modules/attention.py
+++ b/deepspeech/modules/attention.py
@@ -70,7 +70,7 @@ class MultiHeadedAttention(nn.Layer):
            paddle.Tensor: Transformed value tensor, size
                (#batch, n_head, time2, d_k).
        """
-        n_batch = query.size(0)
+        n_batch = query.shape[0]
        q = self.linear_q(query).view(n_batch, -1, self.h, self.d_k)
        k = self.linear_k(key).view(n_batch, -1, self.h, self.d_k)
        v = self.linear_v(value).view(n_batch, -1, self.h, self.d_k)
@@ -96,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
            paddle.Tensor: Transformed value weighted by the attention score,
                (#batch, time1, d_model).
        """
-        n_batch = value.size(0)
+        n_batch = value.shape[0]
        if mask is not None:
            mask = mask.unsqueeze(1).eq(0)  # (batch, 1, *, time2)
            scores = scores.masked_fill(mask, -float('inf'))
@@ -172,15 +172,15 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
            paddle.Tensor: Output tensor.
                (batch, head, time1, time1)
        """
        zero_pad = paddle.zeros(
-            (x.size(0), x.size(1), x.size(2), 1), dtype=x.dtype)
+            (x.shape[0], x.shape[1], x.shape[2], 1), dtype=x.dtype)
        x_padded = paddle.cat([zero_pad, x], dim=-1)

-        x_padded = x_padded.view(x.size(0), x.size(1), x.size(3) + 1, x.size(2))
+        x_padded = x_padded.view(x.shape[0], x.shape[1], x.shape[3] + 1, x.shape[2])
        x = x_padded[:, :, 1:].view_as(x)  # [B, H, T1, T1]

        if zero_triu:
-            ones = paddle.ones((x.size(2), x.size(3)))
-            x = x * paddle.tril(ones, x.size(3) - x.size(2))[None, None, :, :]
+            ones = paddle.ones((x.shape[2], x.shape[3]))
+            x = x * paddle.tril(ones, x.shape[3] - x.shape[2])[None, None, :, :]

        return x
@@ -205,7 +205,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
        q, k, v = self.forward_qkv(query, key, value)
        q = q.transpose([0, 2, 1, 3])  # (batch, time1, head, d_k)

-        n_batch_pos = pos_emb.size(0)
+        n_batch_pos = pos_emb.shape[0]
        p = self.linear_pos(pos_emb).view(n_batch_pos, -1, self.h, self.d_k)
        p = p.transpose([0, 2, 1, 3])  # (batch, head, time1, d_k)
diff --git a/deepspeech/modules/decoder.py b/deepspeech/modules/decoder.py
index 696a6315b98fc3220c6717eb2f9b9e8aebd080a9..b4eb46f20c5ba2a4b6843f6c753ebade634cc518 100644
--- a/deepspeech/modules/decoder.py
+++ b/deepspeech/modules/decoder.py
@@ -122,7 +122,7 @@ class TransformerDecoder(nn.Module):
        # tgt_mask: (B, 1, L)
        tgt_mask = (make_non_pad_mask(ys_in_lens).unsqueeze(1))
        # m: (1, L, L)
-        m = subsequent_mask(tgt_mask.size(-1)).unsqueeze(0)
+        m = subsequent_mask(tgt_mask.shape[-1]).unsqueeze(0)
        # tgt_mask: (B, L, L)
        # TODO(Hui Zhang): not support & for tensor
        # tgt_mask = tgt_mask & m
diff --git a/deepspeech/modules/embedding.py b/deepspeech/modules/embedding.py
index 98b4e1291415176dd36d54de185ef8acc68d9701..fbbda023c613a5328ac6a1e87938ed2428ff98aa 100644
--- a/deepspeech/modules/embedding.py
+++ b/deepspeech/modules/embedding.py
@@ -68,7 +68,7 @@ class PositionalEncoding(nn.Layer):
            paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...)
        """
        T = x.shape[1]
-        assert offset + x.size(1) < self.max_len
+        assert offset + x.shape[1] < self.max_len
        #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor
        pos_emb = self.pe[:, offset:offset + T]
        x = x * self.xscale + pos_emb
@@ -114,7 +114,7 @@ class RelPositionalEncoding(PositionalEncoding):
            paddle.Tensor: Encoded tensor (batch, time, `*`).
            paddle.Tensor: Positional embedding tensor (1, time, `*`).
""" - assert offset + x.size(1) < self.max_len + assert offset + x.shape[1] < self.max_len x = x * self.xscale #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] diff --git a/deepspeech/utils/tensor_utils.py b/deepspeech/utils/tensor_utils.py index 58f958c4bf30340e51d087d8a9f138c8128cf03a..2e60599911a4e984cc03f3aa0fbcabf1745b11ea 100644 --- a/deepspeech/utils/tensor_utils.py +++ b/deepspeech/utils/tensor_utils.py @@ -65,11 +65,11 @@ def pad_sequence(sequences: List[paddle.Tensor], # assuming trailing dimensions and type of all the Tensors # in sequences are same and fetching those from sequences[0] - max_size = sequences[0].size() + max_size = sequences[0].shape # (TODO Hui Zhang): slice not supprot `end==start` # trailing_dims = max_size[1:] trailing_dims = max_size[1:] if max_size.ndim >= 2 else () - max_len = max([s.size(0) for s in sequences]) + max_len = max([s.shape[0] for s in sequences]) if batch_first: out_dims = (len(sequences), max_len) + trailing_dims else: @@ -77,7 +77,7 @@ def pad_sequence(sequences: List[paddle.Tensor], out_tensor = sequences[0].new_full(out_dims, padding_value) for i, tensor in enumerate(sequences): - length = tensor.size(0) + length = tensor.shape[0] # use index notation to prevent duplicate references to the tensor if batch_first: out_tensor[i, :length, ...] = tensor @@ -125,7 +125,7 @@ def add_sos_eos(ys_pad: paddle.Tensor, sos: int, eos: int, #ys_in = [paddle.cat([_sos, y], dim=0) for y in ys] #ys_out = [paddle.cat([y, _eos], dim=0) for y in ys] #return pad_sequence(ys_in, padding_value=eos), pad_sequence(ys_out, padding_value=ignore_id) - B = ys_pad.size(0) + B = ys_pad.shape[0] _sos = paddle.ones([B, 1], dtype=ys_pad.dtype) * sos _eos = paddle.ones([B, 1], dtype=ys_pad.dtype) * eos ys_in = paddle.cat([_sos, ys_pad], dim=1) @@ -152,7 +152,7 @@ def th_accuracy(pad_outputs: paddle.Tensor, float: Accuracy value (0.0 - 1.0). """ pad_pred = pad_outputs.view( - pad_targets.size(0), pad_targets.size(1), pad_outputs.size(1)).argmax(2) + pad_targets.shape[0], pad_targets.size(1), pad_outputs.size(1)).argmax(2) mask = pad_targets != ignore_label #TODO(Hui Zhang): sum not support bool type # numerator = paddle.sum(