Commit 418d85ef authored by huangyuxin

fix some bug and complete the recog.py

Parent e4a9328c
@@ -28,8 +28,8 @@ from .utils import add_results_to_json
 from deepspeech.exps import dynamic_import_tester
 from deepspeech.io.reader import LoadInputsAndTargets
 from deepspeech.models.asr_interface import ASRInterface
-from deepspeech.utils.log import Log
 from deepspeech.models.lm.transformer import TransformerLM
+from deepspeech.utils.log import Log
 # from espnet.asr.asr_utils import get_model_conf
 # from espnet.asr.asr_utils import torch_load
 # from espnet.nets.lm_interface import dynamic_import_lm
@@ -80,8 +80,7 @@ def recog_v2(args):
         sort_in_input_length=False,
         preprocess_conf=confs.collator.augmentation_config
         if args.preprocess_conf is None else args.preprocess_conf,
-        preprocess_args={"train": False},
-    )
+        preprocess_args={"train": False}, )

     if args.rnnlm:
         lm_path = args.rnnlm
@@ -120,8 +119,7 @@ def recog_v2(args):
         ctc=args.ctc_weight,
         lm=args.lm_weight,
         ngram=args.ngram_weight,
-        length_bonus=args.penalty,
-    )
+        length_bonus=args.penalty, )
     beam_search = BeamSearch(
         beam_size=args.beam_size,
         vocab_size=len(char_list),
@@ -130,8 +128,7 @@ def recog_v2(args):
         sos=model.sos,
         eos=model.eos,
         token_list=char_list,
-        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full",
-    )
+        pre_beam_score_key=None if args.ctc_weight == 1.0 else "full", )

     # TODO(karita): make all scorers batchfied
     if args.batchsize == 1:
@@ -178,7 +175,8 @@ def recog_v2(args):
            logger.info(f'feat: {feat.shape}')
            enc = model.encode(paddle.to_tensor(feat).to(dtype))
            logger.info(f'eout: {enc.shape}')
-           nbest_hyps = beam_search(x=enc,
+           nbest_hyps = beam_search(
+               x=enc,
                maxlenratio=args.maxlenratio,
                minlenratio=args.minlenratio)
            nbest_hyps = [
@@ -190,9 +188,8 @@
            item = new_js[name]['output'][0]  # 1-best
            ref = item['text']
-           rec_text = item['rec_text'].replace('▁',
-                                               ' ').replace('<eos>',
-                                                            '').strip()
+           rec_text = item['rec_text'].replace('▁', ' ').replace(
+               '<eos>', '').strip()
            rec_tokenid = list(map(int, item['rec_tokenid'].split()))
            f.write({
                "utt": name,
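The hunks above touch the recog.py decoding script named in the commit message; the hunks that follow belong to the TransformerLM language model (the deepspeech.models.lm.transformer module imported in the first hunk). For orientation, the per-utterance decoding path that recog_v2 wires together boils down to roughly the sketch below. This is a minimal illustration, not the repository code: `model`, `beam_search`, `feat`, `char_list`, and `args` are assumed to be set up as in the script, and the `yseq` hypothesis field follows the ESPnet-style BeamSearch output.

```python
# Minimal sketch of one decoding step in recog_v2 (illustrative only;
# all names are assumed from the surrounding script).
import paddle


def decode_one(model, beam_search, feat, char_list, args):
    with paddle.no_grad():
        enc = model.encode(paddle.to_tensor(feat))      # (T, D) encoder output
        nbest_hyps = beam_search(
            x=enc,
            maxlenratio=args.maxlenratio,
            minlenratio=args.minlenratio)               # ranked hypotheses
        best = nbest_hyps[0]
        token_ids = [int(t) for t in best.yseq[1:-1]]   # drop sos/eos ids
        text = ''.join(char_list[i] for i in token_ids)
        # undo the sentencepiece word-boundary marker, as in the diff above
        return text.replace('▁', ' ').replace('<eos>', '').strip()
```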
@@ -11,6 +11,7 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import logging
 from typing import Any
 from typing import List
 from typing import Tuple
@@ -20,12 +21,12 @@ import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F

-from deepspeech.modules.mask import subsequent_mask
-from deepspeech.modules.encoder import TransformerEncoder
 from deepspeech.decoders.scorers.scorer_interface import BatchScorerInterface
 from deepspeech.models.lm_interface import LMInterface
-import logging
+from deepspeech.modules.encoder import TransformerEncoder
+from deepspeech.modules.mask import subsequent_mask

 class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
     def __init__(
             self,
@@ -37,9 +38,9 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             unit: int=1024,
             layer: int=4,
             dropout_rate: float=0.5,
-            emb_dropout_rate: float = 0.0,
-            att_dropout_rate: float = 0.0,
-            tie_weights: bool = False,):
+            emb_dropout_rate: float=0.0,
+            att_dropout_rate: float=0.0,
+            tie_weights: bool=False, ):
         nn.Layer.__init__(self)
         if pos_enc == "sinusoidal":
@@ -84,15 +85,12 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
             ), "Tie Weights: True need embedding and final dimensions to match"
             self.decoder.weight = self.embed.weight

     def _target_mask(self, ys_in_pad):
         ys_mask = ys_in_pad != 0
         m = subsequent_mask(ys_mask.size(-1)).unsqueeze(0)
         return ys_mask.unsqueeze(-2) & m

-    def forward(
-            self, x: paddle.Tensor, t: paddle.Tensor
+    def forward(self, x: paddle.Tensor, t: paddle.Tensor
                 ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
         """Compute LM loss value from buffer sequences.
@@ -119,7 +117,8 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         emb = self.embed(x)
         h, _ = self.encoder(emb, xlen)
         y = self.decoder(h)
-        loss = F.cross_entropy(y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
+        loss = F.cross_entropy(
+            y.view(-1, y.shape[-1]), t.view(-1), reduction="none")
         mask = xm.to(dtype=loss.dtype)
         logp = loss * mask.view(-1)
         logp = logp.sum()
@@ -150,16 +149,16 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         emb = self.embed(y)
         h, _, cache = self.encoder.forward_one_step(
-            emb, self._target_mask(y), cache=state
-        )
+            emb, self._target_mask(y), cache=state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h).squeeze(0)
         return logp, cache

     # batch beam search API (see BatchScorerInterface)
-    def batch_score(
-            self, ys: paddle.Tensor, states: List[Any], xs: paddle.Tensor
-    ) -> Tuple[paddle.Tensor, List[Any]]:
+    def batch_score(self,
+                    ys: paddle.Tensor,
+                    states: List[Any],
+                    xs: paddle.Tensor) -> Tuple[paddle.Tensor, List[Any]]:
         """Score new token batch (required).

         Args:
@@ -193,13 +192,13 @@ class TransformerLM(nn.Layer, LMInterface, BatchScorerInterface):
         # batch decoding
         h, _, states = self.encoder.forward_one_step(
-            emb, self._target_mask(ys), cache=batch_state
-        )
+            emb, self._target_mask(ys), cache=batch_state)
         h = self.decoder(h[:, -1])
         logp = F.log_softmax(h)

         # transpose state of [layer, batch] into [batch, layer]
-        state_list = [[states[i][b] for i in range(n_layers)] for b in range(n_batch)]
+        state_list = [[states[i][b] for i in range(n_layers)]
+                      for b in range(n_batch)]
         return logp, state_list
@@ -236,11 +235,11 @@ if __name__ == "__main__":
     state = None
     output, state = tlm.score(input2, state, None)

-    input3 = np.array([5,10])
+    input3 = np.array([5, 10])
     input3 = paddle.to_tensor(input3)
     output, state = tlm.score(input3, state, None)

-    input4 = np.array([5,10,0])
+    input4 = np.array([5, 10, 0])
     input4 = paddle.to_tensor(input4)
     output, state = tlm.score(input4, state, None)
     print("output", output)
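The score and batch_score methods reformatted above implement the incremental LM-scoring contract that BeamSearch drives. A rough usage sketch, assuming `lm` is an already constructed TransformerLM in eval mode; the token ids mirror the small test at the bottom of the file, and passing None for the unused encoder-feature argument follows that test:

```python
# Sketch of the ScorerInterface / BatchScorerInterface usage
# (assumes `lm` is a constructed TransformerLM with lm.eval() called).
import paddle

# Single-hypothesis scoring: feed the growing token prefix and reuse the
# returned decoder cache on the next call.
prefix = paddle.to_tensor([5, 10])              # token ids decoded so far
state = None                                    # no cache on the first call
logp, state = lm.score(prefix, state, None)     # logp: (n_vocab,) log-probs

prefix = paddle.to_tensor([5, 10, int(logp.argmax())])
logp, state = lm.score(prefix, state, None)

# Batched variant used when decoding with batchsize > 1: one row of token
# ids and one cached state per live hypothesis.
ys = paddle.to_tensor([[5, 10], [5, 7]])        # (n_batch, ylen)
states = [None, None]
logps, states = lm.batch_score(ys, states, None)  # logps: (n_batch, n_vocab)
```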
@@ -24,9 +24,9 @@ from deepspeech.modules.activation import get_activation
 from deepspeech.modules.attention import MultiHeadedAttention
 from deepspeech.modules.attention import RelPositionMultiHeadedAttention
 from deepspeech.modules.conformer_convolution import ConvolutionModule
+from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.embedding import PositionalEncoding
 from deepspeech.modules.embedding import RelPositionalEncoding
-from deepspeech.modules.embedding import NoPositionalEncoding
 from deepspeech.modules.encoder_layer import ConformerEncoderLayer
 from deepspeech.modules.encoder_layer import TransformerEncoderLayer
 from deepspeech.modules.mask import add_optional_chunk_mask
@@ -378,8 +378,7 @@ class TransformerEncoder(BaseEncoder):
             self,
             xs: paddle.Tensor,
             masks: paddle.Tensor,
-            cache=None,
-    ) -> Tuple[paddle.Tensor, paddle.Tensor]:
+            cache=None, ) -> Tuple[paddle.Tensor, paddle.Tensor]:
         """Encode input frame.

         Args:
@@ -397,9 +396,11 @@ class TransformerEncoder(BaseEncoder):
         if isinstance(self.embed, Conv2dSubsampling):
             #TODO(Hui Zhang): self.embed(xs, masks, offset=0), stride_slice not support bool tensor
-            xs, pos_emb, masks = self.embed(xs, masks.astype(xs.dtype), offset=0)
+            xs, pos_emb, masks = self.embed(
+                xs, masks.astype(xs.dtype), offset=0)
         else:
-            xs , pos_emb, masks= self.embed(xs, masks.astype(xs.dtype), offset=0)
+            xs, pos_emb, masks = self.embed(
+                xs, masks.astype(xs.dtype), offset=0)
         #TODO(Hui Zhang): remove mask.astype, stride_slice not support bool tensor
         masks = masks.astype(paddle.bool)
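The last two hunks are in the TransformerEncoder (deepspeech.modules.encoder) that TransformerLM wraps; forward_one_step, whose signature is reformatted above, is the cached single-step entry point that TransformerLM.score calls once per decoding step. The sketch below only illustrates that call pattern as shown in the LM diff; the helper name `lm_step` is made up here and is not part of the repository.

```python
# Sketch of the cached, step-wise encoder call behind TransformerLM.score
# (names assumed; not the exact repository code).
import paddle
import paddle.nn.functional as F


def lm_step(lm, prefix_ids, state):
    """Score the next token given the prefix decoded so far."""
    y = prefix_ids.unsqueeze(0)                  # (1, T) batch of one prefix
    emb = lm.embed(y)                            # token embeddings
    # one cached encoder step; `state` carries the per-layer attention cache
    h, _, new_state = lm.encoder.forward_one_step(
        emb, lm._target_mask(y), cache=state)
    logits = lm.decoder(h[:, -1])                # project only the last position
    return F.log_softmax(logits).squeeze(0), new_state
```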