diff --git a/paddlespeech/cli/asr/infer.py b/paddlespeech/cli/asr/infer.py
index 23029cfb4d853c57d3f1e4fb48bd2230b9d49228..863a933f2a7446b920228fe2f5fa6e0294b50d5d 100644
--- a/paddlespeech/cli/asr/infer.py
+++ b/paddlespeech/cli/asr/infer.py
@@ -187,6 +187,7 @@ class ASRExecutor(BaseExecutor):
                     vocab=self.config.vocab_filepath,
                     spm_model_prefix=self.config.spm_model_prefix)
             self.config.decode.decoding_method = decode_method
+
         else:
             raise Exception("wrong type")
         model_name = model_type[:model_type.rindex(
@@ -201,6 +202,21 @@ class ASRExecutor(BaseExecutor):
             model_dict = paddle.load(self.ckpt_path)
             self.model.set_state_dict(model_dict)
 
+        # compute the max input-length limit (in seconds)
+        self.max_len = 50.0
+        if "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
+            # transformer-like models subsample the input with a CNN frontend,
+            # so the positional-encoding limit applies to subsampled frames
+            subsample_rate = self.model.subsampling_rate()
+            # frame shift in seconds: hop size (samples) / sample rate (Hz)
+            frame_shift = self.config.preprocess_config.process[0][
+                'n_shift'] / self.config.preprocess_config.process[0]['fs']
+            max_len = self.model.encoder.embed.pos_enc.max_len
+            if self.config.encoder_conf.get("max_len", None):
+                max_len = self.config.encoder_conf.max_len
+            self.max_len = frame_shift * max_len * subsample_rate
+        logger.info(f"The asr engine max duration limit: {self.max_len} s")
+
     def preprocess(self, model_type: str, input: Union[str, os.PathLike]):
         """
         Input preprocess and return paddle.Tensor stored in self.input.
@@ -352,9 +368,10 @@ class ASRExecutor(BaseExecutor):
             audio, audio_sample_rate = soundfile.read(
                 audio_file, dtype="int16", always_2d=True)
             audio_duration = audio.shape[0] / audio_sample_rate
-            max_duration = 50.0
-            if audio_duration >= max_duration:
-                logger.error("Please input audio file less then 50 seconds.\n")
+            if audio_duration > self.max_len:
+                logger.error(
+                    f"Please input an audio file shorter than {self.max_len} seconds.\n"
+                )
                 return False
         except Exception as e:
             logger.exception(e)
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index 3a851ec62c35f633ce07fd0b4380d92b31d67b3b..42ac119b44540a1931408b1b86aa75e8b1413597 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -62,21 +62,21 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
             False: x -> x + att(x)
     """
 
-    def __init__(
-            self,
-            vocab_size: int,
-            encoder_output_size: int,
-            attention_heads: int=4,
-            linear_units: int=2048,
-            num_blocks: int=6,
-            dropout_rate: float=0.1,
-            positional_dropout_rate: float=0.1,
-            self_attention_dropout_rate: float=0.0,
-            src_attention_dropout_rate: float=0.0,
-            input_layer: str="embed",
-            use_output_layer: bool=True,
-            normalize_before: bool=True,
-            concat_after: bool=False, ):
+    def __init__(self,
+                 vocab_size: int,
+                 encoder_output_size: int,
+                 attention_heads: int=4,
+                 linear_units: int=2048,
+                 num_blocks: int=6,
+                 dropout_rate: float=0.1,
+                 positional_dropout_rate: float=0.1,
+                 self_attention_dropout_rate: float=0.0,
+                 src_attention_dropout_rate: float=0.0,
+                 input_layer: str="embed",
+                 use_output_layer: bool=True,
+                 normalize_before: bool=True,
+                 concat_after: bool=False,
+                 max_len: int=5000):
 
         assert check_argument_types()
 
@@ -87,7 +87,8 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
         if input_layer == "embed":
             self.embed = nn.Sequential(
                 Embedding(vocab_size, attention_dim),
-                PositionalEncoding(attention_dim, positional_dropout_rate), )
+                PositionalEncoding(
+                    attention_dim, positional_dropout_rate, max_len=max_len), )
         else:
             raise ValueError(f"only 'embed' is supported: {input_layer}")
 
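Note on the new limit in infer.py: the computed self.max_len is simply the positional-encoding capacity converted to seconds, frame_shift * pe_max_len * subsample_rate. A quick sanity check of the arithmetic with hypothetical but typical 16 kHz conformer values (the real n_shift, fs, subsampling factor, and table size all come from the model config, not from this patch):

    n_shift, fs = 160, 16000    # hop size in samples, sample rate in Hz
    subsample_rate = 4          # conv2d frontend keeps every 4th frame
    pe_max_len = 5000           # positional-encoding table size, in frames

    frame_shift = n_shift / fs  # 0.01 s per frame
    print(frame_shift * pe_max_len * subsample_rate)  # 200.0 seconds

So with these defaults the old hard-coded 50 s ceiling becomes a model-dependent one (200 s here); the 50.0 fallback above preserves the old behavior for models without a positional-encoding table.
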
{input_layer}") diff --git a/paddlespeech/s2t/modules/embedding.py b/paddlespeech/s2t/modules/embedding.py index 5d4e91753b38129a9c2c71d706787af9d14a903d..596f61b78a4e449b2998b3544dd4204371aa8a2b 100644 --- a/paddlespeech/s2t/modules/embedding.py +++ b/paddlespeech/s2t/modules/embedding.py @@ -112,7 +112,9 @@ class PositionalEncoding(nn.Layer, PositionalEncodingInterface): paddle.Tensor: for compatibility to RelPositionalEncoding, (batch=1, time, ...) """ T = x.shape[1] - assert offset + x.shape[1] < self.max_len + assert offset + x.shape[ + 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( + offset, x.shape[1], self.max_len) #TODO(Hui Zhang): using T = x.size(1), __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + T] x = x * self.xscale + pos_emb @@ -148,6 +150,7 @@ class RelPositionalEncoding(PositionalEncoding): max_len (int, optional): [Maximum input length.]. Defaults to 5000. """ super().__init__(d_model, dropout_rate, max_len, reverse=True) + logger.info(f"max len: {max_len}") def forward(self, x: paddle.Tensor, offset: int=0) -> Tuple[paddle.Tensor, paddle.Tensor]: @@ -158,7 +161,9 @@ class RelPositionalEncoding(PositionalEncoding): paddle.Tensor: Encoded tensor (batch, time, `*`). paddle.Tensor: Positional embedding tensor (1, time, `*`). """ - assert offset + x.shape[1] < self.max_len + assert offset + x.shape[ + 1] < self.max_len, "offset: {} + x.shape[1]: {} is larger than the max_len: {}".format( + offset, x.shape[1], self.max_len) x = x * self.xscale #TODO(Hui Zhang): using x.size(1), __getitem__ not support Tensor pos_emb = self.pe[:, offset:offset + x.shape[1]] diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py index c843c0e207054b20a5d3850334198ef6bcb6888c..669a12d656947f0446eba3d228832964e8c1d7b0 100644 --- a/paddlespeech/s2t/modules/encoder.py +++ b/paddlespeech/s2t/modules/encoder.py @@ -47,24 +47,24 @@ __all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder"] class BaseEncoder(nn.Layer): - def __init__( - self, - input_size: int, - output_size: int=256, - attention_heads: int=4, - linear_units: int=2048, - num_blocks: int=6, - dropout_rate: float=0.1, - positional_dropout_rate: float=0.1, - attention_dropout_rate: float=0.0, - input_layer: str="conv2d", - pos_enc_layer_type: str="abs_pos", - normalize_before: bool=True, - concat_after: bool=False, - static_chunk_size: int=0, - use_dynamic_chunk: bool=False, - global_cmvn: paddle.nn.Layer=None, - use_dynamic_left_chunk: bool=False, ): + def __init__(self, + input_size: int, + output_size: int=256, + attention_heads: int=4, + linear_units: int=2048, + num_blocks: int=6, + dropout_rate: float=0.1, + positional_dropout_rate: float=0.1, + attention_dropout_rate: float=0.0, + input_layer: str="conv2d", + pos_enc_layer_type: str="abs_pos", + normalize_before: bool=True, + concat_after: bool=False, + static_chunk_size: int=0, + use_dynamic_chunk: bool=False, + global_cmvn: paddle.nn.Layer=None, + use_dynamic_left_chunk: bool=False, + max_len: int=5000): """ Args: input_size (int): input dim, d_feature @@ -127,7 +127,9 @@ class BaseEncoder(nn.Layer): odim=output_size, dropout_rate=dropout_rate, pos_enc_class=pos_enc_class( - d_model=output_size, dropout_rate=positional_dropout_rate), ) + d_model=output_size, + dropout_rate=positional_dropout_rate, + max_len=max_len), ) self.normalize_before = normalize_before self.after_norm = LayerNorm(output_size, epsilon=1e-12) @@ -415,32 +417,32 @@ class TransformerEncoder(BaseEncoder): class 
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index c843c0e207054b20a5d3850334198ef6bcb6888c..669a12d656947f0446eba3d228832964e8c1d7b0 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -47,24 +47,24 @@ __all__ = ["BaseEncoder", 'TransformerEncoder', "ConformerEncoder"]
 
 
 class BaseEncoder(nn.Layer):
-    def __init__(
-            self,
-            input_size: int,
-            output_size: int=256,
-            attention_heads: int=4,
-            linear_units: int=2048,
-            num_blocks: int=6,
-            dropout_rate: float=0.1,
-            positional_dropout_rate: float=0.1,
-            attention_dropout_rate: float=0.0,
-            input_layer: str="conv2d",
-            pos_enc_layer_type: str="abs_pos",
-            normalize_before: bool=True,
-            concat_after: bool=False,
-            static_chunk_size: int=0,
-            use_dynamic_chunk: bool=False,
-            global_cmvn: paddle.nn.Layer=None,
-            use_dynamic_left_chunk: bool=False, ):
+    def __init__(self,
+                 input_size: int,
+                 output_size: int=256,
+                 attention_heads: int=4,
+                 linear_units: int=2048,
+                 num_blocks: int=6,
+                 dropout_rate: float=0.1,
+                 positional_dropout_rate: float=0.1,
+                 attention_dropout_rate: float=0.0,
+                 input_layer: str="conv2d",
+                 pos_enc_layer_type: str="abs_pos",
+                 normalize_before: bool=True,
+                 concat_after: bool=False,
+                 static_chunk_size: int=0,
+                 use_dynamic_chunk: bool=False,
+                 global_cmvn: paddle.nn.Layer=None,
+                 use_dynamic_left_chunk: bool=False,
+                 max_len: int=5000):
         """
         Args:
             input_size (int): input dim, d_feature
@@ -127,7 +127,9 @@ class BaseEncoder(nn.Layer):
                 odim=output_size,
                 dropout_rate=dropout_rate,
                 pos_enc_class=pos_enc_class(
-                    d_model=output_size, dropout_rate=positional_dropout_rate), )
+                    d_model=output_size,
+                    dropout_rate=positional_dropout_rate,
+                    max_len=max_len), )
 
         self.normalize_before = normalize_before
         self.after_norm = LayerNorm(output_size, epsilon=1e-12)
@@ -415,32 +417,32 @@ class TransformerEncoder(BaseEncoder):
 class ConformerEncoder(BaseEncoder):
     """Conformer encoder module."""
 
-    def __init__(
-            self,
-            input_size: int,
-            output_size: int=256,
-            attention_heads: int=4,
-            linear_units: int=2048,
-            num_blocks: int=6,
-            dropout_rate: float=0.1,
-            positional_dropout_rate: float=0.1,
-            attention_dropout_rate: float=0.0,
-            input_layer: str="conv2d",
-            pos_enc_layer_type: str="rel_pos",
-            normalize_before: bool=True,
-            concat_after: bool=False,
-            static_chunk_size: int=0,
-            use_dynamic_chunk: bool=False,
-            global_cmvn: nn.Layer=None,
-            use_dynamic_left_chunk: bool=False,
-            positionwise_conv_kernel_size: int=1,
-            macaron_style: bool=True,
-            selfattention_layer_type: str="rel_selfattn",
-            activation_type: str="swish",
-            use_cnn_module: bool=True,
-            cnn_module_kernel: int=15,
-            causal: bool=False,
-            cnn_module_norm: str="batch_norm", ):
+    def __init__(self,
+                 input_size: int,
+                 output_size: int=256,
+                 attention_heads: int=4,
+                 linear_units: int=2048,
+                 num_blocks: int=6,
+                 dropout_rate: float=0.1,
+                 positional_dropout_rate: float=0.1,
+                 attention_dropout_rate: float=0.0,
+                 input_layer: str="conv2d",
+                 pos_enc_layer_type: str="rel_pos",
+                 normalize_before: bool=True,
+                 concat_after: bool=False,
+                 static_chunk_size: int=0,
+                 use_dynamic_chunk: bool=False,
+                 global_cmvn: nn.Layer=None,
+                 use_dynamic_left_chunk: bool=False,
+                 positionwise_conv_kernel_size: int=1,
+                 macaron_style: bool=True,
+                 selfattention_layer_type: str="rel_selfattn",
+                 activation_type: str="swish",
+                 use_cnn_module: bool=True,
+                 cnn_module_kernel: int=15,
+                 causal: bool=False,
+                 cnn_module_norm: str="batch_norm",
+                 max_len: int=5000):
         """Construct ConformerEncoder
         Args:
             input_size to use_dynamic_chunk, see in BaseEncoder
@@ -464,7 +466,7 @@ class ConformerEncoder(BaseEncoder):
             attention_dropout_rate, input_layer,
             pos_enc_layer_type, normalize_before, concat_after,
             static_chunk_size, use_dynamic_chunk, global_cmvn,
-            use_dynamic_left_chunk)
+            use_dynamic_left_chunk, max_len)
         activation = get_activation(activation_type)
 
         # self-attention module definition
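With max_len now threaded through ConformerEncoder -> BaseEncoder -> pos_enc_class, the input-length ceiling becomes a constructor argument instead of a hard-coded 5000. A hypothetical instantiation (values are illustrative; in practice they come from the model YAML, whose encoder_conf.max_len is what infer.py reads back):

    from paddlespeech.s2t.modules.encoder import ConformerEncoder

    # 80-dim fbank input; max_len=10000 doubles the positional-encoding table,
    # raising the duration ceiling computed by ASRExecutor accordingly
    encoder = ConformerEncoder(input_size=80, max_len=10000)
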
diff --git a/paddlespeech/server/engine/asr/python/asr_engine.py b/paddlespeech/server/engine/asr/python/asr_engine.py
index e76c49a79a66be505f239f9f04b5fdd050701fda..d60a5feaeca6caa5e385f872872104df2a8aa124 100644
--- a/paddlespeech/server/engine/asr/python/asr_engine.py
+++ b/paddlespeech/server/engine/asr/python/asr_engine.py
@@ -78,21 +78,26 @@ class ASREngine(BaseEngine):
         Args:
             audio_data (bytes): base64.b64decode
         """
-        if self.executor._check(
-                io.BytesIO(audio_data), self.config.sample_rate,
-                self.config.force_yes):
-            logger.info("start run asr engine")
-            self.executor.preprocess(self.config.model, io.BytesIO(audio_data))
-            st = time.time()
-            self.executor.infer(self.config.model)
-            infer_time = time.time() - st
-            self.output = self.executor.postprocess()  # Retrieve result of asr.
-        else:
-            logger.info("file check failed!")
-            self.output = None
-
-        logger.info("inference time: {}".format(infer_time))
-        logger.info("asr engine type: python")
+        try:
+            if self.executor._check(
+                    io.BytesIO(audio_data), self.config.sample_rate,
+                    self.config.force_yes):
+                logger.info("start run asr engine")
+                self.executor.preprocess(self.config.model,
+                                         io.BytesIO(audio_data))
+                st = time.time()
+                self.executor.infer(self.config.model)
+                infer_time = time.time() - st
+                self.output = self.executor.postprocess(
+                )  # Retrieve result of asr.
+                logger.info("inference time: {}".format(infer_time))
+            else:
+                logger.info("file check failed!")
+                self.output = None
+
+            logger.info("asr engine type: python")
+        except Exception as e:
+            logger.exception(e)
 
     def postprocess(self):
         """postprocess
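One reason the timing log belongs on the success path: in the replaced block, infer_time was bound only inside the if branch, so a failed file check made the unconditional logger.info(...) raise NameError, and a broad except around it would simply swallow that failure. A minimal standalone repro of the pitfall:

    def run_buggy(check_ok: bool) -> None:
        if check_ok:
            infer_time = 0.42  # bound only on the success branch
        else:
            print("file check failed!")
        # NameError when check_ok is False: infer_time was never assigned
        print("inference time: {}".format(infer_time))

    run_buggy(check_ok=False)  # raises NameError

Logging the caught exception with logger.exception rather than logger.info(e) also preserves the traceback, which is what a server-side catch-all should record.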