PaddlePaddle / DeepSpeech

Unverified commit 3ef28dee
Authored on May 30, 2023 by jiamingkong; committed via GitHub on May 30, 2023

Merge branch 'PaddlePaddle:develop' into develop

Parents: 927c60a5, a3c3317a

Showing 11 changed files with 194 additions and 13 deletions (+194 / -13)
Changed files:

.pre-commit-config.yaml                                       +1   -5
demos/speech_server/README.md                                 +7   -1
demos/speech_server/README_cn.md                              +9   -1
demos/speech_server/conf/conformer_talcs_application.yaml   +163   -0
docs/source/released_model.md                                 +2   -0
paddlespeech/cli/st/infer.py                                  +1   -1
paddlespeech/cli/tts/infer.py                                 +1   -1
paddlespeech/resource/pretrained_models.py                    +1   -1
paddlespeech/server/engine/asr/python/asr_engine.py           +7   -1
paddlespeech/t2s/models/fastspeech2/fastspeech2.py            +1   -1
paddlespeech/t2s/modules/nets_utils.py                        +1   -1
.pre-commit-config.yaml

@@ -3,11 +3,7 @@ repos:
```yaml
    rev: v0.16.0
    hooks:
    -   id: yapf
        name: yapf
        language: python
        entry: yapf
        args: [-i, -vv]
        types: [python]
        files: \.py$
        exclude: (?=runtime/engine/kaldi|audio/paddleaudio/src|third_party).*(\.cpp|\.cc|\.h\.hpp|\.py)$
-   repo: https://github.com/pre-commit/pre-commit-hooks
```
demos/speech_server/README.md

@@ -34,6 +34,8 @@ Currently the engine type supports two forms: python and inference (Paddle Infer

  paddlespeech_server start --config_file ./conf/application.yaml
  ```

  > **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file.

  Usage:
  ```bash

@@ -85,6 +87,7 @@ Here are sample files for this ASR client demo that can be downloaded:

  ```bash
  wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
  wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
  wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
  ```

  **Note:** The response time will be slightly longer when using the client for the first time.

@@ -92,8 +95,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

  If `127.0.0.1` is not accessible, you need to use the actual service IP address.

  ```bash
  paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
  # Chinese and English mixed speech recognition, using `./conf/conformer_talcs_application.yaml` config file
  paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
  ```

  Usage:
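For reference, the same mixed Chinese/English request can also be issued from Python instead of the CLI. This is a minimal sketch, assuming the `ASRClientExecutor` helper shipped with this speech_server demo and a server already started with `./conf/conformer_talcs_application.yaml` on port 8090; the `lang` and `audio_format` values mirror the demo defaults and are assumptions here.

```python
# Sketch: calling the ASR server from Python rather than via paddlespeech_client.
# Assumes the server is running with ./conf/conformer_talcs_application.yaml
# and that ./ch_zh_mix.wav has been downloaded as shown above.
from paddlespeech.server.bin.paddlespeech_client import ASRClientExecutor

asr_client = ASRClientExecutor()
result = asr_client(
    input="./ch_zh_mix.wav",
    server_ip="127.0.0.1",
    port=8090,
    sample_rate=16000,
    lang="zh_cn",        # demo default; assumption
    audio_format="wav")
print(result)
```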
demos/speech_server/README_cn.md

@@ -37,6 +37,8 @@

  paddlespeech_server start --config_file ./conf/application.yaml
  ```

  > **Note:** For mixed Chinese and English speech recognition, please use the `./conf/conformer_talcs_application.yaml` configuration file.

  Usage:
  ```bash

@@ -79,6 +81,8 @@

  [2022-02-23 14:57:56] [INFO] [server.py:204] Uvicorn running on http://0.0.0.0:8090 (Press CTRL+C to quit)
  ```

  ### 4. ASR Client Usage

  The input of the ASR client is a WAV file (`.wav`), and its sample rate must match the model's sample rate.

@@ -87,6 +91,7 @@ The input of the ASR client is a WAV file (`.wav`), and its sample rate must

  ```bash
  wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
  wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
  wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/ch_zh_mix.wav
  ```

  **Note:** The response time will be slightly longer when using the client for the first time.

@@ -94,8 +99,11 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav

  If `127.0.0.1` is not accessible, use the actual service IP address.

  ```bash
  paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
  # Chinese and English mixed speech recognition, using the `./conf/conformer_talcs_application.yaml` config file
  paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./ch_zh_mix.wav
  ```

  Usage:
demos/speech_server/conf/conformer_talcs_application.yaml (new file, 0 → 100644)

```yaml
# This is the parameter configuration file for PaddleSpeech Offline Serving.

#################################################################################
#                             SERVER SETTING                                    #
#################################################################################
host: 0.0.0.0
port: 8090

# The task format in the engin_list is: <speech task>_<engine type>
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference', 'cls_python', 'cls_inference', 'text_python', 'vector_python']
protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']


#################################################################################
#                               ENGINE CONFIG                                   #
#################################################################################

################################### ASR #########################################
################### speech task: asr; engine_type: python #######################
asr_python:
    model: 'conformer_talcs'
    lang: 'zh_en'
    sample_rate: 16000
    cfg_path:   # [optional]
    ckpt_path:  # [optional]
    decode_method: 'attention_rescoring'
    force_yes: True
    codeswitch: True
    device:  # set 'gpu:id' or 'cpu'

################### speech task: asr; engine_type: inference #######################
asr_inference:
    # model_type choices=['deepspeech2offline_aishell']
    model_type: 'deepspeech2offline_aishell'
    am_model:   # the pdmodel file of am static model [optional]
    am_params:  # the pdiparams file of am static model [optional]
    lang: 'zh'
    sample_rate: 16000
    cfg_path:
    decode_method:
    force_yes: True

    am_predictor_conf:
        device:  # set 'gpu:id' or 'cpu'
        switch_ir_optim: True
        glog_info: False   # True -> print glog
        summary: True      # False -> do not show predictor config

################################### TTS #########################################
################### speech task: tts; engine_type: python #######################
tts_python:
    # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc',
    #                              'fastspeech2_ljspeech', 'fastspeech2_aishell3',
    #                              'fastspeech2_vctk', 'fastspeech2_mix',
    #                              'tacotron2_csmsc', 'tacotron2_ljspeech']
    am: 'fastspeech2_csmsc'
    am_config:
    am_ckpt:
    am_stat:
    phones_dict:
    tones_dict:
    speaker_dict:

    # voc (vocoder) choices=['pwgan_csmsc', 'pwgan_ljspeech', 'pwgan_aishell3',
    #                        'pwgan_vctk', 'mb_melgan_csmsc', 'style_melgan_csmsc',
    #                        'hifigan_csmsc', 'hifigan_ljspeech', 'hifigan_aishell3',
    #                        'hifigan_vctk', 'wavernn_csmsc']
    voc: 'mb_melgan_csmsc'
    voc_config:
    voc_ckpt:
    voc_stat:

    # others
    lang: 'zh'
    device:  # set 'gpu:id' or 'cpu'

################### speech task: tts; engine_type: inference #######################
tts_inference:
    # am (acoustic model) choices=['speedyspeech_csmsc', 'fastspeech2_csmsc']
    am: 'fastspeech2_csmsc'
    am_model:   # the pdmodel file of your am static model (XX.pdmodel)
    am_params:  # the pdiparams file of your am static model (XX.pdipparams)
    am_sample_rate: 24000
    phones_dict:
    tones_dict:
    speaker_dict:

    am_predictor_conf:
        device:  # set 'gpu:id' or 'cpu'
        switch_ir_optim: True
        glog_info: False   # True -> print glog
        summary: True      # False -> do not show predictor config

    # voc (vocoder) choices=['pwgan_csmsc', 'mb_melgan_csmsc','hifigan_csmsc']
    voc: 'mb_melgan_csmsc'
    voc_model:   # the pdmodel file of your vocoder static model (XX.pdmodel)
    voc_params:  # the pdiparams file of your vocoder static model (XX.pdipparams)
    voc_sample_rate: 24000

    voc_predictor_conf:
        device:  # set 'gpu:id' or 'cpu'
        switch_ir_optim: True
        glog_info: False   # True -> print glog
        summary: True      # False -> do not show predictor config

    # others
    lang: 'zh'

################################### CLS #########################################
################### speech task: cls; engine_type: python #######################
cls_python:
    # model choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
    model: 'panns_cnn14'
    cfg_path:    # [optional] Config of cls task.
    ckpt_path:   # [optional] Checkpoint file of model.
    label_file:  # [optional] Label file of cls task.
    device:      # set 'gpu:id' or 'cpu'

################### speech task: cls; engine_type: inference #######################
cls_inference:
    # model_type choices=['panns_cnn14', 'panns_cnn10', 'panns_cnn6']
    model_type: 'panns_cnn14'
    cfg_path:
    model_path:   # the pdmodel file of am static model [optional]
    params_path:  # the pdiparams file of am static model [optional]
    label_file:   # [optional] Label file of cls task.

    predictor_conf:
        device:  # set 'gpu:id' or 'cpu'
        switch_ir_optim: True
        glog_info: False   # True -> print glog
        summary: True      # False -> do not show predictor config

################################### Text #########################################
################### text task: punc; engine_type: python #######################
text_python:
    task: punc
    model_type: 'ernie_linear_p3_wudao'
    lang: 'zh'
    sample_rate: 16000
    cfg_path:    # [optional]
    ckpt_path:   # [optional]
    vocab_file:  # [optional]
    device:      # set 'gpu:id' or 'cpu'

################################### Vector ######################################
################### Vector task: spk; engine_type: python #######################
vector_python:
    task: spk
    model_type: 'ecapatdnn_voxceleb12'
    sample_rate: 16000
    cfg_path:   # [optional]
    ckpt_path:  # [optional]
    device:     # set 'gpu:id' or 'cpu'
```
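Before launching the server, the new configuration can be sanity-checked from Python. This is a small sketch assuming PyYAML is installed and the file sits at the demo-relative path shown above:

```python
# Sketch: inspect the new serving config before running paddlespeech_server start.
# Assumes PyYAML is available and the repo-relative path below.
import yaml

with open("demos/speech_server/conf/conformer_talcs_application.yaml") as f:
    conf = yaml.safe_load(f)

print(conf["engine_list"])               # ['asr_python', 'tts_python', 'cls_python', 'text_python', 'vector_python']
print(conf["asr_python"]["model"])       # 'conformer_talcs'
print(conf["asr_python"]["lang"])        # 'zh_en'
print(conf["asr_python"]["codeswitch"])  # True -> mixed Chinese/English decoding
```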
docs/source/released_model.md

  # Released Models

  > !!! Since PaddlePaddle supports 0-D tensors from 2.5.0, existing PaddleSpeech static models will not work with it; please re-export your static models.

  ## Speech-to-Text Models
  ### Speech Recognition Model
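This 0-D tensor behavior appears to be what motivates several of the Python fixes below, which replace integer indexing on `paddle.shape(...)` with slicing so that a 1-D tensor is preserved. A minimal sketch of the difference, assuming PaddlePaddle >= 2.5.0 is installed:

```python
import paddle  # assumes PaddlePaddle >= 2.5.0

x = paddle.zeros([4, 100, 80])   # e.g. a (batch, time, feature) tensor

# Integer indexing on the shape tensor yields a 0-D (scalar) tensor under 2.5 ...
ilens_0d = paddle.shape(x)[0]
print(ilens_0d.shape)            # []

# ... while slicing keeps a 1-D tensor of shape [1], which is what the patched
# code (e.g. `paddle.shape(x)[0:1]` in fastspeech2.py) relies on.
ilens_1d = paddle.shape(x)[0:1]
print(ilens_1d.shape)            # [1]
```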
paddlespeech/cli/st/infer.py

```diff
@@ -252,7 +252,7 @@ class STExecutor(BaseExecutor):
             norm_feat = dict(kaldiio.load_ark(process.stdout))[utt_name]
             self._inputs["audio"] = paddle.to_tensor(norm_feat).unsqueeze(0)
             self._inputs["audio_len"] = paddle.to_tensor(
-                self._inputs["audio"].shape[1], dtype="int64")
+                self._inputs["audio"].shape[1:2], dtype="int64")
         else:
             raise ValueError("Wrong model type.")
```
paddlespeech/cli/tts/infer.py

```diff
@@ -491,7 +491,7 @@ class TTSExecutor(BaseExecutor):
                 # multi speaker
                 if am_dataset in {'aishell3', 'vctk', 'mix', 'canton'}:
                     mel = self.am_inference(
-                        part_phone_ids, spk_id=paddle.to_tensor(spk_id))
+                        part_phone_ids, spk_id=paddle.to_tensor([spk_id]))
                 else:
                     mel = self.am_inference(part_phone_ids)
                 self.am_time += (time.time() - am_st)
```
paddlespeech/resource/pretrained_models.py

```diff
@@ -274,7 +274,7 @@ asr_dynamic_pretrained_models = {
         'url':
         'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_1.5.0.model.tar.gz',
         'md5':
-        'a0adb2b204902982718bc1d8917f7038',
+        '38924b8adc28ef458847c3571e87e3cb',
         'cfg_path':
         'model.yaml',
         'ckpt_path':
```
paddlespeech/server/engine/asr/python/asr_engine.py

```diff
@@ -67,13 +67,19 @@ class ASREngine(BaseEngine):
             logger.error(e)
             return False

+        cs = False
+        if self.config.lang == "zh_en":
+            cs = True
         self.executor._init_from_path(
             model_type=self.config.model,
             lang=self.config.lang,
             sample_rate=self.config.sample_rate,
             cfg_path=self.config.cfg_path,
             decode_method=self.config.decode_method,
-            ckpt_path=self.config.ckpt_path)
+            ckpt_path=self.config.ckpt_path,
+            codeswitch=cs)

         logger.info(
             "Initialize ASR server engine successfully on device: %s." %
             (self.device))
```
paddlespeech/t2s/models/fastspeech2/fastspeech2.py

```diff
@@ -783,7 +783,7 @@ class FastSpeech2(nn.Layer):
         x = paddle.cast(text, 'int64')
         d, p, e = durations, pitch, energy
         # setup batch axis
-        ilens = paddle.shape(x)[0]
+        ilens = paddle.shape(x)[0:1]

         xs = x.unsqueeze(0)
```
paddlespeech/t2s/modules/nets_utils.py

```diff
@@ -181,7 +181,7 @@ def make_pad_mask(lengths, xs=None, length_dim=-1):
     if length_dim == 0:
         raise ValueError("length_dim cannot be 0: {}".format(length_dim))

-    bs = paddle.shape(lengths)[0]
+    bs = paddle.shape(lengths)
     if xs is None:
         maxlen = paddle.cast(lengths.max(), dtype=bs.dtype)
     else:
```
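For context, `make_pad_mask` turns a batch of sequence lengths into a boolean padding mask. Below is a minimal usage sketch; the stated output shape and the "True at padded positions" convention follow the usual ESPnet-style semantics and are assumptions here, not something this diff asserts.

```python
# Sketch: typical use of make_pad_mask after this change.
# Output shape/values assume ESPnet-style semantics (True where padded).
import paddle
from paddlespeech.t2s.modules.nets_utils import make_pad_mask

lengths = paddle.to_tensor([5, 3, 2])
mask = make_pad_mask(lengths)    # expected shape [3, 5]; row i True where t >= lengths[i]
print(mask.astype("int64"))
```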