Commit d46cee6a, authored by Yang Zhou

Merge branch 'develop' of github.com:SmileGoat/PaddleSpeech into audio_dev

@@ -39,6 +39,9 @@ tools/env.sh
 tools/openfst-1.8.1/
 tools/libsndfile/
 tools/python-soundfile/
+tools/onnx
+tools/onnxruntime
+tools/Paddle2ONNX
 speechx/fc_patch/
...
@@ -52,7 +52,7 @@ pull_request_rules:
         add: ["T2S"]
   - name: "auto add label=Audio"
     conditions:
-      - files~=^paddleaudio/
+      - files~=^paddlespeech/audio/
     actions:
       label:
         add: ["Audio"]
@@ -100,7 +100,7 @@ pull_request_rules:
         add: ["README"]
   - name: "auto add label=Documentation"
     conditions:
-      - files~=^(docs/|CHANGELOG.md|paddleaudio/CHANGELOG.md)
+      - files~=^(docs/|CHANGELOG.md)
     actions:
       label:
         add: ["Documentation"]
...
# Changelog

Date: 2022-3-15, Author: Xiaojie Chen.
- Add kaldi- and librosa-style mfcc, fbank and spectrogram features.
- Add unit tests and benchmarks.

Date: 2022-2-25, Author: Hui Zhang.
- Refactor the architecture.
- Add dtw distance and mcd-style dtw.
# PaddleAudio
PaddleAudio is an audio library for PaddlePaddle.
## Install
`pip install .`
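
As a quick smoke test after installing (a minimal sketch; it assumes the `paddleaudio.features.LogMelSpectrogram` API referenced later in this commit, and the constructor arguments are illustrative):

```python
# sketch: extract log-mel features from a dummy waveform with paddleaudio;
# the parameter values here are illustrative assumptions
import paddle
from paddleaudio.features import LogMelSpectrogram

waveform = paddle.randn([1, 16000])  # 1 second of fake 16 kHz audio, (batch, samples)
extractor = LogMelSpectrogram(sr=16000, n_fft=512, hop_length=160, n_mels=64)
feats = extractor(waveform)
print(feats.shape)  # typically (batch, n_mels, frames)
```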
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
# Build docs for PaddleAudio
Execute the following steps in the **current directory**.
## 1. Install
`pip install Sphinx sphinx_rtd_theme`
## 2. Generate API docs
Generate the API docs from docstrings.
`sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils --templatedir source/_templates`
## 3. Build
`sphinx-build source _html`
## 4. Preview
Open `_html/index.html` for page preview.
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
This module is used to store environmental variables in PaddleAudio.

PPAUDIO_HOME --> the root directory for storing PaddleAudio related data.
    Defaults to ~/.paddleaudio. Users can change the default value through
    the PPAUDIO_HOME environment variable.
├─ MODEL_HOME --> Store model files.
└─ DATA_HOME --> Store automatically downloaded datasets.
'''
import os

__all__ = [
    'USER_HOME',
    'PPAUDIO_HOME',
    'MODEL_HOME',
    'DATA_HOME',
]


def _get_user_home():
    return os.path.expanduser('~')


def _get_ppaudio_home():
    if 'PPAUDIO_HOME' in os.environ:
        home_path = os.environ['PPAUDIO_HOME']
        if os.path.exists(home_path):
            if os.path.isdir(home_path):
                return home_path
            else:
                raise RuntimeError(
                    'The environment variable PPAUDIO_HOME {} is not a directory.'.
                    format(home_path))
        else:
            return home_path
    return os.path.join(_get_user_home(), '.paddleaudio')


def _get_sub_home(directory):
    home = os.path.join(_get_ppaudio_home(), directory)
    if not os.path.exists(home):
        os.makedirs(home)
    return home


USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')
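
For example, the cache root can be redirected before the module is first imported (a sketch; the directory and the `paddleaudio.utils.env` module path are assumptions):

```python
# sketch: override the PaddleAudio root before importing the module above;
# '/data/ppaudio_cache' is a hypothetical path, and the module location is assumed
import os
os.environ['PPAUDIO_HOME'] = '/data/ppaudio_cache'

from paddleaudio.utils.env import DATA_HOME, MODEL_HOME

print(MODEL_HOME)  # /data/ppaudio_cache/models
print(DATA_HOME)   # /data/ppaudio_cache/datasets
```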
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os

import setuptools
from setuptools.command.install import install
from setuptools.command.test import test

# set the version here
VERSION = '0.0.0'


# Inspired by the example at https://pytest.org/latest/goodpractises.html
class TestCommand(test):
    def finalize_options(self):
        test.finalize_options(self)
        self.test_args = []
        self.test_suite = True

    def run(self):
        self.run_benchmark()
        super(TestCommand, self).run()

    def run_tests(self):
        # Run nose ensuring that argv simulates running nosetests directly
        import nose
        nose.run_exit(argv=['nosetests', '-w', 'tests'])

    def run_benchmark(self):
        for benchmark_item in glob.glob('tests/benchmark/*py'):
            os.system(f'pytest {benchmark_item}')


class InstallCommand(install):
    def run(self):
        install.run(self)


def write_version_py(filename='paddleaudio/__init__.py'):
    with open(filename, "a") as f:
        f.write(f"__version__ = '{VERSION}'")


def remove_version_py(filename='paddleaudio/__init__.py'):
    with open(filename, "r") as f:
        lines = f.readlines()
    with open(filename, "w") as f:
        for line in lines:
            if "__version__" not in line:
                f.write(line)


remove_version_py()
write_version_py()

setuptools.setup(
    name="paddleaudio",
    version=VERSION,
    author="",
    author_email="",
    description="PaddleAudio, in development",
    long_description="",
    long_description_content_type="text/markdown",
    url="",
    packages=setuptools.find_packages(include=['paddleaudio*']),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.6',
    install_requires=[
        'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
        'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8'
    ],
    extras_require={
        'test': [
            'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1',
            'torchaudio==0.10.2', 'pytest-benchmark'
        ],
    },
    cmdclass={
        'install': InstallCommand,
        'test': TestCommand,
    }, )

remove_version_py()
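
A quick, self-contained check of the version-stamping helpers above (a sketch run against a scratch file rather than `paddleaudio/__init__.py`, assuming `write_version_py` and `remove_version_py` are in scope):

```python
# sketch: exercise write_version_py/remove_version_py on a temporary file
# (assumes the two helpers above are importable or pasted into the session)
import os
import tempfile

tmp = tempfile.NamedTemporaryFile('w', suffix='.py', delete=False)
tmp.write('# package init\n')
tmp.close()

write_version_py(filename=tmp.name)
assert "__version__ = '0.0.0'" in open(tmp.name).read()

remove_version_py(filename=tmp.name)
assert '__version__' not in open(tmp.name).read()
os.unlink(tmp.name)
```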
@@ -89,7 +89,7 @@ Then to start the system server, and it provides HTTP backend services.
 Then start the server with Fastapi.
 ```bash
-export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
+export PYTHONPATH=$PYTHONPATH:./src
 python src/audio_search.py
 ```
...
@@ -91,7 +91,7 @@ ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…"
 Start the service built with Fastapi.
 ```bash
-export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
+export PYTHONPATH=$PYTHONPATH:./src
 python src/audio_search.py
 ```
...
@@ -33,6 +33,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```bash
 # in PaddleSpeech/demos/streaming_asr_server start the service
 paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
+# if you want faster decoding, use the config file below; it speeds up decoding at some cost in accuracy
+paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application_faster.yaml
 ```
 Usage:
...
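
The running server can also be queried from Python rather than through the CLI client; a sketch, assuming the `ASROnlineClientExecutor` API exposed by `paddlespeech.server` and a local `zh.wav`:

```python
# sketch: call the streaming ASR server started above from Python;
# assumes paddlespeech.server's ASROnlineClientExecutor
from paddlespeech.server.bin.paddlespeech_client import ASROnlineClientExecutor

executor = ASROnlineClientExecutor()
result = executor(
    input='./zh.wav',
    server_ip='127.0.0.1',
    port=8090,
    sample_rate=16000,
    lang='zh_cn',
    audio_format='wav')
print(result)
```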
@@ -21,7 +21,7 @@
 After downloading `PaddleSpeech`, go to the `PaddleSpeech/demos/streaming_asr_server` directory.
 The config files can be found in that directory: `conf/ws_application.yaml` and `conf/ws_conformer_wenetspeech_application.yaml`.
 The models currently integrated into the service are DeepSpeech2 and conformer; the corresponding config files are:
 * DeepSpeech: `conf/ws_application.yaml`
 * conformer: `conf/ws_conformer_wenetspeech_application.yaml`
@@ -40,6 +40,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 ```bash
 # start the service in the PaddleSpeech/demos/streaming_asr_server directory
 paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
+# if you are willing to trade a little model accuracy for faster decoding, you can use the following script
+paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application_faster.yaml
 ```
 Usage:
...
@@ -28,6 +28,9 @@ asr_online:
     sample_rate: 16000
     cfg_path:
     decode_method:
+    num_decoding_left_chunks: -1
     force_yes: True
     device: 'cpu' # cpu or gpu:id
     decode_method: "attention_rescoring"
...
@@ -32,7 +32,7 @@ asr_online:
     device: 'cpu' # cpu or gpu:id
     decode_method: "attention_rescoring"
     continuous_decoding: True # enable continue decoding when endpoint detected
+    num_decoding_left_chunks: -1
 am_predictor_conf:
     device: # set 'gpu:id' or 'cpu'
     switch_ir_optim: True
...
@@ -7,8 +7,8 @@ host: 0.0.0.0
 port: 8090
 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_online', 'tts_online']
-# protocol = ['websocket', 'http'] (only one can be selected).
+# task choices = ['asr_online']
+# protocol = ['websocket'] (only one can be selected).
 # websocket only supports online engine type.
 protocol: 'websocket'
 engine_list: ['asr_online']
@@ -21,7 +21,7 @@ engine_list: ['asr_online']
 ################################### ASR #########################################
 ################### speech task: asr; engine_type: online #######################
 asr_online:
-    model_type: 'deepspeech2online_aishell'
+    model_type: 'conformer_online_wenetspeech'
     am_model: # the pdmodel file of am static model [optional]
     am_params: # the pdiparams file of am static model [optional]
     lang: 'zh'
@@ -29,8 +29,10 @@ asr_online:
     cfg_path:
     decode_method:
     force_yes: True
-    device: # cpu or gpu:id
+    device: 'cpu' # cpu or gpu:id
+    decode_method: "attention_rescoring"
+    continuous_decoding: True # enable continue decoding when endpoint detected
+    num_decoding_left_chunks: 16
     am_predictor_conf:
         device: # set 'gpu:id' or 'cpu'
         switch_ir_optim: True
@@ -38,11 +40,9 @@ asr_online:
         summary: True # False -> do not show predictor config
     chunk_buffer_conf:
-        frame_duration_ms: 80
-        shift_ms: 40
-        sample_rate: 16000
-        sample_width: 2
        window_n: 7 # frame
        shift_n: 4 # frame
-       window_ms: 20 # ms
+       window_ms: 25 # ms
        shift_ms: 10 # ms
+       sample_rate: 16000
+       sample_width: 2
@@ -7,11 +7,11 @@ host: 0.0.0.0
 port: 8090
 # The task format in the engine_list is: <speech task>_<engine type>
-# task choices = ['asr_online']
+# task choices = ['asr_online-inference', 'asr_online-onnx']
 # protocol = ['websocket'] (only one can be selected).
 # websocket only supports online engine type.
 protocol: 'websocket'
-engine_list: ['asr_online']
+engine_list: ['asr_online-onnx']
 #################################################################################
@@ -19,15 +19,16 @@ engine_list: ['asr_online']
 #################################################################################
 ################################### ASR #########################################
-################### speech task: asr; engine_type: online #######################
-asr_online:
-    model_type: 'deepspeech2online_aishell'
+################### speech task: asr; engine_type: online-inference #######################
+asr_online-inference:
+    model_type: 'deepspeech2online_wenetspeech'
     am_model: # the pdmodel file of am static model [optional]
     am_params: # the pdiparams file of am static model [optional]
     lang: 'zh'
     sample_rate: 16000
     cfg_path:
     decode_method:
+    num_decoding_left_chunks:
     force_yes: True
     device: 'cpu' # cpu or gpu:id
@@ -37,6 +38,41 @@ asr_online:
     glog_info: False # True -> print glog
     summary: True # False -> do not show predictor config
+    chunk_buffer_conf:
+        frame_duration_ms: 85
+        shift_ms: 40
+        sample_rate: 16000
+        sample_width: 2
+        window_n: 7 # frame
+        shift_n: 4 # frame
+        window_ms: 25 # ms
+        shift_ms: 10 # ms
+
+################################### ASR #########################################
+################### speech task: asr; engine_type: online-onnx #######################
+asr_online-onnx:
+    model_type: 'deepspeech2online_wenetspeech'
+    am_model: # the pdmodel file of onnx am static model [optional]
+    am_params: # the pdiparams file of am static model [optional]
+    lang: 'zh'
+    sample_rate: 16000
+    cfg_path:
+    decode_method:
+    num_decoding_left_chunks:
+    force_yes: True
+    device: 'cpu' # cpu or gpu:id
+
+    # https://onnxruntime.ai/docs/api/python/api_summary.html#inferencesession
+    am_predictor_conf:
+        device: 'cpu' # set 'gpu:id' or 'cpu'
+        graph_optimization_level: 0
+        intra_op_num_threads: 0 # Sets the number of threads used to parallelize the execution within nodes.
+        inter_op_num_threads: 0 # Sets the number of threads used to parallelize the execution of the graph (across nodes).
+        log_severity_level: 2 # Log severity level. Applies to session load, initialization, etc. 0:Verbose, 1:Info, 2:Warning, 3:Error, 4:Fatal. Default is 2.
+        log_verbosity_level: 0 # VLOG level if DEBUG build and session_log_severity_level is 0. Applies to session load, initialization, etc. Default is 0.
     chunk_buffer_conf:
         frame_duration_ms: 80
         shift_ms: 40
@@ -44,5 +80,5 @@ asr_online:
         sample_width: 2
         window_n: 7 # frame
         shift_n: 4 # frame
-        window_ms: 20 # ms
+        window_ms: 25 # ms
         shift_ms: 10 # ms
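
The `am_predictor_conf` keys in the `asr_online-onnx` section map directly onto onnxruntime's `SessionOptions` (see the API summary linked in the config). A sketch of the equivalent Python, with a hypothetical model path:

```python
# sketch: how the asr_online-onnx am_predictor_conf above translates to
# onnxruntime; 'am.onnx' is a hypothetical model path
import onnxruntime as ort

so = ort.SessionOptions()
so.graph_optimization_level = ort.GraphOptimizationLevel.ORT_DISABLE_ALL  # level 0
so.intra_op_num_threads = 0   # 0 -> let onnxruntime choose
so.inter_op_num_threads = 0
so.log_severity_level = 2     # warnings and above
so.log_verbosity_level = 0
sess = ort.InferenceSession(
    'am.onnx', sess_options=so, providers=['CPUExecutionProvider'])
```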
#!/usr/bin/env python3
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(prog=__doc__)
    parser.add_argument(
        '--logfile', type=str, required=True, help='ws client log file')
    args = parser.parse_args()

    rtfs = []
    with open(args.logfile, 'r') as f:
        for line in f:
            if 'RTF=' in line:
                # e.g. audio duration: 6.126, elapsed time: 3.471978187561035, RTF=0.5667610492264177
                line = line.strip()
                beg = line.index("audio")
                line = line[beg:]
                items = line.split(',')
                vals = []
                for elem in items:
                    if "RTF=" in elem:
                        continue
                    _, val = elem.split(":")
                    vals.append(eval(val))
                keys = ['T', 'P']
                meta = dict(zip(keys, vals))
                rtfs.append(meta)

    T = 0.0
    P = 0.0
    n = 0
    for m in rtfs:
        n += 1
        T += m['T']
        P += m['P']

    print(f"RTF: {P/T}, utts: {n}")
#!/bin/bash
if [ $# != 1 ];then
echo "usage: $0 wav_scp"
exit -1
fi
scp=$1
# calc RTF
# wav_scp can generate from `speechx/examples/ds2_ol/aishell`
exp=exp
mkdir -p $exp
python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --wavscp $scp &> $exp/log.rsl
python3 local/rtf_from_log.py --logfile $exp/log.rsl
\ No newline at end of file
+#!/usr/bin/python
 # Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
@@ -11,9 +12,9 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#!/usr/bin/python
-# -*- coding: UTF-8 -*-
-# script for calc RTF: grep -rn RTF log.txt | awk '{print $NF}' | awk -F "=" '{sum += $NF} END {print "all time",sum, "audio num", NR, "RTF", sum/NR}'
+# calc avg RTF(NOT Accurate): grep -rn RTF log.txt | awk '{print $NF}' | awk -F "=" '{sum += $NF} END {print "all time",sum, "audio num", NR, "RTF", sum/NR}'
+# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
+# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav
 import argparse
 import asyncio
 import codecs
...
@@ -4,6 +4,6 @@ export CUDA_VISIBLE_DEVICE=0,1,2,3
 # nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 &
 paddlespeech_server start --config_file conf/punc_application.yaml &> punc.log &
-# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_application.yaml > streaming_asr.log 2>&1 &
-paddlespeech_server start --config_file conf/ws_conformer_application.yaml &> streaming_asr.log &
+# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 &
+paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application.yaml &> streaming_asr.log &
@@ -3,11 +3,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
 # read the wav and pass it to only streaming asr service
 # If `127.0.0.1` is not accessible, you need to use the actual service IP address.
-# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav
-paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --input ./zh.wav
+paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav

 # read the wav and call streaming and punc service
 # If `127.0.0.1` is not accessible, you need to use the actual service IP address.
-# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
 paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
 # Customize Dataset for Audio Classification
-Following this tutorial you can customize your dataset for audio classification task by using `paddlespeech` and `paddleaudio`.
+Following this tutorial you can customize your dataset for the audio classification task by using `paddlespeech`.
-A base class of classification dataset is `paddleaudio.dataset.AudioClassificationDataset`. To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`.
+A base class of classification dataset is `paddlespeech.audio.dataset.AudioClassificationDataset`. To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`.
 Assuming you have some wave files stored in your own directory, you should prepare a meta file with the information of filepaths and labels. For example, its absolute path is `/PATH/TO/META_FILE.txt`:
 ```
@@ -14,7 +14,7 @@
 Here is an example to build your custom dataset in `custom_dataset.py`:
 ```python
-from paddleaudio.datasets.dataset import AudioClassificationDataset
+from paddlespeech.audio.datasets.dataset import AudioClassificationDataset
 class CustomDataset(AudioClassificationDataset):
     meta_file = '/PATH/TO/META_FILE.txt'
@@ -48,7 +48,7 @@ class CustomDataset(AudioClassificationDataset):
 Then you can build dataset and data loader from `CustomDataset`:
 ```python
 import paddle
-from paddleaudio.features import LogMelSpectrogram
+from paddlespeech.audio.features import LogMelSpectrogram
 from custom_dataset import CustomDataset
...
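
To make the elided pipeline concrete, here is a sketch of wiring `CustomDataset` into a feature extractor and data loader (the constructor arguments and parameter values are illustrative assumptions, not the tutorial's exact code):

```python
# sketch: batch the custom dataset and extract log-mel features;
# CustomDataset's constructor arguments and the feature parameters are assumptions
import paddle
from paddlespeech.audio.features import LogMelSpectrogram

from custom_dataset import CustomDataset

train_ds = CustomDataset()
train_loader = paddle.io.DataLoader(train_ds, batch_size=4, shuffle=True)
feature_extractor = LogMelSpectrogram(sr=16000, n_fft=1024, hop_length=320, n_mels=64)

for waveforms, labels in train_loader:
    feats = feature_extractor(waveforms)  # typically (batch, n_mels, frames)
    print(feats.shape, labels.shape)
    break
```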
@@ -6,15 +6,15 @@
 ### Speech Recognition Model
 Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link
 :-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----:
-[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM) <br> 0.2417 (test\_meeting, w/o LM) <br> 0.053 (aishell, w/ LM) |-| 10000 h |-
+[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.2.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM) <br> 0.2417 (test\_meeting, w/o LM) <br> 0.053 (aishell, w/ LM) |-| 10000 h |-
 [Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
-[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
+[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
 [Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |-
 [Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1)
 [Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
 [Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
-[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz)| Librispeech Dataset | Char-based | 518 MB | 2 Conv + 3 bidirectional LSTM layers| - |0.0725| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0)
+[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz)| Librispeech Dataset | Char-based | 1.3 GB | 2 Conv + 5 bidirectional LSTM layers| - |0.0467| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0)
-[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1)
+[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0338 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1)
 [Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1)
 [Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2)
...
@@ -12,7 +12,8 @@
 ## Deepspeech2 Non-Streaming
 | Model | Number of Params | Release | Config | Test set | Valid Loss | CER |
 | --- | --- | --- | --- | --- | --- | --- |
+| DeepSpeech2 | 122.3M | r1.0.1 | conf/deepspeech2.yaml + U2 Data pipeline and spec aug + fbank161 | test | 5.780756044387817 | 0.055400 |
 | DeepSpeech2 | 58.4M | v2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.738585948944092 | 0.064000 |
 | DeepSpeech2 | 58.4M | v2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
 | DeepSpeech2 | 58.4M | v2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
...
 #!/bin/bash

-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 mkdir -p exp
@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=10
 audio_file=data/demo_01_03.wav
@@ -24,7 +25,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
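
With the new optional argument, a multi-node run passes the comma-separated node addresses through `run.sh` into `train.sh`, which forwards them as `--ips` to `paddle.distributed.launch`. A sketch of driving this from Python (the addresses are hypothetical placeholders):

```python
# sketch: invoke the updated train.sh with the optional ips argument;
# the node addresses below are hypothetical placeholders
import os
import subprocess

env = dict(os.environ, CUDA_VISIBLE_DEVICES='0,1,2,3')
subprocess.run(
    ['bash', './local/train.sh', 'conf/deepspeech2.yaml', 'ds2_ckpt',
     '192.168.10.1,192.168.10.2'],  # becomes --ips=... for paddle.distributed.launch
    env=env, check=True)
```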
@@ -17,13 +17,21 @@ if [ ${seed} != 0 ]; then
     echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
 fi

-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi

 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi
+echo ${ips_config}

 mkdir -p exp
@@ -37,7 +45,7 @@ python3 -u ${BIN_DIR}/train.py \
 --benchmark-batch-size ${benchmark_batch_size} \
 --benchmark-max-step ${benchmark_max_step}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --seed ${seed} \
 --config ${config_path} \
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 audio_file=data/demo_01_03.wav
@@ -23,7 +24,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
 #! /usr/bin/env bash

-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 echo "using ${device}..."
@@ -28,7 +35,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/conformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=20
@@ -22,7 +23,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
 data:
-  dataset: 'paddleaudio.datasets:ESC50'
+  dataset: 'paddlespeech.audio.datasets:ESC50'
   num_classes: 50
   train:
     mode: 'train'
...
@@ -2,7 +2,7 @@
 ###########################################
 # Data                                    #
 ###########################################
-dataset: 'paddleaudio.datasets:HeySnips'
+dataset: 'paddlespeech.audio.datasets:HeySnips'
 data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter'
 ############################################
...
@@ -3,6 +3,7 @@
 ## Deepspeech2 Non-Streaming
 | Model | Params | release | Config | Test set | Loss | WER |
 | --- | --- | --- | --- | --- | --- | --- |
+| DeepSpeech2 | 113.96M | r1.0.1 | conf/deepspeech2.yaml + U2 Data pipeline and spec aug + fbank161 | test-clean | 10.76069622039795 | 0.046700 |
 | DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | test-clean | 14.49190807 | 0.067283 |
 | DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | test-clean | 15.184467315673828 | 0.072154 |
 | DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | test-clean | - | 0.073973 |
...
 #!/bin/bash

-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 mkdir -p exp
@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=100
 conf_path=conf/deepspeech2.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=5
 audio_file=data/demo_002_en.wav
@@ -23,7 +24,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
@@ -42,6 +42,11 @@ echo "chunk mode ${chunk_mode}"
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    # format the reference test file
+    python3 utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref data/manifest.test-clean.text
+
     for type in attention; do
         echo "decoding ${type}"
         if [ ${chunk_mode} == true ];then
@@ -63,54 +68,90 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
             echo "Failed in evaluation!"
             exit 1
         fi
+        python3 utils/format_rsl.py \
+            --origin_hyp ${ckpt_prefix}.${type}.rsl \
+            --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+        python3 utils/compute-wer.py --char=1 --v=1 \
+            data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
         echo "decoding ${type} done."
     done

     for type in ctc_greedy_search; do
         echo "decoding ${type}"
         if [ ${chunk_mode} == true ];then
             # stream decoding only support batchsize=1
             batch_size=1
         else
             batch_size=64
         fi
         python3 -u ${BIN_DIR}/test.py \
             --ngpu ${ngpu} \
             --config ${config_path} \
             --decode_cfg ${decode_config_path} \
             --result_file ${ckpt_prefix}.${type}.rsl \
             --checkpoint_path ${ckpt_prefix} \
             --opts decode.decoding_method ${type} \
             --opts decode.decode_batch_size ${batch_size}
         if [ $? -ne 0 ]; then
             echo "Failed in evaluation!"
             exit 1
         fi
+        python3 utils/format_rsl.py \
+            --origin_hyp ${ckpt_prefix}.${type}.rsl \
+            --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+        python3 utils/compute-wer.py --char=1 --v=1 \
+            data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
         echo "decoding ${type} done."
     done

     for type in ctc_prefix_beam_search attention_rescoring; do
         echo "decoding ${type}"
         batch_size=1
         python3 -u ${BIN_DIR}/test.py \
             --ngpu ${ngpu} \
             --config ${config_path} \
             --decode_cfg ${decode_config_path} \
             --result_file ${ckpt_prefix}.${type}.rsl \
             --checkpoint_path ${ckpt_prefix} \
             --opts decode.decoding_method ${type} \
             --opts decode.decode_batch_size ${batch_size}
         if [ $? -ne 0 ]; then
             echo "Failed in evaluation!"
             exit 1
         fi
+        python3 utils/format_rsl.py \
+            --origin_hyp ${ckpt_prefix}.${type}.rsl \
+            --trans_hyp ${ckpt_prefix}.${type}.rsl.text
+        python3 utils/compute-wer.py --char=1 --v=1 \
+            data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
         echo "decoding ${type} done."
     done
 fi

+if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
+    python3 utils/format_rsl.py \
+        --origin_ref data/manifest.test-clean.raw \
+        --trans_ref_sclite data/manifest.test.text-clean.sclite
+
+    output_dir=${ckpt_prefix}
+    for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
+        python utils/format_rsl.py \
+            --origin_hyp ${output_dir}/${type}.rsl \
+            --trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite
+
+        mkdir -p ${output_dir}/${type}_sclite
+        sclite -i wsj -r data/manifest.test-clean.text.sclite -h ${output_dir}/${type}.rsl.text.sclite -e utf-8 -o all -O ${output_dir}/${type}_sclite -c NOASCII
+    done
+fi

 echo "Finished"
...
 #!/bin/bash

-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 mkdir -p exp
@@ -29,7 +36,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
...
@@ -8,6 +8,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=30
 audio_file=data/demo_002_en.wav
@@ -25,7 +26,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
 #!/bin/bash

-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 mkdir -p exp
@@ -27,7 +34,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --model-name u2_kaldi \
 --config ${config_path} \
...
@@ -9,6 +9,7 @@ gpus=0,1,2,3,4,5,6,7
 stage=0
 stop_stage=50
 conf_path=conf/transformer.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/decode/decode_base.yaml
 dict_path=data/lang_char/train_960_unigram5000_units.txt
 avg_num=10
@@ -26,7 +27,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
 #!/bin/bash

-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
+if [ $# -lt 3 ] || [ $# -gt 4 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path ips(optional)"
     exit -1
 fi
@@ -11,6 +11,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
 ckpt_path=$3
+ips=$4
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 mkdir -p exp
@@ -21,12 +28,21 @@ if [ ${seed} != 0 ]; then
     export FLAGS_cudnn_deterministic=True
 fi

+if [ ${ngpu} == 0 ]; then
 python3 -u ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
 --checkpoint_path "${ckpt_path}" \
 --seed ${seed}
+else
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
+--ngpu ${ngpu} \
+--config ${config_path} \
+--output exp/${ckpt_name} \
+--checkpoint_path "${ckpt_path}" \
+--seed ${seed}
+fi

 if [ ${seed} != 0 ]; then
     unset FLAGS_cudnn_deterministic
...
@@ -7,6 +7,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=3
 conf_path=conf/transformer_es.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 must_c_path=
 lang=es
@@ -25,7 +26,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
@@ -36,4 +37,4 @@ fi
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
     # test ckpt avg_n
     CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
 fi
\ No newline at end of file
 #!/bin/bash

-if [ $# != 2 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
+if [ $# -lt 2 ] || [ $# -gt 3 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
     exit -1
 fi
@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
+ips=$3
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 mkdir -p exp
@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
 --output exp/${ckpt_name} \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
...
@@ -6,6 +6,7 @@ gpus=0,1,2,3
 stage=0
 stop_stage=50
 conf_path=conf/transformer_mtl_noam.yaml
+ips= #xx.xx.xx.xx,xx.xx.xx.xx
 decode_conf_path=conf/tuning/decode.yaml
 avg_num=5
 data_path=./TED_EnZh # path to unzipped data
@@ -23,7 +24,7 @@ fi
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     # train model, all `ckpt` under `exp` dir
-    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
 fi

 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
...
 #!/bin/bash

-if [ $# != 3 ];then
-    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
+if [ $# -lt 3 ] || [ $# -gt 4 ];then
+    echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path ips(optional)"
     exit -1
 fi
@@ -11,6 +11,15 @@ echo "using $ngpu gpus..."
 config_path=$1
 ckpt_name=$2
 ckpt_path=$3
+ips=$4
+
+if [ ! $ips ];then
+    ips_config=
+else
+    ips_config="--ips="${ips}
+fi

 mkdir -p exp
@@ -28,7 +37,7 @@ python3 -u ${BIN_DIR}/train.py \
 --checkpoint_path "${ckpt_path}" \
 --seed ${seed}
 else
-python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
+python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
 --ngpu ${ngpu} \
 --config ${config_path} \
 --output exp/${ckpt_name} \
...
...@@ -7,6 +7,7 @@ gpus=0,1,2,3 ...@@ -7,6 +7,7 @@ gpus=0,1,2,3
stage=1 stage=1
stop_stage=4 stop_stage=4
conf_path=conf/transformer_mtl_noam.yaml conf_path=conf/transformer_mtl_noam.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml decode_conf_path=conf/tuning/decode.yaml
ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model) ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
avg_num=5 avg_num=5
...@@ -29,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then ...@@ -29,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Finetune from Pretrained Model" ${ckpt_path} echo "Finetune from Pretrained Model" ${ckpt_path}
./local/download_pretrain.sh || exit -1 ./local/download_pretrain.sh || exit -1
fi fi
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" ${ips}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
...@@ -15,13 +15,20 @@ if [ ${seed} != 0 ]; then ...@@ -15,13 +15,20 @@ if [ ${seed} != 0 ]; then
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi fi
if [ $# != 2 ];then if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1 exit -1
fi fi
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
ips=$3
if [ -z "$ips" ];then
  ips_config=
else
  ips_config="--ips=${ips}"
fi
mkdir -p exp mkdir -p exp
...@@ -33,7 +40,7 @@ python3 -u ${BIN_DIR}/train.py \ ...@@ -33,7 +40,7 @@ python3 -u ${BIN_DIR}/train.py \
--profiler-options "${profiler_options}" \ --profiler-options "${profiler_options}" \
--seed ${seed} --seed ${seed}
else else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--config ${config_path} \ --config ${config_path} \
--output exp/${ckpt_name} \ --output exp/${ckpt_name} \
......
...@@ -2,10 +2,11 @@ ...@@ -2,10 +2,11 @@
set -e set -e
source path.sh source path.sh
gpus=0 gpus=0
stage=0 stage=0
stop_stage=100 stop_stage=100
conf_path=conf/deepspeech2.yaml conf_path=conf/deepspeech2.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1; source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
...@@ -21,7 +22,7 @@ fi ...@@ -21,7 +22,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
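Because this `run.sh` sources `utils/parse_options.sh`, the new `ips` default can be overridden from the command line instead of editing the script. A hypothetical two-node invocation (the addresses are placeholders):

bash run.sh --gpus 0,1,2,3 --ips 192.168.1.10,192.168.1.11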
...@@ -17,13 +17,20 @@ if [ ${seed} != 0 ]; then ...@@ -17,13 +17,20 @@ if [ ${seed} != 0 ]; then
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..." echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi fi
if [ $# != 2 ];then if [ $# -lt 2 ] || [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name" echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1 exit -1
fi fi
config_path=$1 config_path=$1
ckpt_name=$2 ckpt_name=$2
ips=$3
if [ -z "$ips" ];then
  ips_config=
else
  ips_config="--ips=${ips}"
fi
mkdir -p exp mkdir -p exp
...@@ -37,7 +44,7 @@ python3 -u ${BIN_DIR}/train.py \ ...@@ -37,7 +44,7 @@ python3 -u ${BIN_DIR}/train.py \
--benchmark-batch-size ${benchmark_batch_size} \ --benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step} --benchmark-max-step ${benchmark_max_step}
else else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \ python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \ --ngpu ${ngpu} \
--seed ${seed} \ --seed ${seed} \
--config ${config_path} \ --config ${config_path} \
......
...@@ -2,10 +2,11 @@ ...@@ -2,10 +2,11 @@
set -e set -e
source path.sh source path.sh
gpus=0 gpus=0
stage=0 stage=0
stop_stage=50 stop_stage=50
conf_path=conf/transformer.yaml conf_path=conf/transformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml decode_conf_path=conf/tuning/decode.yaml
avg_num=1 avg_num=1
...@@ -22,7 +23,7 @@ fi ...@@ -22,7 +23,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir # train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
...@@ -14,9 +14,9 @@ ...@@ -14,9 +14,9 @@
import argparse import argparse
import paddle import paddle
from paddleaudio.datasets.voxceleb import VoxCeleb
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.audio.datasets.voxceleb import VoxCeleb
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.io.augment import build_augment_pipeline from paddlespeech.vector.io.augment import build_augment_pipeline
from paddlespeech.vector.training.seeding import seed_everything from paddlespeech.vector.training.seeding import seed_everything
......
...@@ -21,9 +21,9 @@ import os ...@@ -21,9 +21,9 @@ import os
from typing import List from typing import List
import tqdm import tqdm
from paddleaudio import load as load_audio
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.audio import load as load_audio
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.utils.vector_utils import get_chunks from paddlespeech.vector.utils.vector_utils import get_chunks
......
...@@ -22,9 +22,9 @@ import os ...@@ -22,9 +22,9 @@ import os
import random import random
import tqdm import tqdm
from paddleaudio import load as load_audio
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.audio import load as load_audio
from paddlespeech.s2t.utils.log import Log from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.utils.vector_utils import get_chunks from paddlespeech.vector.utils.vector_utils import get_chunks
......
...@@ -16,8 +16,8 @@ import os ...@@ -16,8 +16,8 @@ import os
from typing import List from typing import List
from typing import Tuple from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset from .dataset import AudioClassificationDataset
__all__ = ['ESC50'] __all__ = ['ESC50']
......
...@@ -17,8 +17,8 @@ import random ...@@ -17,8 +17,8 @@ import random
from typing import List from typing import List
from typing import Tuple from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset from .dataset import AudioClassificationDataset
__all__ = ['GTZAN'] __all__ = ['GTZAN']
......
...@@ -17,8 +17,8 @@ import random ...@@ -17,8 +17,8 @@ import random
from typing import List from typing import List
from typing import Tuple from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset from .dataset import AudioClassificationDataset
__all__ = ['TESS'] __all__ = ['TESS']
......
...@@ -16,8 +16,8 @@ import os ...@@ -16,8 +16,8 @@ import os
from typing import List from typing import List
from typing import Tuple from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K'] __all__ = ['UrbanSound8K']
......
...@@ -11,13 +11,11 @@ ...@@ -11,13 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from ...cli.utils import DATA_HOME
from ...cli.utils import MODEL_HOME
from .download import decompress from .download import decompress
from .download import download_and_decompress from .download import download_and_decompress
from .download import load_state_dict_from_url from .download import load_state_dict_from_url
from .env import DATA_HOME
from .env import MODEL_HOME
from .env import PPAUDIO_HOME
from .env import USER_HOME
from .error import ParameterError from .error import ParameterError
from .log import Logger from .log import Logger
from .log import logger from .log import logger
......
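With this change the cache-path constants are defined in `paddlespeech.audio.utils.env` and re-exported here, rather than imported back from `paddlespeech.cli.utils`, presumably to break the circular import noted later in this commit. A sketch of the resulting import surface (the layout comment is an assumption, not part of the diff):

# All four names are re-exported by paddlespeech.audio.utils per this diff.
from paddlespeech.audio.utils import DATA_HOME, MODEL_HOME, PPAUDIO_HOME, USER_HOME

# Assumed layout: DATA_HOME and MODEL_HOME resolve under PPAUDIO_HOME,
# itself a dot-directory under USER_HOME.
print(USER_HOME, PPAUDIO_HOME, DATA_HOME, MODEL_HOME)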
...@@ -83,6 +83,12 @@ class ASRExecutor(BaseExecutor): ...@@ -83,6 +83,12 @@ class ASRExecutor(BaseExecutor):
'attention_rescoring' 'attention_rescoring'
], ],
help='only support transformer and conformer model') help='only support transformer and conformer model')
self.parser.add_argument(
'--num_decoding_left_chunks',
'-num_left',
type=int,
default=-1,
help='only support transformer and conformer online model')
self.parser.add_argument( self.parser.add_argument(
'--ckpt_path', '--ckpt_path',
type=str, type=str,
...@@ -122,6 +128,7 @@ class ASRExecutor(BaseExecutor): ...@@ -122,6 +128,7 @@ class ASRExecutor(BaseExecutor):
sample_rate: int=16000, sample_rate: int=16000,
cfg_path: Optional[os.PathLike]=None, cfg_path: Optional[os.PathLike]=None,
decode_method: str='attention_rescoring', decode_method: str='attention_rescoring',
num_decoding_left_chunks: int=-1,
ckpt_path: Optional[os.PathLike]=None): ckpt_path: Optional[os.PathLike]=None):
""" """
Init model and other resources from a specific path. Init model and other resources from a specific path.
...@@ -179,6 +186,9 @@ class ASRExecutor(BaseExecutor): ...@@ -179,6 +186,9 @@ class ASRExecutor(BaseExecutor):
elif "conformer" in model_type or "transformer" in model_type: elif "conformer" in model_type or "transformer" in model_type:
self.config.decode.decoding_method = decode_method self.config.decode.decoding_method = decode_method
if num_decoding_left_chunks is not None:
assert num_decoding_left_chunks == -1 or num_decoding_left_chunks >= 0, "num_decoding_left_chunks should be -1 or >=0"
self.config.num_decoding_left_chunks = num_decoding_left_chunks
else: else:
raise Exception("wrong type") raise Exception("wrong type")
...@@ -451,6 +461,7 @@ class ASRExecutor(BaseExecutor): ...@@ -451,6 +461,7 @@ class ASRExecutor(BaseExecutor):
config: os.PathLike=None, config: os.PathLike=None,
ckpt_path: os.PathLike=None, ckpt_path: os.PathLike=None,
decode_method: str='attention_rescoring', decode_method: str='attention_rescoring',
num_decoding_left_chunks: int=-1,
force_yes: bool=False, force_yes: bool=False,
rtf: bool=False, rtf: bool=False,
device=paddle.get_device()): device=paddle.get_device()):
...@@ -460,7 +471,7 @@ class ASRExecutor(BaseExecutor): ...@@ -460,7 +471,7 @@ class ASRExecutor(BaseExecutor):
audio_file = os.path.abspath(audio_file) audio_file = os.path.abspath(audio_file)
paddle.set_device(device) paddle.set_device(device)
self._init_from_path(model, lang, sample_rate, config, decode_method, self._init_from_path(model, lang, sample_rate, config, decode_method,
ckpt_path) num_decoding_left_chunks, ckpt_path)
if not self._check(audio_file, sample_rate, force_yes): if not self._check(audio_file, sample_rate, force_yes):
sys.exit(-1) sys.exit(-1)
if rtf: if rtf:
......
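A sketch of exercising the new parameter end to end, assuming the usual executor import path; the audio file name is illustrative, and the model is whichever streaming-capable default the executor selects:

from paddlespeech.cli.asr.infer import ASRExecutor  # import path assumed

asr = ASRExecutor()
# -1 (the default) keeps the full left context; a value >= 0 caps how many
# left chunks the streaming decoder may attend to, per the assert above.
text = asr(
    audio_file='input_16k.wav',
    num_decoding_left_chunks=16,
    force_yes=True)
print(text)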
...@@ -21,12 +21,12 @@ from typing import Union ...@@ -21,12 +21,12 @@ from typing import Union
import numpy as np import numpy as np
import paddle import paddle
import yaml import yaml
from paddleaudio import load
from paddleaudio.features import LogMelSpectrogram
from ..executor import BaseExecutor from ..executor import BaseExecutor
from ..log import logger from ..log import logger
from ..utils import stats_wrapper from ..utils import stats_wrapper
from paddlespeech.audio import load
from paddlespeech.audio.features import LogMelSpectrogram
__all__ = ['CLSExecutor'] __all__ = ['CLSExecutor']
......
...@@ -24,8 +24,8 @@ from typing import Any ...@@ -24,8 +24,8 @@ from typing import Any
from typing import Dict from typing import Dict
import paddle import paddle
import paddleaudio
import requests import requests
import soundfile as sf
import yaml import yaml
from paddle.framework import load from paddle.framework import load
...@@ -190,6 +190,7 @@ def _get_sub_home(directory): ...@@ -190,6 +190,7 @@ def _get_sub_home(directory):
PPSPEECH_HOME = _get_paddlespcceh_home() PPSPEECH_HOME = _get_paddlespcceh_home()
MODEL_HOME = _get_sub_home('models') MODEL_HOME = _get_sub_home('models')
CONF_HOME = _get_sub_home('conf') CONF_HOME = _get_sub_home('conf')
DATA_HOME = _get_sub_home('datasets')
def _md5(text: str): def _md5(text: str):
...@@ -281,7 +282,8 @@ def _note_one_stat(cls_name, params={}): ...@@ -281,7 +282,8 @@ def _note_one_stat(cls_name, params={}):
if 'audio_file' in params: if 'audio_file' in params:
try: try:
_, sr = paddleaudio.load(params['audio_file']) # recursive import caused by: utils.DATA_HOME
_, sr = sf.read(params['audio_file'])
except Exception: except Exception:
sr = -1 sr = -1
......
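The swap works because `soundfile.read`, like `paddleaudio.load`, returns a `(data, samplerate)` pair, and importing `soundfile` directly keeps `paddleaudio` (and its `utils.DATA_HOME`) out of this module. For reference (file name illustrative):

import soundfile as sf

data, sr = sf.read('demo.wav')  # sr is the file's native sample rate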
...@@ -22,13 +22,13 @@ from typing import Union ...@@ -22,13 +22,13 @@ from typing import Union
import paddle import paddle
import soundfile import soundfile
from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram
from yacs.config import CfgNode from yacs.config import CfgNode
from ..executor import BaseExecutor from ..executor import BaseExecutor
from ..log import logger from ..log import logger
from ..utils import stats_wrapper from ..utils import stats_wrapper
from paddlespeech.audio.backends import load as load_audio
from paddlespeech.audio.compliance.librosa import melspectrogram
from paddlespeech.vector.io.batch import feature_normalize from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification from paddlespeech.vector.modules.sid_model import SpeakerIdetification
......
...@@ -16,11 +16,12 @@ import os ...@@ -16,11 +16,12 @@ import os
import numpy as np import numpy as np
from paddle import inference from paddle import inference
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from scipy.special import softmax from scipy.special import softmax
from paddlespeech.audio.backends import load as load_audio
from paddlespeech.audio.datasets import ESC50
from paddlespeech.audio.features import melspectrogram
# yapf: disable # yapf: disable
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.") parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
......
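Taken together, these hunks move downstream code from the standalone `paddleaudio` package onto `paddlespeech.audio`. A minimal sketch of the post-migration usage, mirroring the imports above (the file name is illustrative, and the melspectrogram keyword is assumed to follow the librosa-style API):

from paddlespeech.audio.backends import load as load_audio
from paddlespeech.audio.compliance.librosa import melspectrogram

wav, sr = load_audio('demo.wav')
feat = melspectrogram(wav, sr=sr)  # keyword name assumed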
This diff is collapsed.