提交 d46cee6a 编写于 作者: Y Yang Zhou

Merge branch 'develop' of github.com:SmileGoat/PaddleSpeech into audio_dev

......@@ -39,6 +39,9 @@ tools/env.sh
tools/openfst-1.8.1/
tools/libsndfile/
tools/python-soundfile/
tools/onnx
tools/onnxruntime
tools/Paddle2ONNX
speechx/fc_patch/
......
......@@ -52,7 +52,7 @@ pull_request_rules:
add: ["T2S"]
- name: "auto add label=Audio"
conditions:
- files~=^paddleaudio/
- files~=^paddlespeech/audio/
actions:
label:
add: ["Audio"]
......@@ -100,7 +100,7 @@ pull_request_rules:
add: ["README"]
- name: "auto add label=Documentation"
conditions:
- files~=^(docs/|CHANGELOG.md|paddleaudio/CHANGELOG.md)
- files~=^(docs/|CHANGELOG.md)
actions:
label:
add: ["Documentation"]
......
# Changelog
Date: 2022-3-15, Author: Xiaojie Chen.
- kaldi and librosa mfcc, fbank, spectrogram.
- unit test and benchmark.
Date: 2022-2-25, Author: Hui Zhang.
- Refactor architecture.
- dtw distance and mcd style dtw.
# PaddleAudio
PaddleAudio is an audio library for PaddlePaddle.
## Install
`pip install .`
# Minimal makefile for Sphinx documentation
#
# You can set these variables from the command line.
SPHINXOPTS =
SPHINXBUILD = sphinx-build
SOURCEDIR = source
BUILDDIR = build
# Put it first so that "make" without argument is like "make help".
help:
@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
.PHONY: help Makefile
# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
# Build docs for PaddleAudio
Execute the following steps in **current directory**.
## 1. Install
`pip install Sphinx sphinx_rtd_theme`
## 2. Generate API docs
Generate API docs from doc string.
`sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils --templatedir source/_templates`
## 3. Build
`sphinx-build source _html`
## 4. Preview
Open `_html/index.html` for page preview.
@ECHO OFF
pushd %~dp0
REM Command file for Sphinx documentation
if "%SPHINXBUILD%" == "" (
set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=source
set BUILDDIR=build
if "%1" == "" goto help
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
echo.
echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
echo.installed, then set the SPHINXBUILD environment variable to point
echo.to the full path of the 'sphinx-build' executable. Alternatively you
echo.may add the Sphinx directory to PATH.
echo.
echo.If you don't have Sphinx installed, grab it from
echo.http://sphinx-doc.org/
exit /b 1
)
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end
:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
:end
popd
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
'''
This module is used to store environmental variables in PaddleAudio.
PPAUDIO_HOME --> the root directory for storing PaddleAudio related data. Default to ~/.paddleaudio. Users can change the
├ default value through the PPAUDIO_HOME environment variable.
├─ MODEL_HOME --> Store model files.
└─ DATA_HOME --> Store automatically downloaded datasets.
'''
import os
__all__ = [
'USER_HOME',
'PPAUDIO_HOME',
'MODEL_HOME',
'DATA_HOME',
]
def _get_user_home():
return os.path.expanduser('~')
def _get_ppaudio_home():
if 'PPAUDIO_HOME' in os.environ:
home_path = os.environ['PPAUDIO_HOME']
if os.path.exists(home_path):
if os.path.isdir(home_path):
return home_path
else:
raise RuntimeError(
'The environment variable PPAUDIO_HOME {} is not a directory.'.
format(home_path))
else:
return home_path
return os.path.join(_get_user_home(), '.paddleaudio')
def _get_sub_home(directory):
home = os.path.join(_get_ppaudio_home(), directory)
if not os.path.exists(home):
os.makedirs(home)
return home
USER_HOME = _get_user_home()
PPAUDIO_HOME = _get_ppaudio_home()
MODEL_HOME = _get_sub_home('models')
DATA_HOME = _get_sub_home('datasets')
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import setuptools
from setuptools.command.install import install
from setuptools.command.test import test
# set the version here
VERSION = '0.0.0'
# Inspired by the example at https://pytest.org/latest/goodpractises.html
class TestCommand(test):
def finalize_options(self):
test.finalize_options(self)
self.test_args = []
self.test_suite = True
def run(self):
self.run_benchmark()
super(TestCommand, self).run()
def run_tests(self):
# Run nose ensuring that argv simulates running nosetests directly
import nose
nose.run_exit(argv=['nosetests', '-w', 'tests'])
def run_benchmark(self):
for benchmark_item in glob.glob('tests/benchmark/*py'):
os.system(f'pytest {benchmark_item}')
class InstallCommand(install):
def run(self):
install.run(self)
def write_version_py(filename='paddleaudio/__init__.py'):
with open(filename, "a") as f:
f.write(f"__version__ = '{VERSION}'")
def remove_version_py(filename='paddleaudio/__init__.py'):
with open(filename, "r") as f:
lines = f.readlines()
with open(filename, "w") as f:
for line in lines:
if "__version__" not in line:
f.write(line)
remove_version_py()
write_version_py()
setuptools.setup(
name="paddleaudio",
version=VERSION,
author="",
author_email="",
description="PaddleAudio, in development",
long_description="",
long_description_content_type="text/markdown",
url="",
packages=setuptools.find_packages(include=['paddleaudio*']),
classifiers=[
"Programming Language :: Python :: 3",
"License :: OSI Approved :: MIT License",
"Operating System :: OS Independent",
],
python_requires='>=3.6',
install_requires=[
'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
'soundfile >= 0.9.0', 'colorlog', 'pathos == 0.2.8'
],
extras_require={
'test': [
'nose', 'librosa==0.8.1', 'soundfile==0.10.3.post1',
'torchaudio==0.10.2', 'pytest-benchmark'
],
},
cmdclass={
'install': InstallCommand,
'test': TestCommand,
}, )
remove_version_py()
......@@ -89,7 +89,7 @@ Then to start the system server, and it provides HTTP backend services.
Then start the server with Fastapi.
```bash
export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
export PYTHONPATH=$PYTHONPATH:./src
python src/audio_search.py
```
......
......@@ -91,7 +91,7 @@ ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…"
启动用 Fastapi 构建的服务
```bash
export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
export PYTHONPATH=$PYTHONPATH:./src
python src/audio_search.py
```
......
......@@ -33,6 +33,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
```bash
# in PaddleSpeech/demos/streaming_asr_server start the service
paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
# if you want to increase decoding speed, you can use the config file below, it will increase decoding speed and reduce accuracy
paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application_faster.yaml
```
Usage:
......
......@@ -21,7 +21,7 @@
下载好 `PaddleSpeech` 之后,进入到 `PaddleSpeech/demos/streaming_asr_server` 目录。
配置文件可参见该目录下 `conf/ws_application.yaml``conf/ws_conformer_wenetspeech_application.yaml`
目前服务集成的模型有: DeepSpeech2和 conformer模型,对应的配置文件如下:
目前服务集成的模型有: DeepSpeech2 和 conformer模型,对应的配置文件如下:
* DeepSpeech: `conf/ws_application.yaml`
* conformer: `conf/ws_conformer_wenetspeech_application.yaml`
......@@ -40,6 +40,8 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
```bash
# 在 PaddleSpeech/demos/streaming_asr_server 目录启动服务
paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application.yaml
# 你如果愿意为了增加解码的速度而牺牲一定的模型精度,你可以使用如下的脚本
paddlespeech_server start --config_file ./conf/ws_conformer_wenetspeech_application_faster.yaml
```
使用方法:
......
......@@ -28,6 +28,7 @@ asr_online:
sample_rate: 16000
cfg_path:
decode_method:
num_decoding_left_chunks: -1
force_yes: True
device: 'cpu' # cpu or gpu:id
decode_method: "attention_rescoring"
......
......@@ -32,7 +32,7 @@ asr_online:
device: 'cpu' # cpu or gpu:id
decode_method: "attention_rescoring"
continuous_decoding: True # enable continue decoding when endpoint detected
num_decoding_left_chunks: -1
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
......
......@@ -7,8 +7,8 @@ host: 0.0.0.0
port: 8090
# The task format in the engin_list is: <speech task>_<engine type>
# task choices = ['asr_online', 'tts_online']
# protocol = ['websocket', 'http'] (only one can be selected).
# task choices = ['asr_online']
# protocol = ['websocket'] (only one can be selected).
# websocket only support online engine type.
protocol: 'websocket'
engine_list: ['asr_online']
......@@ -21,7 +21,7 @@ engine_list: ['asr_online']
################################### ASR #########################################
################### speech task: asr; engine_type: online #######################
asr_online:
model_type: 'deepspeech2online_aishell'
model_type: 'conformer_online_wenetspeech'
am_model: # the pdmodel file of am static model [optional]
am_params: # the pdiparams file of am static model [optional]
lang: 'zh'
......@@ -29,8 +29,10 @@ asr_online:
cfg_path:
decode_method:
force_yes: True
device: # cpu or gpu:id
device: 'cpu' # cpu or gpu:id
decode_method: "attention_rescoring"
continuous_decoding: True # enable continue decoding when endpoint detected
num_decoding_left_chunks: 16
am_predictor_conf:
device: # set 'gpu:id' or 'cpu'
switch_ir_optim: True
......@@ -38,11 +40,9 @@ asr_online:
summary: True # False -> do not show predictor config
chunk_buffer_conf:
frame_duration_ms: 80
shift_ms: 40
sample_rate: 16000
sample_width: 2
window_n: 7 # frame
shift_n: 4 # frame
window_ms: 20 # ms
window_ms: 25 # ms
shift_ms: 10 # ms
sample_rate: 16000
sample_width: 2
......@@ -7,11 +7,11 @@ host: 0.0.0.0
port: 8090
# The task format in the engin_list is: <speech task>_<engine type>
# task choices = ['asr_online']
# task choices = ['asr_online-inference', 'asr_online-onnx']
# protocol = ['websocket'] (only one can be selected).
# websocket only support online engine type.
protocol: 'websocket'
engine_list: ['asr_online']
engine_list: ['asr_online-onnx']
#################################################################################
......@@ -19,15 +19,16 @@ engine_list: ['asr_online']
#################################################################################
################################### ASR #########################################
################### speech task: asr; engine_type: online #######################
asr_online:
model_type: 'deepspeech2online_aishell'
am_model: # the pdmodel file of am static model [optional]
am_params: # the pdiparams file of am static model [optional]
################### speech task: asr; engine_type: online-inference #######################
asr_online-inference:
model_type: 'deepspeech2online_wenetspeech'
am_model: # the pdmodel file of am static model [optional]
am_params: # the pdiparams file of am static model [optional]
lang: 'zh'
sample_rate: 16000
cfg_path:
decode_method:
num_decoding_left_chunks:
force_yes: True
device: 'cpu' # cpu or gpu:id
......@@ -37,6 +38,41 @@ asr_online:
glog_info: False # True -> print glog
summary: True # False -> do not show predictor config
chunk_buffer_conf:
frame_duration_ms: 85
shift_ms: 40
sample_rate: 16000
sample_width: 2
window_n: 7 # frame
shift_n: 4 # frame
window_ms: 25 # ms
shift_ms: 10 # ms
################################### ASR #########################################
################### speech task: asr; engine_type: online-onnx #######################
asr_online-onnx:
model_type: 'deepspeech2online_wenetspeech'
am_model: # the pdmodel file of onnx am static model [optional]
am_params: # the pdiparams file of am static model [optional]
lang: 'zh'
sample_rate: 16000
cfg_path:
decode_method:
num_decoding_left_chunks:
force_yes: True
device: 'cpu' # cpu or gpu:id
# https://onnxruntime.ai/docs/api/python/api_summary.html#inferencesession
am_predictor_conf:
device: 'cpu' # set 'gpu:id' or 'cpu'
graph_optimization_level: 0
intra_op_num_threads: 0 # Sets the number of threads used to parallelize the execution within nodes.
inter_op_num_threads: 0 # Sets the number of threads used to parallelize the execution of the graph (across nodes).
log_severity_level: 2 # Log severity level. Applies to session load, initialization, etc. 0:Verbose, 1:Info, 2:Warning. 3:Error, 4:Fatal. Default is 2.
log_verbosity_level: 0 # VLOG level if DEBUG build and session_log_severity_level is 0. Applies to session load, initialization, etc. Default is 0.
chunk_buffer_conf:
frame_duration_ms: 80
shift_ms: 40
......@@ -44,5 +80,5 @@ asr_online:
sample_width: 2
window_n: 7 # frame
shift_n: 4 # frame
window_ms: 20 # ms
window_ms: 25 # ms
shift_ms: 10 # ms
#!/usr/bin/env python3
import argparse
if __name__ == '__main__':
parser = argparse.ArgumentParser(prog=__doc__)
parser.add_argument(
'--logfile', type=str, required=True, help='ws client log file')
args = parser.parse_args()
rtfs = []
with open(args.logfile, 'r') as f:
for line in f:
if 'RTF=' in line:
# udio duration: 6.126, elapsed time: 3.471978187561035, RTF=0.5667610492264177
line = line.strip()
beg = line.index("audio")
line = line[beg:]
items = line.split(',')
vals = []
for elem in items:
if "RTF=" in elem:
continue
_, val = elem.split(":")
vals.append(eval(val))
keys = ['T', 'P']
meta = dict(zip(keys, vals))
rtfs.append(meta)
T = 0.0
P = 0.0
n = 0
for m in rtfs:
n += 1
T += m['T']
P += m['P']
print(f"RTF: {P/T}, utts: {n}")
#!/bin/bash
if [ $# != 1 ];then
echo "usage: $0 wav_scp"
exit -1
fi
scp=$1
# calc RTF
# wav_scp can generate from `speechx/examples/ds2_ol/aishell`
exp=exp
mkdir -p $exp
python3 local/websocket_client.py --server_ip 127.0.0.1 --port 8090 --wavscp $scp &> $exp/log.rsl
python3 local/rtf_from_log.py --logfile $exp/log.rsl
\ No newline at end of file
#!/usr/bin/python
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
......@@ -11,9 +12,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
# script for calc RTF: grep -rn RTF log.txt | awk '{print $NF}' | awk -F "=" '{sum += $NF} END {print "all time",sum, "audio num", NR, "RTF", sum/NR}'
# calc avg RTF(NOT Accurate): grep -rn RTF log.txt | awk '{print $NF}' | awk -F "=" '{sum += $NF} END {print "all time",sum, "audio num", NR, "RTF", sum/NR}'
# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav
import argparse
import asyncio
import codecs
......
......@@ -4,6 +4,6 @@ export CUDA_VISIBLE_DEVICE=0,1,2,3
# nohup python3 punc_server.py --config_file conf/punc_application.yaml > punc.log 2>&1 &
paddlespeech_server start --config_file conf/punc_application.yaml &> punc.log &
# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_application.yaml > streaming_asr.log 2>&1 &
paddlespeech_server start --config_file conf/ws_conformer_application.yaml &> streaming_asr.log &
# nohup python3 streaming_asr_server.py --config_file conf/ws_conformer_wenetspeech_application.yaml > streaming_asr.log 2>&1 &
paddlespeech_server start --config_file conf/ws_conformer_wenetspeech_application.yaml &> streaming_asr.log &
......@@ -3,11 +3,9 @@ wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav
# read the wav and pass it to only streaming asr service
# If `127.0.0.1` is not accessible, you need to use the actual service IP address.
# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --wavfile ./zh.wav
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --input ./zh.wav
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8090 --input ./zh.wav
# read the wav and call streaming and punc service
# If `127.0.0.1` is not accessible, you need to use the actual service IP address.
# python3 websocket_client.py --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --wavfile ./zh.wav
paddlespeech_client asr_online --server_ip 127.0.0.1 --port 8290 --punc.server_ip 127.0.0.1 --punc.port 8190 --input ./zh.wav
# Customize Dataset for Audio Classification
Following this tutorial you can customize your dataset for audio classification task by using `paddlespeech` and `paddleaudio`.
Following this tutorial you can customize your dataset for audio classification task by using `paddlespeech`.
A base class of classification dataset is `paddleaudio.dataset.AudioClassificationDataset`. To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`.
A base class of classification dataset is `paddlespeech.audio.dataset.AudioClassificationDataset`. To customize your dataset you should write a dataset class derived from `AudioClassificationDataset`.
Assuming you have some wave files that stored in your own directory. You should prepare a meta file with the information of filepaths and labels. For example the absolute path of it is `/PATH/TO/META_FILE.txt`:
```
......@@ -14,7 +14,7 @@ Assuming you have some wave files that stored in your own directory. You should
Here is an example to build your custom dataset in `custom_dataset.py`:
```python
from paddleaudio.datasets.dataset import AudioClassificationDataset
from paddlespeech.audio.datasets.dataset import AudioClassificationDataset
class CustomDataset(AudioClassificationDataset):
meta_file = '/PATH/TO/META_FILE.txt'
......@@ -48,7 +48,7 @@ class CustomDataset(AudioClassificationDataset):
Then you can build dataset and data loader from `CustomDataset`:
```python
import paddle
from paddleaudio.features import LogMelSpectrogram
from paddlespeech.audio.features import LogMelSpectrogram
from custom_dataset import CustomDataset
......
......@@ -6,15 +6,15 @@
### Speech Recognition Model
Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER | Hours of speech | Example Link
:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----:
[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.0a.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM) <br> 0.2417 (test\_meeting, w/o LM) <br> 0.053 (aishell, w/ LM) |-| 10000 h |-
[Ds2 Online Wenetspeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr0/asr0_deepspeech2_online_wenetspeech_ckpt_1.0.2.model.tar.gz) | Wenetspeech Dataset | Char-based | 1.2 GB | 2 Conv + 5 LSTM layers | 0.152 (test\_net, w/o LM) <br> 0.2417 (test\_meeting, w/o LM) <br> 0.053 (aishell, w/ LM) |-| 10000 h |-
[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_fbank161_ckpt_0.2.1.model.tar.gz) | Aishell Dataset | Char-based | 491 MB | 2 Conv + 5 LSTM layers | 0.0666 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_offline_aishell_ckpt_1.0.1.model.tar.gz)| Aishell Dataset | Char-based | 1.4 GB | 2 Conv + 5 bidirectional LSTM layers| 0.0554 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
[Conformer Online Wenetspeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1/asr1_chunk_conformer_wenetspeech_ckpt_1.0.0a.model.tar.gz) | WenetSpeech Dataset | Char-based | 457 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.11 (test\_net) 0.1879 (test\_meeting) |-| 10000 h |-
[Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.2.0.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring| 0.0544 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1)
[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0464 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz)| Librispeech Dataset | Char-based | 518 MB | 2 Conv + 3 bidirectional LSTM layers| - |0.0725| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0)
[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1)
[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_offline_librispeech_ckpt_1.0.1.model.tar.gz)| Librispeech Dataset | Char-based | 1.3 GB | 2 Conv + 5 bidirectional LSTM layers| - |0.0467| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0)
[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0338 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1)
[Transformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0381 | 960 h | [Transformer Librispeech ASR1](../../examples/librispeech/asr1)
[Transformer Librispeech ASR2 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr2/asr2_transformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 131 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: JoinCTC w/ LM |-| 0.0240 | 960 h | [Transformer Librispeech ASR2](../../examples/librispeech/asr2)
......
......@@ -12,7 +12,8 @@
## Deepspeech2 Non-Streaming
| Model | Number of Params | Release | Config | Test set | Valid Loss | CER |
| --- | --- | --- | --- | --- | --- | --- |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 122.3M | r1.0.1 | conf/deepspeech2.yaml + U2 Data pipline and spec aug + fbank161 | test | 5.780756044387817 | 0.055400 |
| DeepSpeech2 | 58.4M | v2.2.0 | conf/deepspeech2.yaml + spec aug | test | 5.738585948944092 | 0.064000 |
| DeepSpeech2 | 58.4M | v2.1.0 | conf/deepspeech2.yaml + spec aug | test | 7.483316898345947 | 0.077860 |
| DeepSpeech2 | 58.4M | v2.1.0 | conf/deepspeech2.yaml | test | 7.299022197723389 | 0.078671 |
......
#!/bin/bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
......@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0
stop_stage=100
conf_path=conf/deepspeech2.yaml #conf/deepspeech2.yaml or conf/deepspeech2_online.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=10
audio_file=data/demo_01_03.wav
......@@ -24,7 +25,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
......@@ -17,13 +17,21 @@ if [ ${seed} != 0 ]; then
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
echo ${ips_config}
mkdir -p exp
......@@ -37,7 +45,7 @@ python3 -u ${BIN_DIR}/train.py \
--benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--seed ${seed} \
--config ${config_path} \
......
......@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0
stop_stage=50
conf_path=conf/conformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=30
audio_file=data/demo_01_03.wav
......@@ -23,7 +24,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
#! /usr/bin/env bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
......@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
echo "using ${device}..."
......@@ -28,7 +35,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0
stop_stage=50
conf_path=conf/conformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=20
......@@ -22,7 +23,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
data:
dataset: 'paddleaudio.datasets:ESC50'
dataset: 'paddlespeech.audio.datasets:ESC50'
num_classes: 50
train:
mode: 'train'
......
......@@ -2,7 +2,7 @@
###########################################
# Data #
###########################################
dataset: 'paddleaudio.datasets:HeySnips'
dataset: 'paddlespeech.audio.datasets:HeySnips'
data_dir: '/PATH/TO/DATA/hey_snips_research_6k_en_train_eval_clean_ter'
############################################
......
......@@ -3,6 +3,7 @@
## Deepspeech2 Non-Streaming
| Model | Params | release | Config | Test set | Loss | WER |
| --- | --- | --- | --- | --- | --- | --- |
| DeepSpeech2 | 113.96M | r1.0.1 | conf/deepspeech2.yaml + U2 Data pipline and spec aug + fbank161 | test-clean | 10.76069622039795 | 0.046700 |
| DeepSpeech2 | 42.96M | 2.2.0 | conf/deepspeech2.yaml + spec_aug | test-clean | 14.49190807 | 0.067283 |
| DeepSpeech2 | 42.96M | 2.1.0 | conf/deepspeech2.yaml | test-clean | 15.184467315673828 | 0.072154 |
| DeepSpeech2 | 42.96M | 2.0.0 | conf/deepspeech2.yaml | test-clean | - | 0.073973 |
......
#!/bin/bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
......@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0
stop_stage=100
conf_path=conf/deepspeech2.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=5
audio_file=data/demo_002_en.wav
......@@ -23,7 +24,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
......@@ -42,6 +42,11 @@ echo "chunk mode ${chunk_mode}"
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# format the reference test file
python3 utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref data/manifest.test-clean.text
for type in attention; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
......@@ -63,54 +68,90 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
for type in ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only support batchsize=1
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
fi
for type in ctc_greedy_search; do
echo "decoding ${type}"
if [ ${chunk_mode} == true ];then
# stream decoding only support batchsize=1
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
else
batch_size=64
fi
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
echo "decoding ${type} done."
done
for type in ctc_prefix_beam_search attention_rescoring; do
echo "decoding ${type}"
batch_size=1
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
echo "decoding ${type} done."
done
python3 -u ${BIN_DIR}/test.py \
--ngpu ${ngpu} \
--config ${config_path} \
--decode_cfg ${decode_config_path} \
--result_file ${ckpt_prefix}.${type}.rsl \
--checkpoint_path ${ckpt_prefix} \
--opts decode.decoding_method ${type} \
--opts decode.decode_batch_size ${batch_size}
if [ $? -ne 0 ]; then
echo "Failed in evaluation!"
exit 1
fi
python3 utils/format_rsl.py \
--origin_hyp ${ckpt_prefix}.${type}.rsl \
--trans_hyp ${ckpt_prefix}.${type}.rsl.text
python3 utils/compute-wer.py --char=1 --v=1 \
data/manifest.test-clean.text ${ckpt_prefix}.${type}.rsl.text > ${ckpt_prefix}.${type}.error
echo "decoding ${type} done."
done
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
python3 utils/format_rsl.py \
--origin_ref data/manifest.test-clean.raw \
--trans_ref_sclite data/manifest.test.text-clean.sclite
output_dir=${ckpt_prefix}
for type in attention ctc_greedy_search ctc_prefix_beam_search attention_rescoring; do
python utils/format_rsl.py \
--origin_hyp ${output_dir}/${type}.rsl \
--trans_hyp_sclite ${output_dir}/${type}.rsl.text.sclite
mkdir -p ${output_dir}/${type}_sclite
sclite -i wsj -r data/manifest.test-clean.text.sclite -h ${output_dir}/${type}.rsl.text.sclite -e utf-8 -o all -O ${output_dir}/${type}_sclite -c NOASCII
done
fi
echo "Finished"
......
#!/bin/bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
......@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -29,7 +36,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -8,6 +8,7 @@ gpus=0,1,2,3
stage=0
stop_stage=50
conf_path=conf/transformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=30
audio_file=data/demo_002_en.wav
......@@ -25,7 +26,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
#!/bin/bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
......@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -27,7 +34,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--model-name u2_kaldi \
--config ${config_path} \
......
......@@ -9,6 +9,7 @@ gpus=0,1,2,3,4,5,6,7
stage=0
stop_stage=50
conf_path=conf/transformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/decode/decode_base.yaml
dict_path=data/lang_char/train_960_unigram5000_units.txt
avg_num=10
......@@ -26,7 +27,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
#!/bin/bash
if [ $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
if [ $# -lt 3 ] && [ $# -gt 4 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path ips(optional)"
exit -1
fi
......@@ -11,6 +11,13 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ckpt_path=$3
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -21,12 +28,21 @@ if [ ${seed} != 0 ]; then
export FLAGS_cudnn_deterministic=True
fi
if [ ${ngpu} == 0 ]; then
python3 -u ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--checkpoint_path "${ckpt_path}" \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
--checkpoint_path "${ckpt_path}" \
--seed ${seed}
fi
if [ ${seed} != 0 ]; then
unset FLAGS_cudnn_deterministic
......
......@@ -7,6 +7,7 @@ gpus=0,1,2,3
stage=0
stop_stage=3
conf_path=conf/transformer_es.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
must_c_path=
lang=es
......@@ -25,7 +26,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......@@ -36,4 +37,4 @@ fi
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
# test ckpt avg_n
CUDA_VISIBLE_DEVICES=0 ./local/test.sh ${conf_path} ${decode_conf_path} exp/${ckpt}/checkpoints/${avg_ckpt} ${lang} || exit -1
fi
\ No newline at end of file
fi
#!/bin/bash
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
......@@ -10,6 +10,13 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -26,7 +33,7 @@ python3 -u ${BIN_DIR}/train.py \
--output exp/${ckpt_name} \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -6,6 +6,7 @@ gpus=0,1,2,3
stage=0
stop_stage=50
conf_path=conf/transformer_mtl_noam.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=5
data_path=./TED_EnZh # path to unzipped data
......@@ -23,7 +24,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
#!/bin/bash
if [ $# != 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ckpt_path"
if [ $# -lt 3 ] && [ $# -gt 4 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
......@@ -11,6 +11,15 @@ echo "using $ngpu gpus..."
config_path=$1
ckpt_name=$2
ckpt_path=$3
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
mkdir -p exp
......@@ -28,7 +37,7 @@ python3 -u ${BIN_DIR}/train.py \
--checkpoint_path "${ckpt_path}" \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -7,6 +7,7 @@ gpus=0,1,2,3
stage=1
stop_stage=4
conf_path=conf/transformer_mtl_noam.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
ckpt_path= # paddle.98 # (finetune from FAT-ST pretrained model)
avg_num=5
......@@ -29,7 +30,7 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Finetune from Pretrained Model" ${ckpt_path}
./local/download_pretrain.sh || exit -1
fi
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}"
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} "${ckpt_path}" ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
......@@ -15,13 +15,20 @@ if [ ${seed} != 0 ]; then
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -33,7 +40,7 @@ python3 -u ${BIN_DIR}/train.py \
--profiler-options "${profiler_options}" \
--seed ${seed}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--config ${config_path} \
--output exp/${ckpt_name} \
......
......@@ -2,10 +2,11 @@
set -e
source path.sh
gpus=0
gpus=4
stage=0
stop_stage=100
conf_path=conf/deepspeech2.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
......@@ -21,7 +22,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
......@@ -17,13 +17,20 @@ if [ ${seed} != 0 ]; then
echo "using seed $seed & FLAGS_cudnn_deterministic=True ..."
fi
if [ $# != 2 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name"
if [ $# -lt 2 ] && [ $# -gt 3 ];then
echo "usage: CUDA_VISIBLE_DEVICES=0 ${0} config_path ckpt_name ips(optional)"
exit -1
fi
config_path=$1
ckpt_name=$2
ips=$3
if [ ! $ips ];then
ips_config=
else
ips_config="--ips="${ips}
fi
mkdir -p exp
......@@ -37,7 +44,7 @@ python3 -u ${BIN_DIR}/train.py \
--benchmark-batch-size ${benchmark_batch_size} \
--benchmark-max-step ${benchmark_max_step}
else
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${BIN_DIR}/train.py \
python3 -m paddle.distributed.launch --gpus=${CUDA_VISIBLE_DEVICES} ${ips_config} ${BIN_DIR}/train.py \
--ngpu ${ngpu} \
--seed ${seed} \
--config ${config_path} \
......
......@@ -2,10 +2,11 @@
set -e
source path.sh
gpus=0
gpus=4
stage=0
stop_stage=50
conf_path=conf/transformer.yaml
ips= #xx.xx.xx.xx,xx.xx.xx.xx
decode_conf_path=conf/tuning/decode.yaml
avg_num=1
......@@ -22,7 +23,7 @@ fi
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# train model, all `ckpt` under `exp` dir
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt}
CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${ckpt} ${ips}
fi
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
......
......@@ -14,9 +14,9 @@
import argparse
import paddle
from paddleaudio.datasets.voxceleb import VoxCeleb
from yacs.config import CfgNode
from paddlespeech.audio.datasets.voxceleb import VoxCeleb
from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.io.augment import build_augment_pipeline
from paddlespeech.vector.training.seeding import seed_everything
......
......@@ -21,9 +21,9 @@ import os
from typing import List
import tqdm
from paddleaudio import load as load_audio
from yacs.config import CfgNode
from paddlespeech.audio import load as load_audio
from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.utils.vector_utils import get_chunks
......
......@@ -22,9 +22,9 @@ import os
import random
import tqdm
from paddleaudio import load as load_audio
from yacs.config import CfgNode
from paddlespeech.audio import load as load_audio
from paddlespeech.s2t.utils.log import Log
from paddlespeech.vector.utils.vector_utils import get_chunks
......
......@@ -16,8 +16,8 @@ import os
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['ESC50']
......
......@@ -17,8 +17,8 @@ import random
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['GTZAN']
......
......@@ -17,8 +17,8 @@ import random
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['TESS']
......
......@@ -16,8 +16,8 @@ import os
from typing import List
from typing import Tuple
from ..utils import DATA_HOME
from ..utils.download import download_and_decompress
from ..utils.env import DATA_HOME
from .dataset import AudioClassificationDataset
__all__ = ['UrbanSound8K']
......
......@@ -11,13 +11,11 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...cli.utils import DATA_HOME
from ...cli.utils import MODEL_HOME
from .download import decompress
from .download import download_and_decompress
from .download import load_state_dict_from_url
from .env import DATA_HOME
from .env import MODEL_HOME
from .env import PPAUDIO_HOME
from .env import USER_HOME
from .error import ParameterError
from .log import Logger
from .log import logger
......
......@@ -83,6 +83,12 @@ class ASRExecutor(BaseExecutor):
'attention_rescoring'
],
help='only support transformer and conformer model')
self.parser.add_argument(
'--num_decoding_left_chunks',
'-num_left',
type=str,
default=-1,
help='only support transformer and conformer online model')
self.parser.add_argument(
'--ckpt_path',
type=str,
......@@ -122,6 +128,7 @@ class ASRExecutor(BaseExecutor):
sample_rate: int=16000,
cfg_path: Optional[os.PathLike]=None,
decode_method: str='attention_rescoring',
num_decoding_left_chunks: int=-1,
ckpt_path: Optional[os.PathLike]=None):
"""
Init model and other resources from a specific path.
......@@ -179,6 +186,9 @@ class ASRExecutor(BaseExecutor):
elif "conformer" in model_type or "transformer" in model_type:
self.config.decode.decoding_method = decode_method
if num_decoding_left_chunks:
assert num_decoding_left_chunks == -1 or num_decoding_left_chunks >= 0, "num_decoding_left_chunks should be -1 or >=0"
self.config.num_decoding_left_chunks = num_decoding_left_chunks
else:
raise Exception("wrong type")
......@@ -451,6 +461,7 @@ class ASRExecutor(BaseExecutor):
config: os.PathLike=None,
ckpt_path: os.PathLike=None,
decode_method: str='attention_rescoring',
num_decoding_left_chunks: int=-1,
force_yes: bool=False,
rtf: bool=False,
device=paddle.get_device()):
......@@ -460,7 +471,7 @@ class ASRExecutor(BaseExecutor):
audio_file = os.path.abspath(audio_file)
paddle.set_device(device)
self._init_from_path(model, lang, sample_rate, config, decode_method,
ckpt_path)
num_decoding_left_chunks, ckpt_path)
if not self._check(audio_file, sample_rate, force_yes):
sys.exit(-1)
if rtf:
......
......@@ -21,12 +21,12 @@ from typing import Union
import numpy as np
import paddle
import yaml
from paddleaudio import load
from paddleaudio.features import LogMelSpectrogram
from ..executor import BaseExecutor
from ..log import logger
from ..utils import stats_wrapper
from paddlespeech.audio import load
from paddlespeech.audio.features import LogMelSpectrogram
__all__ = ['CLSExecutor']
......
......@@ -24,8 +24,8 @@ from typing import Any
from typing import Dict
import paddle
import paddleaudio
import requests
import soundfile as sf
import yaml
from paddle.framework import load
......@@ -190,6 +190,7 @@ def _get_sub_home(directory):
PPSPEECH_HOME = _get_paddlespcceh_home()
MODEL_HOME = _get_sub_home('models')
CONF_HOME = _get_sub_home('conf')
DATA_HOME = _get_sub_home('datasets')
def _md5(text: str):
......@@ -281,7 +282,8 @@ def _note_one_stat(cls_name, params={}):
if 'audio_file' in params:
try:
_, sr = paddleaudio.load(params['audio_file'])
# recursive import cased by: utils.DATA_HOME
_, sr = sf.read(params['audio_file'])
except Exception:
sr = -1
......
......@@ -22,13 +22,13 @@ from typing import Union
import paddle
import soundfile
from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram
from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
from ..utils import stats_wrapper
from paddlespeech.audio.backends import load as load_audio
from paddlespeech.audio.compliance.librosa import melspectrogram
from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification
......
......@@ -16,11 +16,12 @@ import os
import numpy as np
from paddle import inference
from paddleaudio.backends import load as load_audio
from paddleaudio.datasets import ESC50
from paddleaudio.features import melspectrogram
from scipy.special import softmax
from paddlespeech.audio.backends import load as load_audio
from paddlespeech.audio.datasets import ESC50
from paddlespeech.audio.features import melspectrogram
# yapf: disable
parser = argparse.ArgumentParser()
parser.add_argument("--model_dir", type=str, required=True, default="./export", help="The directory to static model.")
......
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册