Commit dcab04a7, authored by xiongxinlei

merge develop to server

@@ -33,6 +33,12 @@ tools/Miniconda3-latest-Linux-x86_64.sh
tools/activate_python.sh
tools/miniconda.sh
tools/CRF++-0.58/
+tools/liblbfgs-1.10/
+tools/srilm/
+tools/env.sh
+tools/openfst-1.8.1/
+tools/libsndfile/
+tools/python-soundfile/
speechx/fc_patch/
......
@@ -30,21 +30,8 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --tones_dict=dump/tone_id_map.txt
fi
-# style melgan
-# style melgan's Dygraph to Static Graph is not ready now
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    python3 ${BIN_DIR}/../inference.py \
-        --inference_dir=${train_output_path}/inference \
-        --am=speedyspeech_csmsc \
-        --voc=style_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=${train_output_path}/pd_infer_out \
-        --phones_dict=dump/phone_id_map.txt \
-        --tones_dict=dump/tone_id_map.txt
-fi
# hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=speedyspeech_csmsc \
......
@@ -231,14 +231,19 @@ Pretrained FastSpeech2 model with no silence in the edge of audios:
The static model can be downloaded here:
- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)
- [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)
+- [fastspeech2_cnndecoder_csmsc_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_static_1.0.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_static_1.0.0.zip)

The ONNX model can be downloaded here:
- [fastspeech2_csmsc_onnx_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_onnx_0.2.0.zip)
+- [fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_onnx_1.0.0.zip)
+- [fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_streaming_onnx_1.0.0.zip)

Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
default| 2(gpu) x 76000|1.0991|0.59132|0.035815|0.31915|0.15287|
conformer| 2(gpu) x 76000|1.0675|0.56103|0.035869|0.31553|0.15509|
+cnndecoder| 1(gpu) x 153000|1.1153|0.61475|0.03380|0.30414|0.14707|

FastSpeech2 checkpoint contains files listed below.
```text
......
@@ -5,6 +5,7 @@ train_output_path=$1
stage=0
stop_stage=0
+# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
@@ -27,20 +28,9 @@ if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
        --phones_dict=dump/phone_id_map.txt
fi
-# style melgan
-# style melgan's Dygraph to Static Graph is not ready now
-if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
-    python3 ${BIN_DIR}/../inference.py \
-        --inference_dir=${train_output_path}/inference \
-        --am=fastspeech2_csmsc \
-        --voc=style_melgan_csmsc \
-        --text=${BIN_DIR}/../sentences.txt \
-        --output_dir=${train_output_path}/pd_infer_out \
-        --phones_dict=dump/phone_id_map.txt
-fi
# hifigan
-if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
@@ -51,7 +41,7 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
fi
# wavernn
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    python3 ${BIN_DIR}/../inference.py \
        --inference_dir=${train_output_path}/inference \
        --am=fastspeech2_csmsc \
......
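These local scripts feed exported static graphs to Paddle Inference via `inference.py`. For orientation, a minimal sketch of that flow using the Paddle Inference Python API; the file names under `${train_output_path}/inference` and the single phone-id input are assumptions for illustration, not taken from this diff:

```python
import numpy as np
from paddle.inference import Config, create_predictor

# Hypothetical exported files; real names depend on the --am argument.
config = Config("inference/fastspeech2_csmsc.pdmodel",
                "inference/fastspeech2_csmsc.pdiparams")
predictor = create_predictor(config)

# Feed a toy phone-id sequence and read back the predicted mel spectrogram.
phone_ids = np.array([[1, 2, 3, 4]], dtype=np.int64)
input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
input_handle.copy_from_cpu(phone_ids)
predictor.run()
mel = predictor.get_output_handle(predictor.get_output_names()[0]).copy_to_cpu()
print(mel.shape)
```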
#!/bin/bash

train_output_path=$1
stage=0
stop_stage=0

# pwgan
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../inference_streaming.py \
        --inference_dir=${train_output_path}/inference_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=pwgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# for more GAN Vocoders
# multi band melgan
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
    python3 ${BIN_DIR}/../inference_streaming.py \
        --inference_dir=${train_output_path}/inference_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=mb_melgan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi

# hifigan
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    python3 ${BIN_DIR}/../inference_streaming.py \
        --inference_dir=${train_output_path}/inference_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/pd_infer_out_streaming \
        --phones_dict=dump/phone_id_map.txt \
        --am_streaming=True
fi
train_output_path=$1
stage=0
stop_stage=0

# e2e, synthesize from text
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
    python3 ${BIN_DIR}/../ort_predict_streaming.py \
        --inference_dir=${train_output_path}/inference_onnx_streaming \
        --am=fastspeech2_csmsc \
        --am_stat=dump/train/speech_stats.npy \
        --voc=hifigan_csmsc \
        --output_dir=${train_output_path}/onnx_infer_out_streaming \
        --text=${BIN_DIR}/../csmsc_test.txt \
        --phones_dict=dump/phone_id_map.txt \
        --device=cpu \
        --cpu_threads=2 \
        --am_streaming=True
fi
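`ort_predict_streaming.py` takes `--device=cpu --cpu_threads=2`, which map naturally onto onnxruntime session options. A minimal sketch of opening one of the exported streaming parts; the model file name is an assumption based on the paddle2onnx stage below:

```python
import onnxruntime as ort

sess_options = ort.SessionOptions()
sess_options.intra_op_num_threads = 2  # mirrors --cpu_threads=2

# Assumed file name; paddle2onnx is invoked with fastspeech2_csmsc_am_encoder_infer.
session = ort.InferenceSession(
    "inference_onnx_streaming/fastspeech2_csmsc_am_encoder_infer.onnx",
    sess_options,
    providers=["CPUExecutionProvider"])  # mirrors --device=cpu
print([inp.name for inp in session.get_inputs()])
```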
@@ -88,5 +88,6 @@ if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
        --text=${BIN_DIR}/../sentences.txt \
        --output_dir=${train_output_path}/test_e2e_streaming \
        --phones_dict=dump/phone_id_map.txt \
-        --am_streaming=True
+        --am_streaming=True \
+        --inference_dir=${train_output_path}/inference_streaming
fi
@@ -31,18 +31,75 @@ if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+# synthesize_e2e non-streaming
if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
    # synthesize_e2e, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+# inference non-streaming
if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
    # inference with static model
    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
fi
+# synthesize_e2e streaming
if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
    # synthesize_e2e, vocoder is pwgan
    CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
fi
+# inference streaming
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    # inference with static model
+    CUDA_VISIBLE_DEVICES=${gpus} ./local/inference_streaming.sh ${train_output_path} || exit -1
+fi
+# paddle2onnx non-streaming
+if [ ${stage} -le 7 ] && [ ${stop_stage} -ge 7 ]; then
+    # install paddle2onnx
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx fastspeech2_csmsc
+    ./local/paddle2onnx.sh ${train_output_path} inference inference_onnx hifigan_csmsc
+fi
+# onnxruntime non-streaming
+# inference with onnxruntime, use fastspeech2 + hifigan by default
+if [ ${stage} -le 8 ] && [ ${stop_stage} -ge 8 ]; then
+    # install onnxruntime
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict.sh ${train_output_path}
+fi
+# paddle2onnx streaming
+if [ ${stage} -le 9 ] && [ ${stop_stage} -ge 9 ]; then
+    # install paddle2onnx
+    version=$(echo `pip list |grep "paddle2onnx"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '0.9.4' ]]; then
+        pip install paddle2onnx==0.9.4
+    fi
+    # streaming acoustic model
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_encoder_infer
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_decoder
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming fastspeech2_csmsc_am_postnet
+    # vocoder
+    ./local/paddle2onnx.sh ${train_output_path} inference_streaming inference_onnx_streaming hifigan_csmsc
+fi
+# onnxruntime streaming
+if [ ${stage} -le 10 ] && [ ${stop_stage} -ge 10 ]; then
+    # install onnxruntime
+    version=$(echo `pip list |grep "onnxruntime"` |awk -F" " '{print $2}')
+    if [[ -z "$version" || ${version} != '1.10.0' ]]; then
+        pip install onnxruntime==1.10.0
+    fi
+    ./local/ort_predict_streaming.sh ${train_output_path}
+fi
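The `pip list | grep | awk` pinning above can be checked without spawning subprocesses; a standard-library sketch of the same version gate:

```python
from importlib.metadata import PackageNotFoundError, version

def needs_install(package: str, wanted: str) -> bool:
    """True when `pip install {package}=={wanted}` should be run."""
    try:
        return version(package) != wanted
    except PackageNotFoundError:
        return True

print(needs_install("paddle2onnx", "0.9.4"))
print(needs_install("onnxruntime", "1.10.0"))
```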
@@ -27,7 +27,7 @@ arpa=$3
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
    # text tn & wordseg preprocess
    echo "process text."
-    python3 ${MAIN_ROOT}/utils/zh_tn.py ${type} ${text} ${text}.${type}.tn
+    python3 ${MAIN_ROOT}/utils/zh_tn.py --token_type ${type} ${text} ${text}.${type}.tn
fi
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
......
@@ -10,6 +10,11 @@ MD5="29e02312deb2e59b3c8686c7966d4fe3"
TARGET=${DIR}/zh_giga.no_cna_cmn.prune01244.klm

+if [ -e $TARGET ];then
+    echo "already have lm"
+    exit 0;
+fi
+
echo "Download language model ..."
download $URL $MD5 $TARGET
if [ $? -ne 0 ]; then
......
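The `download` helper invoked above is defined elsewhere in the repo; a stand-in with the same contract (fetch, then verify the MD5), written only to illustrate the check:

```python
import hashlib
import os
from urllib.request import urlretrieve

def download(url: str, md5: str, target: str) -> bool:
    """Fetch url to target and verify its MD5 checksum."""
    if not os.path.exists(target):
        urlretrieve(url, target)
    with open(target, "rb") as f:
        return hashlib.md5(f.read()).hexdigest() == md5

ok = download(
    "https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm",
    "29e02312deb2e59b3c8686c7966d4fe3",
    "zh_giga.no_cna_cmn.prune01244.klm")
```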
@@ -29,9 +29,10 @@ from ..download import get_path_from_url
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.transform.transformation import Transformation
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
@@ -39,110 +40,13 @@ from paddlespeech.s2t.utils.utility import UpdateConfig
__all__ = ['ASRExecutor']

-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "conformer_wenetspeech-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '76cb19ed857e6623856b7cd7ebbfeda4',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/conformer/checkpoints/wenetspeech',
-    },
-    "transformer_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '2c667da24922aad391eacafe37bc1660',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/transformer/checkpoints/avg_10',
-    },
-    "deepspeech2offline_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        '932c3593d62fe5c741b59b31318aa314',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
-        'md5':
-        '23e16c69730a1cb5d735c98c83c21e16',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "conformer2online_aishell-zh-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.1.2.model.tar.gz',
-        'md5':
-        '4814e52e0fc2fd48899373f95c84b0c9',
-        'cfg_path':
-        'config.yaml',
-        'ckpt_path':
-        'exp/deepspeech2_online/checkpoints/avg_30',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
-        'lm_md5':
-        '29e02312deb2e59b3c8686c7966d4fe3'
-    },
-    "deepspeech2offline_librispeech-en-16k": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
-        'md5':
-        'f5666c81ad015c8de03aac2bc92e5762',
-        'cfg_path':
-        'model.yaml',
-        'ckpt_path':
-        'exp/deepspeech2/checkpoints/avg_1',
-        'lm_url':
-        'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
-        'lm_md5':
-        '099a601759d467cd0a8523ff939819c5'
-    },
-}
-
-model_alias = {
-    "deepspeech2offline":
-    "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
-    "deepspeech2online":
-    "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
-    "conformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "conformer_online":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "transformer":
-    "paddlespeech.s2t.models.u2:U2Model",
-    "wenetspeech":
-    "paddlespeech.s2t.models.u2:U2Model",
-}
-
@cli_register(
    name='paddlespeech.asr', description='Speech to text infer command.')
class ASRExecutor(BaseExecutor):
    def __init__(self):
-        super(ASRExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.asr', add_help=True)
@@ -152,7 +56,9 @@ class ASRExecutor(BaseExecutor):
            '--model',
            type=str,
            default='conformer_wenetspeech',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of asr task.')
        self.parser.add_argument(
            '--lang',
@@ -208,23 +114,6 @@ class ASRExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
    def _init_from_path(self,
                        model_type: str='wenetspeech',
                        lang: str='zh',
@@ -245,10 +134,11 @@ class ASRExecutor(BaseExecutor):
            tag = model_type + '-' + lang + '-' + sample_rate_str
            res_path = self._get_pretrained_path(tag)  # wenetspeech_zh
            self.res_path = res_path
-            self.cfg_path = os.path.join(res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(
-                res_path, pretrained_models[tag]['ckpt_path'] + ".pdparams")
+            self.cfg_path = os.path.join(
+                res_path, self.pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                res_path,
+                self.pretrained_models[tag]['ckpt_path'] + ".pdparams")
            logger.info(res_path)
        else:
@@ -273,8 +163,8 @@ class ASRExecutor(BaseExecutor):
            self.collate_fn_test = SpeechCollator.from_config(self.config)
            self.text_feature = TextFeaturizer(
                unit_type=self.config.unit_type, vocab=self.vocab)
-            lm_url = pretrained_models[tag]['lm_url']
-            lm_md5 = pretrained_models[tag]['lm_md5']
+            lm_url = self.pretrained_models[tag]['lm_url']
+            lm_md5 = self.pretrained_models[tag]['lm_md5']
            self.download_lm(
                lm_url,
                os.path.dirname(self.config.decode.lang_model_path), lm_md5)
@@ -291,7 +181,7 @@ class ASRExecutor(BaseExecutor):
            raise Exception("wrong type")
        model_name = model_type[:model_type.rindex(
            '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        model_conf = self.config
        model = model_class.from_config(model_conf)
        self.model = model
......
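The `model_alias` values follow a `"module.path:ClassName"` convention that `dynamic_import` resolves at run time. A minimal re-implementation of the idea (the real helper lives in `paddlespeech.s2t.utils.dynamic_import` and may differ in detail):

```python
import importlib

def dynamic_import_sketch(name: str, alias: dict):
    """Resolve a short model name to a class via 'module.path:ClassName'."""
    module_path, class_name = alias[name].split(":")
    return getattr(importlib.import_module(module_path), class_name)

# e.g. dynamic_import_sketch("conformer", model_alias) returns U2Model.
```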
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
    # e.g. "conformer_wenetspeech-zh-16k" and "panns_cnn6-32k".
    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
    "conformer_wenetspeech-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/wenetspeech/asr1_conformer_wenetspeech_ckpt_0.1.1.model.tar.gz',
        'md5':
        '76cb19ed857e6623856b7cd7ebbfeda4',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/conformer/checkpoints/wenetspeech',
    },
    "transformer_librispeech-en-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_transformer_librispeech_ckpt_0.1.1.model.tar.gz',
        'md5':
        '2c667da24922aad391eacafe37bc1660',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/transformer/checkpoints/avg_10',
    },
    "deepspeech2offline_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz',
        'md5':
        '932c3593d62fe5c741b59b31318aa314',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2/checkpoints/avg_1',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
    "deepspeech2online_aishell-zh-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz',
        'md5':
        '23e16c69730a1cb5d735c98c83c21e16',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2_online/checkpoints/avg_1',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
        'lm_md5':
        '29e02312deb2e59b3c8686c7966d4fe3'
    },
    "deepspeech2offline_librispeech-en-16k": {
        'url':
        'https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz',
        'md5':
        'f5666c81ad015c8de03aac2bc92e5762',
        'cfg_path':
        'model.yaml',
        'ckpt_path':
        'exp/deepspeech2/checkpoints/avg_1',
        'lm_url':
        'https://deepspeech.bj.bcebos.com/en_lm/common_crawl_00.prune01111.trie.klm',
        'lm_md5':
        '099a601759d467cd0a8523ff939819c5'
    },
}

model_alias = {
    "deepspeech2offline":
    "paddlespeech.s2t.models.ds2:DeepSpeech2Model",
    "deepspeech2online":
    "paddlespeech.s2t.models.ds2_online:DeepSpeech2ModelOnline",
    "conformer":
    "paddlespeech.s2t.models.u2:U2Model",
    "transformer":
    "paddlespeech.s2t.models.u2:U2Model",
    "wenetspeech":
    "paddlespeech.s2t.models.u2:U2Model",
}
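The tag convention in the comment above is what lets the CLI derive its `--model` choices and then rebuild the full lookup key; mirroring `tag = model_type + '-' + lang + '-' + sample_rate_str` from the executor:

```python
tag = "conformer_wenetspeech-zh-16k"

# --model accepts everything before the first '-' ...
model_type = tag[:tag.index("-")]              # "conformer_wenetspeech"
# ... and the executor reassembles the tag from --lang and --sr.
lang, sample_rate_str = tag[tag.index("-") + 1:].split("-")
assert model_type + "-" + lang + "-" + sample_rate_str == tag
```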
@@ -25,55 +25,23 @@ import yaml
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddleaudio import load
from paddleaudio.features import LogMelSpectrogram
from paddlespeech.s2t.utils.dynamic_import import dynamic_import

__all__ = ['CLSExecutor']

-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "panns_cnn6-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
-        'md5': '4cf09194a95df024fd12f84712cf0f9c',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn6.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn10-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
-        'md5': 'cb8427b22176cc2116367d14847f5413',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn10.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-    "panns_cnn14-32k": {
-        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
-        'md5': 'e3b9b5614a1595001161d0ab95edee97',
-        'cfg_path': 'panns.yaml',
-        'ckpt_path': 'cnn14.pdparams',
-        'label_file': 'audioset_labels.txt',
-    },
-}
-
-model_alias = {
-    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
-    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
-    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
-}
-
@cli_register(
    name='paddlespeech.cls', description='Audio classification infer command.')
class CLSExecutor(BaseExecutor):
    def __init__(self):
-        super(CLSExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.cls', add_help=True)
@@ -83,7 +51,9 @@ class CLSExecutor(BaseExecutor):
            '--model',
            type=str,
            default='panns_cnn14',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of cls task.')
        self.parser.add_argument(
            '--config',
@@ -121,23 +91,6 @@ class CLSExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
    def _init_from_path(self,
                        model_type: str='panns_cnn14',
                        cfg_path: Optional[os.PathLike]=None,
@@ -153,12 +106,12 @@ class CLSExecutor(BaseExecutor):
        if label_file is None or ckpt_path is None:
            tag = model_type + '-' + '32k'  # panns_cnn14-32k
            self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.label_file = os.path.join(self.res_path,
-                                           pretrained_models[tag]['label_file'])
-            self.ckpt_path = os.path.join(self.res_path,
-                                          pretrained_models[tag]['ckpt_path'])
+            self.cfg_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
+            self.label_file = os.path.join(
+                self.res_path, self.pretrained_models[tag]['label_file'])
+            self.ckpt_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['ckpt_path'])
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.label_file = os.path.abspath(label_file)
@@ -175,7 +128,7 @@ class CLSExecutor(BaseExecutor):
                self._label_list.append(line.strip())

        # model
-        model_class = dynamic_import(model_type, model_alias)
+        model_class = dynamic_import(model_type, self.model_alias)
        model_dict = paddle.load(self.ckpt_path)
        self.model = model_class(extract_embedding=False)
        self.model.set_state_dict(model_dict)
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
    "panns_cnn6-32k": {
        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn6.tar.gz',
        'md5': '4cf09194a95df024fd12f84712cf0f9c',
        'cfg_path': 'panns.yaml',
        'ckpt_path': 'cnn6.pdparams',
        'label_file': 'audioset_labels.txt',
    },
    "panns_cnn10-32k": {
        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn10.tar.gz',
        'md5': 'cb8427b22176cc2116367d14847f5413',
        'cfg_path': 'panns.yaml',
        'ckpt_path': 'cnn10.pdparams',
        'label_file': 'audioset_labels.txt',
    },
    "panns_cnn14-32k": {
        'url': 'https://paddlespeech.bj.bcebos.com/cls/panns_cnn14.tar.gz',
        'md5': 'e3b9b5614a1595001161d0ab95edee97',
        'cfg_path': 'panns.yaml',
        'ckpt_path': 'cnn14.pdparams',
        'label_file': 'audioset_labels.txt',
    },
}

model_alias = {
    "panns_cnn6": "paddlespeech.cls.models.panns:CNN6",
    "panns_cnn10": "paddlespeech.cls.models.panns:CNN10",
    "panns_cnn14": "paddlespeech.cls.models.panns:CNN14",
}
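Combining the alias table with a checkpoint's `ckpt_path` field, loading one of these classifiers follows the pattern in `CLSExecutor._init_from_path`; a condensed sketch, with a placeholder path standing in for the decompressed resource:

```python
import paddle
from paddlespeech.s2t.utils.dynamic_import import dynamic_import

model_class = dynamic_import("panns_cnn14", model_alias)
model = model_class(extract_embedding=False)
model.set_state_dict(paddle.load("cnn14.pdparams"))  # placeholder path
model.eval()
```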
@@ -25,6 +25,8 @@ from typing import Union
import paddle

from .log import logger
+from .utils import download_and_decompress
+from .utils import MODEL_HOME


class BaseExecutor(ABC):
@@ -35,19 +37,8 @@ class BaseExecutor(ABC):
    def __init__(self):
        self._inputs = OrderedDict()
        self._outputs = OrderedDict()
+        self.pretrained_models = OrderedDict()
+        self.model_alias = OrderedDict()

-    @abstractmethod
-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-
-        Args:
-            tag (str): A tag of pretrained model.
-
-        Returns:
-            os.PathLike: The path on which resources of pretrained model locate.
-        """
-        pass
-
    @abstractmethod
    def _init_from_path(self, *args, **kwargs):
@@ -227,3 +218,20 @@ class BaseExecutor(ABC):
        ]
        for l in loggers:
            l.disabled = True
+
+    def _get_pretrained_path(self, tag: str) -> os.PathLike:
+        """
+        Download and returns pretrained resources path of current task.
+        """
+        support_models = list(self.pretrained_models.keys())
+        assert tag in self.pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
+            tag, '\n\t\t'.join(support_models))
+        res_path = os.path.join(MODEL_HOME, tag)
+        decompressed_path = download_and_decompress(self.pretrained_models[tag],
+                                                    res_path)
+        decompressed_path = os.path.abspath(decompressed_path)
+        logger.info(
+            'Use pretrained model stored in: {}'.format(decompressed_path))
+        return decompressed_path
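With the hook now on the base class, any executor that fills `self.pretrained_models` in its `__init__` inherits the download-and-cache resolution; sketched usage (tag and import path are illustrative):

```python
from paddlespeech.cli.cls.infer import CLSExecutor

executor = CLSExecutor()
# Resolves the tag against executor.pretrained_models, downloads into
# MODEL_HOME/<tag> on first use, and returns the decompressed path.
res_path = executor._get_pretrained_path("panns_cnn14-32k")
```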
@@ -32,40 +32,24 @@ from ..utils import cli_register
from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import kaldi_bins
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import UpdateConfig

__all__ = ["STExecutor"]

-pretrained_models = {
-    "fat_st_ted-en-zh": {
-        "url":
-        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
-        "md5":
-        "d62063f35a16d91210a71081bd2dd557",
-        "cfg_path":
-        "model.yaml",
-        "ckpt_path":
-        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
-    }
-}
-
-model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}
-
-kaldi_bins = {
-    "url":
-    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
-    "md5":
-    "c0682303b3f3393dbf6ed4c4e35a53eb",
-}
-
@cli_register(
    name="paddlespeech.st", description="Speech translation infer command.")
class STExecutor(BaseExecutor):
    def __init__(self):
-        super(STExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
+        self.kaldi_bins = kaldi_bins
        self.parser = argparse.ArgumentParser(
            prog="paddlespeech.st", add_help=True)
@@ -75,7 +59,9 @@ class STExecutor(BaseExecutor):
            "--model",
            type=str,
            default="fat_st_ted",
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help="Choose model type of st task.")
        self.parser.add_argument(
            "--src_lang",
@@ -119,28 +105,11 @@ class STExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            "Use pretrained model stored in: {}".format(decompressed_path))
-        return decompressed_path
-
    def _set_kaldi_bins(self) -> os.PathLike:
        """
        Download and returns kaldi_bins resources path of current task.
        """
-        decompressed_path = download_and_decompress(kaldi_bins, MODEL_HOME)
+        decompressed_path = download_and_decompress(self.kaldi_bins, MODEL_HOME)
        decompressed_path = os.path.abspath(decompressed_path)
        logger.info("Kaldi_bins stored in: {}".format(decompressed_path))
        if "LD_LIBRARY_PATH" in os.environ:
@@ -197,7 +166,7 @@ class STExecutor(BaseExecutor):
        model_conf = self.config
        model_name = model_type[:model_type.rindex(
            '_')]  # model_type: {model_name}_{dataset}
-        model_class = dynamic_import(model_name, model_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
        self.model = model_class.from_config(model_conf)
        self.model.eval()
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    "fat_st_ted-en-zh": {
        "url":
        "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/st1_transformer_mtl_noam_ted-en-zh_ckpt_0.1.1.model.tar.gz",
        "md5":
        "d62063f35a16d91210a71081bd2dd557",
        "cfg_path":
        "model.yaml",
        "ckpt_path":
        "exp/transformer_mtl_noam/checkpoints/fat_st_ted-en-zh.pdparams",
    }
}

model_alias = {"fat_st": "paddlespeech.s2t.models.u2_st:U2STModel"}

kaldi_bins = {
    "url":
    "https://paddlespeech.bj.bcebos.com/s2t/ted_en_zh/st1/kaldi_bins.tar.gz",
    "md5":
    "c0682303b3f3393dbf6ed4c4e35a53eb",
}
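`_set_kaldi_bins` in the executor prepends the decompressed `kaldi_bins` directory to the process environment; only the `LD_LIBRARY_PATH` branch is visible in this diff, so the following is an assumed reconstruction of the core manipulation:

```python
import os

kaldi_dir = "/path/to/MODEL_HOME/kaldi_bins"  # illustrative location
old = os.environ.get("LD_LIBRARY_PATH", "")
os.environ["LD_LIBRARY_PATH"] = kaldi_dir + (":" + old if old else "")
os.environ["PATH"] = kaldi_dir + ":" + os.environ["PATH"]
```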
@@ -16,7 +16,6 @@ from typing import List
from prettytable import PrettyTable

-from ..log import logger
from ..utils import cli_register
from ..utils import stats_wrapper
@@ -27,7 +26,8 @@ model_name_format = {
    'cls': 'Model-Sample Rate',
    'st': 'Model-Source language-Target language',
    'text': 'Model-Task-Language',
-    'tts': 'Model-Language'
+    'tts': 'Model-Language',
+    'vector': 'Model-Sample Rate'
}
@@ -36,18 +36,18 @@ model_name_format = {
    description='Get speech tasks support models list.')
class StatsExecutor():
    def __init__(self):
-        super(StatsExecutor, self).__init__()
+        super().__init__()
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.stats', add_help=True)
+        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts', 'vector']
        self.parser.add_argument(
            '--task',
            type=str,
            default='asr',
-            choices=['asr', 'cls', 'st', 'text', 'tts'],
+            choices=self.task_choices,
            help='Choose speech task.',
            required=True)
-        self.task_choices = ['asr', 'cls', 'st', 'text', 'tts']

    def show_support_models(self, pretrained_models: dict):
        fields = model_name_format[self.task].split("-")
@@ -61,73 +61,15 @@ class StatsExecutor():
        Command line entry.
        """
        parser_args = self.parser.parse_args(argv)
-        self.task = parser_args.task
-        if self.task not in self.task_choices:
-            logger.error(
-                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
-            )
-            return False
-
-        elif self.task == 'asr':
-            try:
-                from ..asr.infer import pretrained_models
-                logger.info(
-                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of ASR pretrained models.")
-                return False
-
-        elif self.task == 'cls':
-            try:
-                from ..cls.infer import pretrained_models
-                logger.info(
-                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of CLS pretrained models.")
-                return False
-
-        elif self.task == 'st':
-            try:
-                from ..st.infer import pretrained_models
-                logger.info(
-                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of ST pretrained models.")
-                return False
-
-        elif self.task == 'text':
-            try:
-                from ..text.infer import pretrained_models
-                logger.info(
-                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error(
-                    "Failed to get the list of TEXT pretrained models.")
-                return False
-
-        elif self.task == 'tts':
-            try:
-                from ..tts.infer import pretrained_models
-                logger.info(
-                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
-                )
-                self.show_support_models(pretrained_models)
-                return True
-            except BaseException:
-                logger.error("Failed to get the list of TTS pretrained models.")
-                return False
+        has_exceptions = False
+        try:
+            self(parser_args.task)
+        except Exception as e:
+            has_exceptions = True
+        if has_exceptions:
+            return False
+        else:
+            return True

    @stats_wrapper
    def __call__(
@@ -138,13 +80,12 @@ class StatsExecutor():
        """
        self.task = task
        if self.task not in self.task_choices:
-            print(
-                "Please input correct speech task, choices = ['asr', 'cls', 'st', 'text', 'tts']"
-            )
+            print("Please input correct speech task, choices = " + str(
+                self.task_choices))

        elif self.task == 'asr':
            try:
-                from ..asr.infer import pretrained_models
+                from ..asr.pretrained_models import pretrained_models
                print(
                    "Here is the list of ASR pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -154,7 +95,7 @@ class StatsExecutor():
        elif self.task == 'cls':
            try:
-                from ..cls.infer import pretrained_models
+                from ..cls.pretrained_models import pretrained_models
                print(
                    "Here is the list of CLS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -164,7 +105,7 @@ class StatsExecutor():
        elif self.task == 'st':
            try:
-                from ..st.infer import pretrained_models
+                from ..st.pretrained_models import pretrained_models
                print(
                    "Here is the list of ST pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -174,7 +115,7 @@ class StatsExecutor():
        elif self.task == 'text':
            try:
-                from ..text.infer import pretrained_models
+                from ..text.pretrained_models import pretrained_models
                print(
                    "Here is the list of TEXT pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
@@ -184,10 +125,22 @@ class StatsExecutor():
        elif self.task == 'tts':
            try:
-                from ..tts.infer import pretrained_models
+                from ..tts.pretrained_models import pretrained_models
                print(
                    "Here is the list of TTS pretrained models released by PaddleSpeech that can be used by command line and python API"
                )
                self.show_support_models(pretrained_models)
            except BaseException:
                print("Failed to get the list of TTS pretrained models.")
+
+        elif self.task == 'vector':
+            try:
+                from ..vector.pretrained_models import pretrained_models
+                print(
+                    "Here is the list of Speaker Recognition pretrained models released by PaddleSpeech that can be used by command line and python API"
+                )
+                self.show_support_models(pretrained_models)
+            except BaseException:
+                print(
+                    "Failed to get the list of Speaker Recognition pretrained models."
+                )
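`show_support_models` splits the `model_name_format` entry into column headers and lists one row per tag; its body is collapsed in this diff, so the following rendering sketch is an assumption about the details:

```python
from prettytable import PrettyTable

model_name_format = {'cls': 'Model-Sample Rate'}
pretrained_models = {"panns_cnn6-32k": {}, "panns_cnn14-32k": {}}

fields = model_name_format['cls'].split("-")
table = PrettyTable(fields)
for tag in pretrained_models:
    table.add_row(tag.split("-", len(fields) - 1))
print(table)
```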
@@ -25,58 +25,21 @@ from ...s2t.utils.dynamic_import import dynamic_import
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
+from .pretrained_models import tokenizer_alias

__all__ = ['TextExecutor']

-pretrained_models = {
-    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
-    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
-    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
-    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
-    "ernie_linear_p7_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
-        'md5':
-        '12283e2ddde1797c5d1e57036b512746',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-    "ernie_linear_p3_wudao-punc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
-        'md5':
-        '448eb2fdf85b6a997e7e652e80c51dd2',
-        'cfg_path':
-        'ckpt/model_config.json',
-        'ckpt_path':
-        'ckpt/model_state.pdparams',
-        'vocab_file':
-        'punc_vocab.txt',
-    },
-}
-
-model_alias = {
-    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
-    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
-}
-
-tokenizer_alias = {
-    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
-    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
-}
-
@cli_register(name='paddlespeech.text', description='Text infer command.')
class TextExecutor(BaseExecutor):
    def __init__(self):
-        super(TextExecutor, self).__init__()
+        super().__init__()
+        self.model_alias = model_alias
+        self.pretrained_models = pretrained_models
+        self.tokenizer_alias = tokenizer_alias
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.text', add_help=True)
@@ -92,7 +55,9 @@ class TextExecutor(BaseExecutor):
            '--model',
            type=str,
            default='ernie_linear_p7_wudao',
-            choices=[tag[:tag.index('-')] for tag in pretrained_models.keys()],
+            choices=[
+                tag[:tag.index('-')] for tag in self.pretrained_models.keys()
+            ],
            help='Choose model type of text task.')
        self.parser.add_argument(
            '--lang',
@@ -131,23 +96,6 @@ class TextExecutor(BaseExecutor):
            action='store_true',
            help='Increase logger verbosity of current task.')

-    def _get_pretrained_path(self, tag: str) -> os.PathLike:
-        """
-        Download and returns pretrained resources path of current task.
-        """
-        support_models = list(pretrained_models.keys())
-        assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
-            tag, '\n\t\t'.join(support_models))
-        res_path = os.path.join(MODEL_HOME, tag)
-        decompressed_path = download_and_decompress(pretrained_models[tag],
-                                                    res_path)
-        decompressed_path = os.path.abspath(decompressed_path)
-        logger.info(
-            'Use pretrained model stored in: {}'.format(decompressed_path))
-        return decompressed_path
-
    def _init_from_path(self,
                        task: str='punc',
                        model_type: str='ernie_linear_p7_wudao',
@@ -167,12 +115,12 @@ class TextExecutor(BaseExecutor):
        if cfg_path is None or ckpt_path is None or vocab_file is None:
            tag = '-'.join([model_type, task, lang])
            self.res_path = self._get_pretrained_path(tag)
-            self.cfg_path = os.path.join(self.res_path,
-                                         pretrained_models[tag]['cfg_path'])
-            self.ckpt_path = os.path.join(self.res_path,
-                                          pretrained_models[tag]['ckpt_path'])
-            self.vocab_file = os.path.join(self.res_path,
-                                           pretrained_models[tag]['vocab_file'])
+            self.cfg_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['cfg_path'])
+            self.ckpt_path = os.path.join(
+                self.res_path, self.pretrained_models[tag]['ckpt_path'])
+            self.vocab_file = os.path.join(
+                self.res_path, self.pretrained_models[tag]['vocab_file'])
        else:
            self.cfg_path = os.path.abspath(cfg_path)
            self.ckpt_path = os.path.abspath(ckpt_path)
@@ -187,8 +135,8 @@ class TextExecutor(BaseExecutor):
            self._punc_list.append(line.strip())

        # model
-        model_class = dynamic_import(model_name, model_alias)
-        tokenizer_class = dynamic_import(model_name, tokenizer_alias)
+        model_class = dynamic_import(model_name, self.model_alias)
+        tokenizer_class = dynamic_import(model_name, self.tokenizer_alias)
        self.model = model_class(
            cfg_path=self.cfg_path, ckpt_path=self.ckpt_path)
        self.tokenizer = tokenizer_class.from_pretrained('ernie-1.0')
......
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
    # The tags for pretrained_models should be "{model_name}[_{dataset}][-{lang}][-...]".
    # e.g. "conformer_wenetspeech-zh-16k", "transformer_aishell-zh-16k" and "panns_cnn6-32k".
    # Command line and python api use "{model_name}[_{dataset}]" as --model, usage:
    # "paddlespeech asr --model conformer_wenetspeech --lang zh --sr 16000 --input ./input.wav"
    "ernie_linear_p7_wudao-punc-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p7_wudao-punc-zh.tar.gz',
        'md5':
        '12283e2ddde1797c5d1e57036b512746',
        'cfg_path':
        'ckpt/model_config.json',
        'ckpt_path':
        'ckpt/model_state.pdparams',
        'vocab_file':
        'punc_vocab.txt',
    },
    "ernie_linear_p3_wudao-punc-zh": {
        'url':
        'https://paddlespeech.bj.bcebos.com/text/ernie_linear_p3_wudao-punc-zh.tar.gz',
        'md5':
        '448eb2fdf85b6a997e7e652e80c51dd2',
        'cfg_path':
        'ckpt/model_config.json',
        'ckpt_path':
        'ckpt/model_state.pdparams',
        'vocab_file':
        'punc_vocab.txt',
    },
}

model_alias = {
    "ernie_linear_p7": "paddlespeech.text.models:ErnieLinear",
    "ernie_linear_p3": "paddlespeech.text.models:ErnieLinear",
}

tokenizer_alias = {
    "ernie_linear_p7": "paddlenlp.transformers:ErnieTokenizer",
    "ernie_linear_p3": "paddlenlp.transformers:ErnieTokenizer",
}
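For the text task the tag carries an extra task segment, assembled in `_init_from_path` as `'-'.join([model_type, task, lang])`; for example:

```python
model_type, task, lang = "ernie_linear_p7_wudao", "punc", "zh"
tag = "-".join([model_type, task, lang])  # "ernie_linear_p7_wudao-punc-zh"
info = pretrained_models[tag]
print(info["url"], info["vocab_file"])
```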
@@ -29,9 +29,9 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
-from ..utils import download_and_decompress
-from ..utils import MODEL_HOME
from ..utils import stats_wrapper
+from .pretrained_models import model_alias
+from .pretrained_models import pretrained_models
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.frontend import English
from paddlespeech.t2s.frontend.zh_frontend import Frontend
@@ -39,299 +39,14 @@ from paddlespeech.t2s.modules.normalizer import ZScore
__all__ = ['TTSExecutor']

-pretrained_models = {
-    # speedyspeech
-    "speedyspeech_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '6f6fa967b408454b6662c8c00c0027cb',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'tones_dict':
-        'tone_id_map.txt',
-    },
-    # fastspeech2
-    "fastspeech2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
-        'md5':
-        '637d28a5e53aa60275612ba4393d5f22',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_76000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
-        'md5':
-        'ffed800c93deaf16ca9b3af89bfcd747',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_100000.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "fastspeech2_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
-        'md5':
-        'f4dd4a5f49a4552b77981f544ab3392e',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_96400.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    "fastspeech2_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
-        'md5':
-        '743e5024ca1e17a88c5c271db9779ba4',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_66200.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-        'speaker_dict':
-        'speaker_id_map.txt',
-    },
-    # tacotron2
-    "tacotron2_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
-        'md5':
-        '0df4b6f0bcbe0d73c5ed6df8867ab91a',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_30600.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    "tacotron2_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
-        'md5':
-        '6a5eddd81ae0e81d16959b97481135f3',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_60300.pdz',
-        'speech_stats':
-        'speech_stats.npy',
-        'phones_dict':
-        'phone_id_map.txt',
-    },
-    # pwgan
-    "pwgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
-        'md5':
-        '2e481633325b5bdf0a3823c714d2c117',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_ljspeech-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
-        'md5':
-        '53610ba9708fd3008ccaf8e99dacbaf0',
-        'config':
-        'pwg_default.yaml',
-        'ckpt':
-        'pwg_snapshot_iter_400000.pdz',
-        'speech_stats':
-        'pwg_stats.npy',
-    },
-    "pwgan_aishell3-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
-        'md5':
-        'd7598fa41ad362d62f85ffc0f07e3d84',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    "pwgan_vctk-en": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
-        'md5':
-        'b3da1defcde3e578be71eb284cb89f2c',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1500000.pdz',
-        'speech_stats':
-        'feats_stats.npy',
-    },
-    # mb_melgan
-    "mb_melgan_csmsc-zh": {
-        'url':
-        'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
-        'md5':
-        'ee5f0604e20091f0d495b6ec4618b90d',
-        'config':
-        'default.yaml',
-        'ckpt':
-        'snapshot_iter_1000000.pdz',
-        'speech_stats':
'feats_stats.npy',
},
# style_melgan
"style_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'5de2d5348f396de0c966926b8c462755',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
'md5':
'dd40a3d88dfcf64513fba2f0f961ada6',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
'md5':
'70e9131695decbca06a65fe51ed38a72',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
'md5':
'3bb49bc75032ed12f79c00c8cc79a09a',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
'md5':
'7da8f88359bca2457e705d924cf27bd4',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# wavernn
"wavernn_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
'md5':
'ee37b752f09bcba8f2af3b777ca38e13',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_400000.pdz',
'speech_stats':
'feats_stats.npy',
}
}
model_alias = {
# acoustic model
"speedyspeech":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
"speedyspeech_inference":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
@cli_register(
    name='paddlespeech.tts', description='Text to Speech infer command.')
class TTSExecutor(BaseExecutor):
    def __init__(self):
        super().__init__()
        self.model_alias = model_alias
        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog='paddlespeech.tts', add_help=True)
...@@ -449,22 +164,6 @@ class TTSExecutor(BaseExecutor):
        action='store_true',
        help='Increase logger verbosity of current task.')
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""
        Download and return the pretrained resources path for the current task.
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, 'The model "{}" you want to use has not been supported, please choose other models.\nThe support models includes:\n\t\t{}\n'.format(
tag, '\n\t\t'.join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
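
Both executors drop their private copy of _get_pretrained_path in this commit while registering self.pretrained_models, which suggests the download logic now lives once in the shared BaseExecutor. A sketch of what such a shared helper plausibly looks like, assuming the same MODEL_HOME and download_and_decompress utilities (an assumption, not shown in this diff):

    def _get_pretrained_path(self, tag: str) -> os.PathLike:
        # hypothetical shared version keyed off self.pretrained_models
        assert tag in self.pretrained_models, \
            'The model "{}" is not supported.'.format(tag)
        res_path = os.path.join(MODEL_HOME, tag)
        decompressed_path = download_and_decompress(
            self.pretrained_models[tag], res_path)
        return os.path.abspath(decompressed_path)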
    def _init_from_path(
            self,
            am: str='fastspeech2_csmsc',
...@@ -490,16 +189,15 @@ class TTSExecutor(BaseExecutor):
        if am_ckpt is None or am_config is None or am_stat is None or phones_dict is None:
            am_res_path = self._get_pretrained_path(am_tag)
            self.am_res_path = am_res_path
            self.am_config = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['config'])
            self.am_ckpt = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['ckpt'])
            self.am_stat = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['speech_stats'])
            # must have phones_dict in acoustic
            self.phones_dict = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['phones_dict'])
            print("self.phones_dict:", self.phones_dict)
            logger.info(am_res_path)
            logger.info(self.am_config)
            logger.info(self.am_ckpt)
...@@ -509,21 +207,20 @@ class TTSExecutor(BaseExecutor):
            self.am_stat = os.path.abspath(am_stat)
            self.phones_dict = os.path.abspath(phones_dict)
            self.am_res_path = os.path.dirname(os.path.abspath(self.am_config))
            print("self.phones_dict:", self.phones_dict)

        # for speedyspeech
        self.tones_dict = None
        if 'tones_dict' in self.pretrained_models[am_tag]:
            self.tones_dict = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['tones_dict'])
            if tones_dict:
                self.tones_dict = tones_dict

        # for multi speaker fastspeech2
        self.speaker_dict = None
        if 'speaker_dict' in self.pretrained_models[am_tag]:
            self.speaker_dict = os.path.join(
                am_res_path, self.pretrained_models[am_tag]['speaker_dict'])
            if speaker_dict:
                self.speaker_dict = speaker_dict
...@@ -532,12 +229,12 @@ class TTSExecutor(BaseExecutor):
        if voc_ckpt is None or voc_config is None or voc_stat is None:
            voc_res_path = self._get_pretrained_path(voc_tag)
            self.voc_res_path = voc_res_path
            self.voc_config = os.path.join(
                voc_res_path, self.pretrained_models[voc_tag]['config'])
            self.voc_ckpt = os.path.join(
                voc_res_path, self.pretrained_models[voc_tag]['ckpt'])
            self.voc_stat = os.path.join(
                voc_res_path, self.pretrained_models[voc_tag]['speech_stats'])
            logger.info(voc_res_path)
            logger.info(self.voc_config)
            logger.info(self.voc_ckpt)
...@@ -588,8 +285,9 @@ class TTSExecutor(BaseExecutor):
        # model: {model_name}_{dataset}
        am_name = am[:am.rindex('_')]
        am_class = dynamic_import(am_name, self.model_alias)
        am_inference_class = dynamic_import(am_name + '_inference',
                                            self.model_alias)
        if am_name == 'fastspeech2':
            am = am_class(
...@@ -618,9 +316,9 @@ class TTSExecutor(BaseExecutor):
        # vocoder
        # model: {model_name}_{dataset}
        voc_name = voc[:voc.rindex('_')]
        voc_class = dynamic_import(voc_name, self.model_alias)
        voc_inference_class = dynamic_import(voc_name + '_inference',
                                             self.model_alias)
        if voc_name != 'wavernn':
            voc = voc_class(**self.voc_config["generator_params"])
            voc.set_state_dict(paddle.load(self.voc_ckpt)["generator_params"])
...@@ -735,7 +433,6 @@ class TTSExecutor(BaseExecutor):
        am_ckpt = args.am_ckpt
        am_stat = args.am_stat
        phones_dict = args.phones_dict
        print("phones_dict:", phones_dict)
        tones_dict = args.tones_dict
        speaker_dict = args.speaker_dict
        voc = args.voc
...
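
For reference, the tags looked up above follow the "{model_name}_{dataset}-{lang}" convention from pretrained_models, so a lookup key can be formed like this (illustrative helper only; the executor builds the tag inline from --am/--voc and --lang):

def build_tag(model: str, lang: str) -> str:
    # "fastspeech2_csmsc" + "zh" -> "fastspeech2_csmsc-zh"
    return '{}-{}'.format(model, lang)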
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
# speedyspeech
"speedyspeech_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/speedyspeech/speedyspeech_csmsc_ckpt_0.2.0.zip',
'md5':
'6f6fa967b408454b6662c8c00c0027cb',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_30600.pdz',
'speech_stats':
'feats_stats.npy',
'phones_dict':
'phone_id_map.txt',
'tones_dict':
'tone_id_map.txt',
},
# fastspeech2
"fastspeech2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip',
'md5':
'637d28a5e53aa60275612ba4393d5f22',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_76000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"fastspeech2_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_ljspeech_ckpt_0.5.zip',
'md5':
'ffed800c93deaf16ca9b3af89bfcd747',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_100000.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"fastspeech2_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_aishell3_ckpt_0.4.zip',
'md5':
'f4dd4a5f49a4552b77981f544ab3392e',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_96400.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
},
"fastspeech2_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_vctk_ckpt_0.5.zip',
'md5':
'743e5024ca1e17a88c5c271db9779ba4',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_66200.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
'speaker_dict':
'speaker_id_map.txt',
},
# tacotron2
"tacotron2_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_csmsc_ckpt_0.2.0.zip',
'md5':
'0df4b6f0bcbe0d73c5ed6df8867ab91a',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_30600.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
"tacotron2_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/tacotron2/tacotron2_ljspeech_ckpt_0.2.0.zip',
'md5':
'6a5eddd81ae0e81d16959b97481135f3',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_60300.pdz',
'speech_stats':
'speech_stats.npy',
'phones_dict':
'phone_id_map.txt',
},
# pwgan
"pwgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_baker_ckpt_0.4.zip',
'md5':
'2e481633325b5bdf0a3823c714d2c117',
'config':
'pwg_default.yaml',
'ckpt':
'pwg_snapshot_iter_400000.pdz',
'speech_stats':
'pwg_stats.npy',
},
"pwgan_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_ljspeech_ckpt_0.5.zip',
'md5':
'53610ba9708fd3008ccaf8e99dacbaf0',
'config':
'pwg_default.yaml',
'ckpt':
'pwg_snapshot_iter_400000.pdz',
'speech_stats':
'pwg_stats.npy',
},
"pwgan_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_aishell3_ckpt_0.5.zip',
'md5':
'd7598fa41ad362d62f85ffc0f07e3d84',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
"pwgan_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/pwgan/pwg_vctk_ckpt_0.1.1.zip',
'md5':
'b3da1defcde3e578be71eb284cb89f2c',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# mb_melgan
"mb_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'ee5f0604e20091f0d495b6ec4618b90d',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1000000.pdz',
'speech_stats':
'feats_stats.npy',
},
# style_melgan
"style_melgan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip',
'md5':
'5de2d5348f396de0c966926b8c462755',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_1500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# hifigan
"hifigan_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip',
'md5':
'dd40a3d88dfcf64513fba2f0f961ada6',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_ljspeech-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
'md5':
'70e9131695decbca06a65fe51ed38a72',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
'md5':
'3bb49bc75032ed12f79c00c8cc79a09a',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
"hifigan_vctk-en": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip',
'md5':
'7da8f88359bca2457e705d924cf27bd4',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_2500000.pdz',
'speech_stats':
'feats_stats.npy',
},
# wavernn
"wavernn_csmsc-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip',
'md5':
'ee37b752f09bcba8f2af3b777ca38e13',
'config':
'default.yaml',
'ckpt':
'snapshot_iter_400000.pdz',
'speech_stats':
'feats_stats.npy',
}
}
model_alias = {
# acoustic model
"speedyspeech":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeech",
"speedyspeech_inference":
"paddlespeech.t2s.models.speedyspeech:SpeedySpeechInference",
"fastspeech2":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2",
"fastspeech2_inference":
"paddlespeech.t2s.models.fastspeech2:FastSpeech2Inference",
"tacotron2":
"paddlespeech.t2s.models.tacotron2:Tacotron2",
"tacotron2_inference":
"paddlespeech.t2s.models.tacotron2:Tacotron2Inference",
# voc
"pwgan":
"paddlespeech.t2s.models.parallel_wavegan:PWGGenerator",
"pwgan_inference":
"paddlespeech.t2s.models.parallel_wavegan:PWGInference",
"mb_melgan":
"paddlespeech.t2s.models.melgan:MelGANGenerator",
"mb_melgan_inference":
"paddlespeech.t2s.models.melgan:MelGANInference",
"style_melgan":
"paddlespeech.t2s.models.melgan:StyleMelGANGenerator",
"style_melgan_inference":
"paddlespeech.t2s.models.melgan:StyleMelGANInference",
"hifigan":
"paddlespeech.t2s.models.hifigan:HiFiGANGenerator",
"hifigan_inference":
"paddlespeech.t2s.models.hifigan:HiFiGANInference",
"wavernn":
"paddlespeech.t2s.models.wavernn:WaveRNN",
"wavernn_inference":
"paddlespeech.t2s.models.wavernn:WaveRNNInference",
}
...@@ -27,45 +27,24 @@ from yacs.config import CfgNode
from ..executor import BaseExecutor
from ..log import logger
from ..utils import cli_register
from ..utils import download_and_decompress
from ..utils import MODEL_HOME
from ..utils import stats_wrapper
from .pretrained_models import model_alias
from .pretrained_models import pretrained_models
from paddleaudio.backends import load as load_audio
from paddleaudio.compliance.librosa import melspectrogram
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.vector.io.batch import feature_normalize
from paddlespeech.vector.modules.sid_model import SpeakerIdetification
pretrained_models = {
# The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
# e.g. "ecapatdnn_voxceleb12-16k".
# Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
# "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
"ecapatdnn_voxceleb12-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
'md5':
'cc33023c54ab346cd318408f43fcaf95',
'cfg_path':
'conf/model.yaml', # the yaml config path
'ckpt_path':
'model/model', # the format is ${dir}/{model_name},
# so the first 'model' is dir, the second 'model' is the name
# this means we have a model stored as model/model.pdparams
},
}
model_alias = {
"ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
}
@cli_register(
    name="paddlespeech.vector",
    description="Speech to vector embedding infer command.")
class VectorExecutor(BaseExecutor):
    def __init__(self):
        super().__init__()
        self.model_alias = model_alias
        self.pretrained_models = pretrained_models
        self.parser = argparse.ArgumentParser(
            prog="paddlespeech.vector", add_help=True)
...@@ -128,8 +107,8 @@ class VectorExecutor(BaseExecutor):
        Returns:
            bool:
                False: some audio failed to process
                True: all audio processed successfully
        """
        # stage 0: parse the args and get the required args
        parser_args = self.parser.parse_args(argv)
...@@ -289,32 +268,6 @@ class VectorExecutor(BaseExecutor):
        return res
def _get_pretrained_path(self, tag: str) -> os.PathLike:
"""get the neural network path from the pretrained model list
        we store all the pretrained models in the variable `pretrained_models`
Args:
tag (str): model tag in the pretrained model list
Returns:
os.PathLike: the downloaded pretrained model path in the disk
"""
support_models = list(pretrained_models.keys())
assert tag in pretrained_models, \
'The model "{}" you want to use has not been supported,'\
'please choose other models.\n' \
'The support models includes\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
res_path = os.path.join(MODEL_HOME, tag)
decompressed_path = download_and_decompress(pretrained_models[tag],
res_path)
decompressed_path = os.path.abspath(decompressed_path)
logger.info(
'Use pretrained model stored in: {}'.format(decompressed_path))
return decompressed_path
    def _init_from_path(self,
                        model_type: str='ecapatdnn_voxceleb12',
                        sample_rate: int=16000,
...@@ -350,10 +303,11 @@ class VectorExecutor(BaseExecutor):
            res_path = self._get_pretrained_path(tag)
            self.res_path = res_path
            self.cfg_path = os.path.join(
                res_path, self.pretrained_models[tag]['cfg_path'])
            self.ckpt_path = os.path.join(
                res_path,
                self.pretrained_models[tag]['ckpt_path'] + '.pdparams')
        else:
            # get the model from disk
            self.cfg_path = os.path.abspath(cfg_path)
...@@ -373,7 +327,7 @@ class VectorExecutor(BaseExecutor):
        logger.info("start to dynamic import the model class")
        model_name = model_type[:model_type.rindex('_')]
        logger.info(f"model name {model_name}")
        model_class = dynamic_import(model_name, self.model_alias)
        model_conf = self.config.model
        backbone = model_class(**model_conf)
        model = SpeakerIdetification(
...
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
pretrained_models = {
# The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
# e.g. "ecapatdnn_voxceleb12-16k".
# Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
# "paddlespeech vector --task spk --model ecapatdnn_voxceleb12-16k --sr 16000 --input ./input.wav"
"ecapatdnn_voxceleb12-16k": {
'url':
'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_2_0.tar.gz',
'md5':
'cc33023c54ab346cd318408f43fcaf95',
'cfg_path':
'conf/model.yaml', # the yaml config path
'ckpt_path':
'model/model', # the format is ${dir}/{model_name},
# so the first 'model' is dir, the second 'model' is the name
# this means we have a model stored as model/model.pdparams
},
}
model_alias = {
"ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
}
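
As a quick illustration of how these two tables interact (not part of the module): the CLI tag selects an entry in pretrained_models, and the prefix before the last underscore selects the alias:

tag = "ecapatdnn_voxceleb12-16k"
model_type = tag.split('-')[0]                      # "ecapatdnn_voxceleb12"
model_name = model_type[:model_type.rindex('_')]   # "ecapatdnn"
print(model_alias[model_name])
# -> paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn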
...@@ -14,92 +14,17 @@
import argparse
from pathlib import Path

import numpy
import soundfile as sf
from paddle import inference
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_am_output
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_predictor
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_voc_output
from paddlespeech.t2s.utils import str2bool
def get_predictor(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_name = full_name[:full_name.rindex('_')]
config = inference.Config(
str(Path(args.inference_dir) / (full_name + ".pdmodel")),
str(Path(args.inference_dir) / (full_name + ".pdiparams")))
if args.device == "gpu":
config.enable_use_gpu(100, 0)
elif args.device == "cpu":
config.disable_gpu()
config.enable_memory_optim()
predictor = inference.create_predictor(config)
return predictor
def get_am_output(args, am_predictor, frontend, merge_sentences, input):
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
am_input_names = am_predictor.get_input_names()
get_tone_ids = False
get_spk_id = False
if am_name == 'speedyspeech':
get_tone_ids = True
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
get_spk_id = True
spk_id = numpy.array([args.spk_id])
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
elif args.lang == 'en':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in {'zh', 'en'}!")
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
tones = tone_ids[0].numpy()
tones_handle = am_predictor.get_input_handle(am_input_names[1])
tones_handle.reshape(tones.shape)
tones_handle.copy_from_cpu(tones)
if get_spk_id:
spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
spk_id_handle.reshape(spk_id.shape)
spk_id_handle.copy_from_cpu(spk_id)
phones = phone_ids[0].numpy()
phones_handle = am_predictor.get_input_handle(am_input_names[0])
phones_handle.reshape(phones.shape)
phones_handle.copy_from_cpu(phones)
am_predictor.run()
am_output_names = am_predictor.get_output_names()
am_output_handle = am_predictor.get_output_handle(am_output_names[0])
am_output_data = am_output_handle.copy_to_cpu()
return am_output_data
def get_voc_output(args, voc_predictor, input):
voc_input_names = voc_predictor.get_input_names()
mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
mel_handle.reshape(input.shape)
mel_handle.copy_from_cpu(input)
voc_predictor.run()
voc_output_names = voc_predictor.get_output_names()
voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
wav = voc_output_handle.copy_to_cpu()
return wav
def parse_args():
    parser = argparse.ArgumentParser(
        description="Paddle Inference with acoustic model & vocoder.")
...@@ -204,7 +129,7 @@ def main():
            merge_sentences=merge_sentences,
            input=sentence)
        wav = get_voc_output(
            voc_predictor=voc_predictor, input=am_output_data)
        speed = wav.size / t.elapse
        rtf = fs / speed
        print(
...@@ -224,7 +149,7 @@ def main():
            merge_sentences=merge_sentences,
            input=sentence)
        wav = get_voc_output(
            voc_predictor=voc_predictor, input=am_output_data)
        N += wav.size
        T += t.elapse
...
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import numpy as np
import soundfile as sf
from timer import timer
from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_am_sublayer_output
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_predictor
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_output
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_predictor
from paddlespeech.t2s.exps.syn_utils import get_voc_output
from paddlespeech.t2s.utils import str2bool
def parse_args():
parser = argparse.ArgumentParser(
description="Paddle Infernce with acoustic model & vocoder.")
# acoustic model
parser.add_argument(
'--am',
type=str,
default='fastspeech2_csmsc',
choices=['fastspeech2_csmsc'],
help='Choose acoustic model type of tts task.')
parser.add_argument(
"--am_stat",
type=str,
default=None,
help="mean and standard deviation used to normalize spectrogram when training acoustic model."
)
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
parser.add_argument(
"--speaker_dict", type=str, default=None, help="speaker id map file.")
parser.add_argument(
'--spk_id',
type=int,
default=0,
help='spk id for multi speaker acoustic model')
# voc
parser.add_argument(
'--voc',
type=str,
default='pwgan_csmsc',
choices=['pwgan_csmsc', 'mb_melgan_csmsc', 'hifigan_csmsc'],
help='Choose vocoder type of tts task.')
# other
parser.add_argument(
'--lang',
type=str,
default='zh',
help='Choose model language. zh or en')
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument("--output_dir", type=str, help="output dir")
# inference
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu"],
help="Device selected for inference.", )
# streaming related
parser.add_argument(
"--am_streaming",
type=str2bool,
default=False,
help="whether use streaming acoustic model")
parser.add_argument(
"--chunk_size", type=int, default=42, help="chunk size of am streaming")
parser.add_argument(
"--pad_size", type=int, default=12, help="pad size of am streaming")
args, _ = parser.parse_known_args()
return args
# only inference for models trained with csmsc now
def main():
args = parse_args()
# frontend
frontend = get_frontend(args)
# am_predictor
am_encoder_infer_predictor, am_decoder_predictor, am_postnet_predictor = get_streaming_am_predictor(
args)
am_mu, am_std = np.load(args.am_stat)
# model: {model_name}_{dataset}
am_dataset = args.am[args.am.rindex('_') + 1:]
# voc_predictor
voc_predictor = get_predictor(args, filed='voc')
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
sentences = get_sentences(args)
merge_sentences = True
fs = 24000 if am_dataset != 'ljspeech' else 22050
# warmup
for utt_id, sentence in sentences[:3]:
with timer() as t:
normalized_mel = get_streaming_am_output(
args,
am_encoder_infer_predictor=am_encoder_infer_predictor,
am_decoder_predictor=am_decoder_predictor,
am_postnet_predictor=am_postnet_predictor,
frontend=frontend,
merge_sentences=merge_sentences,
input=sentence)
mel = denorm(normalized_mel, am_mu, am_std)
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
speed = wav.size / t.elapse
rtf = fs / speed
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
print("warm up done!")
N = 0
T = 0
chunk_size = args.chunk_size
pad_size = args.pad_size
get_tone_ids = False
for utt_id, sentence in sentences:
with timer() as t:
# frontend
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
sentence,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
else:
print("lang should be 'zh' here!")
phones = phone_ids[0].numpy()
# acoustic model
orig_hs = get_am_sublayer_output(
am_encoder_infer_predictor, input=phones)
if args.am_streaming:
hss = get_chunks(orig_hs, chunk_size, pad_size)
chunk_num = len(hss)
mel_list = []
for i, hs in enumerate(hss):
am_decoder_output = get_am_sublayer_output(
am_decoder_predictor, input=hs)
am_postnet_output = get_am_sublayer_output(
am_postnet_predictor,
input=np.transpose(am_decoder_output, (0, 2, 1)))
am_output_data = am_decoder_output + np.transpose(
am_postnet_output, (0, 2, 1))
normalized_mel = am_output_data[0]
sub_mel = denorm(normalized_mel, am_mu, am_std)
# clip output part of pad
if i == 0:
sub_mel = sub_mel[:-pad_size]
elif i == chunk_num - 1:
                        # the right side of the last chunk is definitely not padded enough
sub_mel = sub_mel[pad_size:]
else:
                        # the right side of the last few chunks may also not be padded enough
sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
sub_mel.shape[0]]
mel_list.append(sub_mel)
mel = np.concatenate(mel_list, axis=0)
else:
am_decoder_output = get_am_sublayer_output(
am_decoder_predictor, input=orig_hs)
am_postnet_output = get_am_sublayer_output(
am_postnet_predictor,
input=np.transpose(am_decoder_output, (0, 2, 1)))
am_output_data = am_decoder_output + np.transpose(
am_postnet_output, (0, 2, 1))
normalized_mel = am_output_data[0]
mel = denorm(normalized_mel, am_mu, am_std)
# vocoder
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
N += wav.size
T += t.elapse
speed = wav.size / t.elapse
rtf = fs / speed
sf.write(output_dir / (utt_id + ".wav"), wav, samplerate=24000)
print(
f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
print(f"{utt_id} done!")
print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
if __name__ == "__main__":
main()
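
To see how the pad trimming above lines up across chunk boundaries, here is a tiny worked check with the default chunk_size=42 and pad_size=12 (the data length of 100 is made up; for full-length middle chunks the slice [pad_size:(chunk_size + pad_size) - len] reduces to [12:-12]):

import math

chunk_size, pad_size, data_len = 42, 12, 100
n = math.ceil(data_len / chunk_size)  # 3 chunks
for i in range(n):
    start = max(0, i * chunk_size - pad_size)
    end = min((i + 1) * chunk_size + pad_size, data_len)
    print(i, start, end)
# chunk 0: frames  0..54,  keep [:-12]   -> mel frames  0..41
# chunk 1: frames 30..96,  keep [12:-12] -> mel frames 42..83
# chunk 2: frames 72..100, keep [12:]    -> mel frames 84..99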
...@@ -16,39 +16,14 @@ from pathlib import Path
import jsonlines
import numpy as np
import onnxruntime as ort
import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_sess
from paddlespeech.t2s.exps.syn_utils import get_test_dataset
from paddlespeech.t2s.utils import str2bool
def get_sess(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
sess = ort.InferenceSession(
model_dir, providers=providers, sess_options=sess_options)
return sess
def ort_predict(args):
    # construct dataset for evaluation
    with jsonlines.open(args.test_metadata, 'r') as reader:
...@@ -131,7 +106,7 @@ def parse_args():
        '--voc',
        type=str,
        default='hifigan_csmsc',
        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
...
...@@ -15,40 +15,15 @@ import argparse
from pathlib import Path

import numpy as np
import onnxruntime as ort
import soundfile as sf
from timer import timer

from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_sess
from paddlespeech.t2s.utils import str2bool
def get_sess(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
sess = ort.InferenceSession(
model_dir, providers=providers, sess_options=sess_options)
return sess
def ort_predict(args):
    # frontend
...@@ -156,7 +131,7 @@ def parse_args():
        '--voc',
        type=str,
        default='hifigan_csmsc',
        choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
        help='Choose vocoder type of tts task.')
    # other
    parser.add_argument(
...
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from pathlib import Path
import numpy as np
import soundfile as sf
from timer import timer
from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_sess
from paddlespeech.t2s.exps.syn_utils import get_streaming_am_sess
from paddlespeech.t2s.utils import str2bool
def ort_predict(args):
# frontend
frontend = get_frontend(args)
output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True)
sentences = get_sentences(args)
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
fs = 24000 if am_dataset != 'ljspeech' else 22050
# am
am_encoder_infer_sess, am_decoder_sess, am_postnet_sess = get_streaming_am_sess(
args)
am_mu, am_std = np.load(args.am_stat)
# vocoder
voc_sess = get_sess(args, filed='voc')
# frontend warmup
# Loading model cost 0.5+ seconds
if args.lang == 'zh':
frontend.get_input_ids("你好,欢迎使用飞桨框架进行深度学习研究!", merge_sentences=True)
else:
print("lang should in be 'zh' here!")
# am warmup
for T in [27, 38, 54]:
phone_ids = np.random.randint(1, 266, size=(T, ))
am_encoder_infer_sess.run(None, input_feed={'text': phone_ids})
am_decoder_input = np.random.rand(1, T * 15, 384).astype('float32')
am_decoder_sess.run(None, input_feed={'xs': am_decoder_input})
am_postnet_input = np.random.rand(1, 80, T * 15).astype('float32')
am_postnet_sess.run(None, input_feed={'xs': am_postnet_input})
# voc warmup
for T in [227, 308, 544]:
data = np.random.rand(T, 80).astype("float32")
voc_sess.run(None, input_feed={"logmel": data})
print("warm up done!")
N = 0
T = 0
merge_sentences = True
get_tone_ids = False
chunk_size = args.chunk_size
pad_size = args.pad_size
for utt_id, sentence in sentences:
with timer() as t:
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
sentence,
merge_sentences=merge_sentences,
get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in be 'zh' here!")
# merge_sentences=True here, so we only use the first item of phone_ids
phone_ids = phone_ids[0].numpy()
orig_hs = am_encoder_infer_sess.run(
None, input_feed={'text': phone_ids})
if args.am_streaming:
hss = get_chunks(orig_hs[0], chunk_size, pad_size)
chunk_num = len(hss)
mel_list = []
for i, hs in enumerate(hss):
am_decoder_output = am_decoder_sess.run(
None, input_feed={'xs': hs})
am_postnet_output = am_postnet_sess.run(
None,
input_feed={
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
})
am_output_data = am_decoder_output + np.transpose(
am_postnet_output[0], (0, 2, 1))
normalized_mel = am_output_data[0][0]
sub_mel = denorm(normalized_mel, am_mu, am_std)
# clip output part of pad
if i == 0:
sub_mel = sub_mel[:-pad_size]
elif i == chunk_num - 1:
                        # the right side of the last chunk is definitely not padded enough
sub_mel = sub_mel[pad_size:]
else:
                        # the right side of the last few chunks may also not be padded enough
sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
sub_mel.shape[0]]
mel_list.append(sub_mel)
mel = np.concatenate(mel_list, axis=0)
else:
am_decoder_output = am_decoder_sess.run(
None, input_feed={'xs': orig_hs[0]})
am_postnet_output = am_postnet_sess.run(
None,
input_feed={
'xs': np.transpose(am_decoder_output[0], (0, 2, 1))
})
am_output_data = am_decoder_output + np.transpose(
am_postnet_output[0], (0, 2, 1))
normalized_mel = am_output_data[0]
mel = denorm(normalized_mel, am_mu, am_std)
mel = mel[0]
# vocoder
wav = voc_sess.run(output_names=None, input_feed={'logmel': mel})
N += len(wav[0])
T += t.elapse
speed = len(wav[0]) / t.elapse
rtf = fs / speed
sf.write(
str(output_dir / (utt_id + ".wav")),
np.array(wav)[0],
samplerate=fs)
print(
f"{utt_id}, mel: {mel.shape}, wave: {len(wav[0])}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
)
print(f"generation speed: {N / T}Hz, RTF: {fs / (N / T) }")
def parse_args():
    parser = argparse.ArgumentParser(description="Inference with onnxruntime.")
# acoustic model
parser.add_argument(
'--am',
type=str,
default='fastspeech2_csmsc',
choices=['fastspeech2_csmsc'],
help='Choose acoustic model type of tts task.')
parser.add_argument(
"--am_stat",
type=str,
default=None,
help="mean and standard deviation used to normalize spectrogram when training acoustic model."
)
parser.add_argument(
"--phones_dict", type=str, default=None, help="phone vocabulary file.")
parser.add_argument(
"--tones_dict", type=str, default=None, help="tone vocabulary file.")
# voc
parser.add_argument(
'--voc',
type=str,
default='hifigan_csmsc',
choices=['hifigan_csmsc', 'mb_melgan_csmsc', 'pwgan_csmsc'],
help='Choose vocoder type of tts task.')
# other
parser.add_argument(
"--inference_dir", type=str, help="dir to save inference models")
parser.add_argument(
"--text",
type=str,
help="text to synthesize, a 'utt_id sentence' pair per line")
parser.add_argument("--output_dir", type=str, help="output dir")
parser.add_argument(
'--lang',
type=str,
default='zh',
help='Choose model language. zh or en')
# inference
parser.add_argument(
"--use_trt",
type=str2bool,
default=False,
help="Whether to use inference engin TensorRT.", )
parser.add_argument(
"--device",
default="gpu",
choices=["gpu", "cpu"],
help="Device selected for inference.", )
parser.add_argument('--cpu_threads', type=int, default=1)
# streaming related
parser.add_argument(
"--am_streaming",
type=str2bool,
default=False,
help="whether use streaming acoustic model")
parser.add_argument(
"--chunk_size", type=int, default=42, help="chunk size of am streaming")
parser.add_argument(
"--pad_size", type=int, default=12, help="pad size of am streaming")
args, _ = parser.parse_known_args()
return args
def main():
args = parse_args()
ort_predict(args)
if __name__ == "__main__":
main()
...@@ -11,10 +11,14 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import os
from pathlib import Path

import numpy as np
import onnxruntime as ort
import paddle
from paddle import inference
from paddle import jit
from paddle.static import InputSpec
...@@ -62,6 +66,21 @@ model_alias = {
}
def denorm(data, mean, std):
return data * std + mean
def get_chunks(data, chunk_size, pad_size):
data_len = data.shape[1]
chunks = []
n = math.ceil(data_len / chunk_size)
for i in range(n):
start = max(0, i * chunk_size - pad_size)
end = min((i + 1) * chunk_size + pad_size, data_len)
chunks.append(data[:, start:end, :])
return chunks
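
A quick usage sketch for the two helpers above (shapes follow the (batch, frames, channels) layout the streaming callers use; the values are made up and this is not part of the module):

hs = np.zeros((1, 100, 384), dtype='float32')       # e.g. encoder output
chunks = get_chunks(hs, chunk_size=42, pad_size=12)
print([c.shape[1] for c in chunks])                 # [54, 66, 28]

# denorm undoes the ZScore normalization: data * std + mean
mel = denorm(np.zeros((5, 80), dtype='float32'), mean=0.0, std=1.0)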
# input
def get_sentences(args):
    # construct dataset for evaluation
...@@ -241,3 +260,221 @@ def voc_to_static(args, voc_inference):
    paddle.jit.save(voc_inference, os.path.join(args.inference_dir, args.voc))
    voc_inference = paddle.jit.load(os.path.join(args.inference_dir, args.voc))
    return voc_inference
# inference
def get_predictor(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
config = inference.Config(
str(Path(args.inference_dir) / (full_name + ".pdmodel")),
str(Path(args.inference_dir) / (full_name + ".pdiparams")))
if args.device == "gpu":
config.enable_use_gpu(100, 0)
elif args.device == "cpu":
config.disable_gpu()
config.enable_memory_optim()
predictor = inference.create_predictor(config)
return predictor
def get_am_output(args, am_predictor, frontend, merge_sentences, input):
am_name = args.am[:args.am.rindex('_')]
am_dataset = args.am[args.am.rindex('_') + 1:]
am_input_names = am_predictor.get_input_names()
get_tone_ids = False
get_spk_id = False
if am_name == 'speedyspeech':
get_tone_ids = True
if am_dataset in {"aishell3", "vctk"} and args.speaker_dict:
get_spk_id = True
spk_id = np.array([args.spk_id])
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
elif args.lang == 'en':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences)
phone_ids = input_ids["phone_ids"]
else:
print("lang should in {'zh', 'en'}!")
if get_tone_ids:
tone_ids = input_ids["tone_ids"]
tones = tone_ids[0].numpy()
tones_handle = am_predictor.get_input_handle(am_input_names[1])
tones_handle.reshape(tones.shape)
tones_handle.copy_from_cpu(tones)
if get_spk_id:
spk_id_handle = am_predictor.get_input_handle(am_input_names[1])
spk_id_handle.reshape(spk_id.shape)
spk_id_handle.copy_from_cpu(spk_id)
phones = phone_ids[0].numpy()
phones_handle = am_predictor.get_input_handle(am_input_names[0])
phones_handle.reshape(phones.shape)
phones_handle.copy_from_cpu(phones)
am_predictor.run()
am_output_names = am_predictor.get_output_names()
am_output_handle = am_predictor.get_output_handle(am_output_names[0])
am_output_data = am_output_handle.copy_to_cpu()
return am_output_data
def get_voc_output(voc_predictor, input):
voc_input_names = voc_predictor.get_input_names()
mel_handle = voc_predictor.get_input_handle(voc_input_names[0])
mel_handle.reshape(input.shape)
mel_handle.copy_from_cpu(input)
voc_predictor.run()
voc_output_names = voc_predictor.get_output_names()
voc_output_handle = voc_predictor.get_output_handle(voc_output_names[0])
wav = voc_output_handle.copy_to_cpu()
return wav
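
Taken together, the non-streaming Paddle Inference path wires these helpers up roughly as follows (a caller-side sketch; args, frontend, and sentence come from the surrounding script, and the output filename is arbitrary):

import soundfile as sf

am_predictor = get_predictor(args, filed='am')
voc_predictor = get_predictor(args, filed='voc')
mel = get_am_output(
    args, am_predictor, frontend, merge_sentences=True, input=sentence)
wav = get_voc_output(voc_predictor=voc_predictor, input=mel)
sf.write("demo.wav", wav, samplerate=24000)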
# streaming am
def get_streaming_am_predictor(args):
full_name = args.am
am_encoder_infer_config = inference.Config(
str(
Path(args.inference_dir) /
(full_name + "_am_encoder_infer" + ".pdmodel")),
str(
Path(args.inference_dir) /
(full_name + "_am_encoder_infer" + ".pdiparams")))
am_decoder_config = inference.Config(
str(
Path(args.inference_dir) /
(full_name + "_am_decoder" + ".pdmodel")),
str(
Path(args.inference_dir) /
(full_name + "_am_decoder" + ".pdiparams")))
am_postnet_config = inference.Config(
str(
Path(args.inference_dir) /
(full_name + "_am_postnet" + ".pdmodel")),
str(
Path(args.inference_dir) /
(full_name + "_am_postnet" + ".pdiparams")))
if args.device == "gpu":
am_encoder_infer_config.enable_use_gpu(100, 0)
am_decoder_config.enable_use_gpu(100, 0)
am_postnet_config.enable_use_gpu(100, 0)
elif args.device == "cpu":
am_encoder_infer_config.disable_gpu()
am_decoder_config.disable_gpu()
am_postnet_config.disable_gpu()
am_encoder_infer_config.enable_memory_optim()
am_decoder_config.enable_memory_optim()
am_postnet_config.enable_memory_optim()
am_encoder_infer_predictor = inference.create_predictor(
am_encoder_infer_config)
am_decoder_predictor = inference.create_predictor(am_decoder_config)
am_postnet_predictor = inference.create_predictor(am_postnet_config)
return am_encoder_infer_predictor, am_decoder_predictor, am_postnet_predictor
def get_am_sublayer_output(am_sublayer_predictor, input):
am_sublayer_input_names = am_sublayer_predictor.get_input_names()
input_handle = am_sublayer_predictor.get_input_handle(
am_sublayer_input_names[0])
input_handle.reshape(input.shape)
input_handle.copy_from_cpu(input)
am_sublayer_predictor.run()
am_sublayer_names = am_sublayer_predictor.get_output_names()
am_sublayer_handle = am_sublayer_predictor.get_output_handle(
am_sublayer_names[0])
am_sublayer_output = am_sublayer_handle.copy_to_cpu()
return am_sublayer_output
def get_streaming_am_output(args, am_encoder_infer_predictor,
am_decoder_predictor, am_postnet_predictor,
frontend, merge_sentences, input):
get_tone_ids = False
if args.lang == 'zh':
input_ids = frontend.get_input_ids(
input, merge_sentences=merge_sentences, get_tone_ids=get_tone_ids)
phone_ids = input_ids["phone_ids"]
else:
print("lang should be 'zh' here!")
phones = phone_ids[0].numpy()
am_encoder_infer_output = get_am_sublayer_output(
am_encoder_infer_predictor, input=phones)
am_decoder_output = get_am_sublayer_output(
am_decoder_predictor, input=am_encoder_infer_output)
am_postnet_output = get_am_sublayer_output(
am_postnet_predictor, input=np.transpose(am_decoder_output, (0, 2, 1)))
am_output_data = am_decoder_output + np.transpose(am_postnet_output,
(0, 2, 1))
normalized_mel = am_output_data[0]
return normalized_mel
def get_sess(args, filed='am'):
full_name = ''
if filed == 'am':
full_name = args.am
elif filed == 'voc':
full_name = args.voc
model_dir = str(Path(args.inference_dir) / (full_name + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
sess = ort.InferenceSession(
model_dir, providers=providers, sess_options=sess_options)
return sess
# streaming am
def get_streaming_am_sess(args):
full_name = args.am
am_encoder_infer_model_dir = str(
Path(args.inference_dir) / (full_name + "_am_encoder_infer" + ".onnx"))
am_decoder_model_dir = str(
Path(args.inference_dir) / (full_name + "_am_decoder" + ".onnx"))
am_postnet_model_dir = str(
Path(args.inference_dir) / (full_name + "_am_postnet" + ".onnx"))
sess_options = ort.SessionOptions()
sess_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
sess_options.execution_mode = ort.ExecutionMode.ORT_SEQUENTIAL
if args.device == "gpu":
# fastspeech2/mb_melgan can't use trt now!
if args.use_trt:
providers = ['TensorrtExecutionProvider']
else:
providers = ['CUDAExecutionProvider']
elif args.device == "cpu":
providers = ['CPUExecutionProvider']
sess_options.intra_op_num_threads = args.cpu_threads
am_encoder_infer_sess = ort.InferenceSession(
am_encoder_infer_model_dir,
providers=providers,
sess_options=sess_options)
am_decoder_sess = ort.InferenceSession(
am_decoder_model_dir, providers=providers, sess_options=sess_options)
am_postnet_sess = ort.InferenceSession(
am_postnet_model_dir, providers=providers, sess_options=sess_options)
return am_encoder_infer_sess, am_decoder_sess, am_postnet_sess
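# The three ONNX sessions above mirror the three Paddle predictors used by
# get_streaming_am_output(). A minimal sketch of chaining them, assuming each
# session takes a single tensor and returns a single tensor (the function and
# variable names here are illustrative, not part of the released API):
def get_streaming_am_output_onnx(am_encoder_infer_sess, am_decoder_sess,
                                 am_postnet_sess, phones):
    # phones: int64 phone-id array produced by the frontend
    enc_in = am_encoder_infer_sess.get_inputs()[0].name
    orig_hs = am_encoder_infer_sess.run(None, {enc_in: phones})[0]
    dec_in = am_decoder_sess.get_inputs()[0].name
    am_decoder_output = am_decoder_sess.run(None, {dec_in: orig_hs})[0]
    post_in = am_postnet_sess.get_inputs()[0].name
    am_postnet_output = am_postnet_sess.run(
        None, {post_in: np.transpose(am_decoder_output, (0, 2, 1))})[0]
    # residual connection, same as in the static-graph path
    am_output_data = am_decoder_output + np.transpose(am_postnet_output,
                                                      (0, 2, 1))
    return am_output_data[0]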
...@@ -12,39 +12,29 @@ ...@@ -12,39 +12,29 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse import argparse
import math import os
from pathlib import Path from pathlib import Path
import numpy as np import numpy as np
import paddle import paddle
import soundfile as sf import soundfile as sf
import yaml import yaml
from paddle import jit
from paddle.static import InputSpec
from timer import timer from timer import timer
from yacs.config import CfgNode from yacs.config import CfgNode
from paddlespeech.s2t.utils.dynamic_import import dynamic_import from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.t2s.exps.syn_utils import denorm
from paddlespeech.t2s.exps.syn_utils import get_chunks
from paddlespeech.t2s.exps.syn_utils import get_frontend from paddlespeech.t2s.exps.syn_utils import get_frontend
from paddlespeech.t2s.exps.syn_utils import get_sentences from paddlespeech.t2s.exps.syn_utils import get_sentences
from paddlespeech.t2s.exps.syn_utils import get_voc_inference from paddlespeech.t2s.exps.syn_utils import get_voc_inference
from paddlespeech.t2s.exps.syn_utils import model_alias from paddlespeech.t2s.exps.syn_utils import model_alias
from paddlespeech.t2s.exps.syn_utils import voc_to_static
from paddlespeech.t2s.utils import str2bool from paddlespeech.t2s.utils import str2bool
def denorm(data, mean, std):
return data * std + mean
def get_chunks(data, chunk_size, pad_size):
data_len = data.shape[1]
chunks = []
n = math.ceil(data_len / chunk_size)
for i in range(n):
start = max(0, i * chunk_size - pad_size)
end = min((i + 1) * chunk_size + pad_size, data_len)
chunks.append(data[:, start:end, :])
return chunks
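# Worked example: chunk_size=3 and pad_size=1 on a length-7 sequence yield
# slices [0:4], [2:7] and [5:7] along axis 1, so every chunk carries up to
# pad_size frames of left/right context for the streaming decoder.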
def evaluate(args): def evaluate(args):
# Init body. # Init body.
...@@ -84,9 +74,49 @@ def evaluate(args): ...@@ -84,9 +74,49 @@ def evaluate(args):
am_mu = paddle.to_tensor(am_mu) am_mu = paddle.to_tensor(am_mu)
am_std = paddle.to_tensor(am_std) am_std = paddle.to_tensor(am_std)
# am sub layers
am_encoder_infer = am.encoder_infer
am_decoder = am.decoder
am_postnet = am.postnet
# vocoder # vocoder
voc_inference = get_voc_inference(args, voc_config) voc_inference = get_voc_inference(args, voc_config)
# whether to convert dygraph to static graph
if args.inference_dir:
# fastspeech2 cnndecoder to static
# am.encoder_infer
am_encoder_infer = jit.to_static(
am_encoder_infer, input_spec=[InputSpec([-1], dtype=paddle.int64)])
paddle.jit.save(am_encoder_infer,
os.path.join(args.inference_dir,
args.am + "_am_encoder_infer"))
am_encoder_infer = paddle.jit.load(
os.path.join(args.inference_dir, args.am + "_am_encoder_infer"))
# am.decoder
am_decoder = jit.to_static(
am_decoder,
input_spec=[InputSpec([1, -1, 384], dtype=paddle.float32)])
paddle.jit.save(am_decoder,
os.path.join(args.inference_dir,
args.am + "_am_decoder"))
am_decoder = paddle.jit.load(
os.path.join(args.inference_dir, args.am + "_am_decoder"))
# am.postnet
am_postnet = jit.to_static(
am_postnet,
input_spec=[InputSpec([1, 80, -1], dtype=paddle.float32)])
paddle.jit.save(am_postnet,
os.path.join(args.inference_dir,
args.am + "_am_postnet"))
am_postnet = paddle.jit.load(
os.path.join(args.inference_dir, args.am + "_am_postnet"))
# vocoder
voc_inference = voc_to_static(args, voc_inference)
output_dir = Path(args.output_dir) output_dir = Path(args.output_dir)
output_dir.mkdir(parents=True, exist_ok=True) output_dir.mkdir(parents=True, exist_ok=True)
merge_sentences = True merge_sentences = True
...@@ -107,20 +137,19 @@ def evaluate(args): ...@@ -107,20 +137,19 @@ def evaluate(args):
phone_ids = input_ids["phone_ids"] phone_ids = input_ids["phone_ids"]
else: else:
print("lang should in be 'zh' here!") print("lang should be 'zh' here!")
# merge_sentences=True here, so we only use the first item of phone_ids # merge_sentences=True here, so we only use the first item of phone_ids
phone_ids = phone_ids[0] phone_ids = phone_ids[0]
with paddle.no_grad(): with paddle.no_grad():
# acoustic model # acoustic model
orig_hs, h_masks = am.encoder_infer(phone_ids) orig_hs = am_encoder_infer(phone_ids)
if args.am_streaming: if args.am_streaming:
hss = get_chunks(orig_hs, chunk_size, pad_size) hss = get_chunks(orig_hs, chunk_size, pad_size)
chunk_num = len(hss) chunk_num = len(hss)
mel_list = [] mel_list = []
for i, hs in enumerate(hss): for i, hs in enumerate(hss):
before_outs, _ = am.decoder(hs) before_outs = am_decoder(hs)
after_outs = before_outs + am.postnet( after_outs = before_outs + am_postnet(
before_outs.transpose((0, 2, 1))).transpose( before_outs.transpose((0, 2, 1))).transpose(
(0, 2, 1)) (0, 2, 1))
normalized_mel = after_outs[0] normalized_mel = after_outs[0]
...@@ -139,8 +168,8 @@ def evaluate(args): ...@@ -139,8 +168,8 @@ def evaluate(args):
mel = paddle.concat(mel_list, axis=0) mel = paddle.concat(mel_list, axis=0)
else: else:
before_outs, _ = am.decoder(orig_hs) before_outs = am_decoder(orig_hs)
after_outs = before_outs + am.postnet( after_outs = before_outs + am_postnet(
before_outs.transpose((0, 2, 1))).transpose((0, 2, 1)) before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
normalized_mel = after_outs[0] normalized_mel = after_outs[0]
mel = denorm(normalized_mel, am_mu, am_std) mel = denorm(normalized_mel, am_mu, am_std)
...@@ -201,16 +230,9 @@ def parse_args(): ...@@ -201,16 +230,9 @@ def parse_args():
default='pwgan_csmsc', default='pwgan_csmsc',
choices=[ choices=[
'pwgan_csmsc', 'pwgan_csmsc',
'pwgan_ljspeech',
'pwgan_aishell3',
'pwgan_vctk',
'mb_melgan_csmsc', 'mb_melgan_csmsc',
'style_melgan_csmsc', 'style_melgan_csmsc',
'hifigan_csmsc', 'hifigan_csmsc',
'hifigan_ljspeech',
'hifigan_aishell3',
'hifigan_vctk',
'wavernn_csmsc',
], ],
help='Choose vocoder type of tts task.') help='Choose vocoder type of tts task.')
parser.add_argument( parser.add_argument(
...@@ -233,13 +255,19 @@ def parse_args(): ...@@ -233,13 +255,19 @@ def parse_args():
default='zh', default='zh',
help='Choose model language. zh or en') help='Choose model language. zh or en')
parser.add_argument(
"--inference_dir",
type=str,
default=None,
help="dir to save inference models")
parser.add_argument( parser.add_argument(
"--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.") "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
parser.add_argument( parser.add_argument(
"--text", "--text",
type=str, type=str,
help="text to synthesize, a 'utt_id sentence' pair per line.") help="text to synthesize, a 'utt_id sentence' pair per line.")
# streaming related
parser.add_argument( parser.add_argument(
"--am_streaming", "--am_streaming",
type=str2bool, type=str2bool,
......
...@@ -590,15 +590,17 @@ class FastSpeech2(nn.Layer): ...@@ -590,15 +590,17 @@ class FastSpeech2(nn.Layer):
h_masks = self._source_mask(olens_in) h_masks = self._source_mask(olens_in)
else: else:
h_masks = None h_masks = None
if return_after_enc: if return_after_enc:
return hs, h_masks return hs, h_masks
# (B, Lmax, adim)
zs, _ = self.decoder(hs, h_masks)
# (B, Lmax, odim)
if self.decoder_type == 'cnndecoder': if self.decoder_type == 'cnndecoder':
# remove output masks for dygraph to static graph
zs = self.decoder(hs, h_masks)
before_outs = zs before_outs = zs
else: else:
# (B, Lmax, adim)
zs, _ = self.decoder(hs, h_masks)
# (B, Lmax, odim)
before_outs = self.feat_out(zs).reshape( before_outs = self.feat_out(zs).reshape(
(paddle.shape(zs)[0], -1, self.odim)) (paddle.shape(zs)[0], -1, self.odim))
...@@ -633,7 +635,8 @@ class FastSpeech2(nn.Layer): ...@@ -633,7 +635,8 @@ class FastSpeech2(nn.Layer):
tone_id = tone_id.unsqueeze(0) tone_id = tone_id.unsqueeze(0)
# (1, L, odim) # (1, L, odim)
hs, h_masks = self._forward( # use *_ to avoid bug in dygraph to static graph
hs, *_ = self._forward(
xs, xs,
ilens, ilens,
is_inference=True, is_inference=True,
...@@ -642,7 +645,7 @@ class FastSpeech2(nn.Layer): ...@@ -642,7 +645,7 @@ class FastSpeech2(nn.Layer):
spk_emb=spk_emb, spk_emb=spk_emb,
spk_id=spk_id, spk_id=spk_id,
tone_id=tone_id) tone_id=tone_id)
return hs, h_masks return hs
def inference( def inference(
self, self,
......
...@@ -602,7 +602,7 @@ class CNNDecoder(nn.Layer): ...@@ -602,7 +602,7 @@ class CNNDecoder(nn.Layer):
if masks is not None: if masks is not None:
outputs = outputs * masks outputs = outputs * masks
outputs = outputs.transpose([0, 2, 1]) outputs = outputs.transpose([0, 2, 1])
return outputs, masks return outputs
class CNNPostnet(nn.Layer): class CNNPostnet(nn.Layer):
......
# Copyright (c) 2022 SpeechBrain Authors. All Rights Reserved. # Copyright (c) 2022 PaddlePaddle and SpeechBrain Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -18,12 +18,14 @@ This script has an optional dependency on open source sklearn library. ...@@ -18,12 +18,14 @@ This script has an optional dependency on open source sklearn library.
A few sklearn functions are modified in this script as per requirement. A few sklearn functions are modified in this script as per requirement.
""" """
import argparse import argparse
import copy
import warnings import warnings
from distutils.util import strtobool from distutils.util import strtobool
import numpy as np import numpy as np
import scipy import scipy
import sklearn import sklearn
from scipy import linalg
from scipy import sparse from scipy import sparse
from scipy.sparse.csgraph import connected_components from scipy.sparse.csgraph import connected_components
from scipy.sparse.csgraph import laplacian as csgraph_laplacian from scipy.sparse.csgraph import laplacian as csgraph_laplacian
...@@ -346,6 +348,8 @@ class EmbeddingMeta: ...@@ -346,6 +348,8 @@ class EmbeddingMeta:
--------- ---------
segset : list segset : list
List of session IDs as an array of strings. List of session IDs as an array of strings.
modelset : list
List of model IDs as an array of strings.
stats : tensor stats : tensor
An ndarray of float64. Each line contains embedding An ndarray of float64. Each line contains embedding
from the corresponding session. from the corresponding session.
...@@ -354,15 +358,20 @@ class EmbeddingMeta: ...@@ -354,15 +358,20 @@ class EmbeddingMeta:
def __init__( def __init__(
self, self,
segset=None, segset=None,
modelset=None,
stats=None, ): stats=None, ):
if segset is None: if segset is None:
self.segset = numpy.empty(0, dtype="|O") self.segset = np.empty(0, dtype="|O")
self.stats = numpy.array([], dtype=np.float64) self.modelset = np.empty(0, dtype="|O")
self.stats = np.array([], dtype=np.float64)
else: else:
self.segset = segset self.segset = segset
self.modelset = modelset
self.stats = stats self.stats = stats
self.stat0 = np.array([[1.0]] * self.stats.shape[0])
def norm_stats(self): def norm_stats(self):
""" """
Divide all first-order statistics by their Euclidean norm. Divide all first-order statistics by their Euclidean norm.
...@@ -371,6 +380,188 @@ class EmbeddingMeta: ...@@ -371,6 +380,188 @@ class EmbeddingMeta:
vect_norm = np.clip(np.linalg.norm(self.stats, axis=1), 1e-08, np.inf) vect_norm = np.clip(np.linalg.norm(self.stats, axis=1), 1e-08, np.inf)
self.stats = (self.stats.transpose() / vect_norm).transpose() self.stats = (self.stats.transpose() / vect_norm).transpose()
def get_mean_stats(self):
"""
Return the mean of first order statistics.
"""
mu = np.mean(self.stats, axis=0)
return mu
    def get_total_covariance_stats(self):
        """
        Compute and return the total covariance matrix of the first-order
        statistics, i.e. Sigma = (X - mu)^T (X - mu) / N, where X is the
        stats matrix and mu its column-wise mean.
        """
        C = self.stats - self.stats.mean(axis=0)
        return np.dot(C.transpose(), C) / self.stats.shape[0]
def get_model_stat0(self, mod_id):
"""Return zero-order statistics of a given model
Arguments
---------
mod_id : str
            ID of the model whose stat0 will be returned.
"""
S = self.stat0[self.modelset == mod_id, :]
return S
def get_model_stats(self, mod_id):
"""Return first-order statistics of a given model.
Arguments
---------
mod_id : str
            ID of the model whose first-order statistics will be returned.
"""
return self.stats[self.modelset == mod_id, :]
def sum_stat_per_model(self):
"""
Sum the zero- and first-order statistics per model and store them
in a new EmbeddingMeta.
        Returns an EmbeddingMeta object with the statistics summed per model
        and a numpy array with the number of sessions per model.
"""
sts_per_model = EmbeddingMeta()
sts_per_model.modelset = np.unique(
self.modelset) # nd: get uniq spkr ids
sts_per_model.segset = copy.deepcopy(sts_per_model.modelset)
sts_per_model.stat0 = np.zeros(
(sts_per_model.modelset.shape[0], self.stat0.shape[1]),
dtype=np.float64, )
sts_per_model.stats = np.zeros(
(sts_per_model.modelset.shape[0], self.stats.shape[1]),
dtype=np.float64, )
session_per_model = np.zeros(np.unique(self.modelset).shape[0])
# For each model sum the stats
for idx, model in enumerate(sts_per_model.modelset):
sts_per_model.stat0[idx, :] = self.get_model_stat0(model).sum(
axis=0)
sts_per_model.stats[idx, :] = self.get_model_stats(model).sum(
axis=0)
session_per_model[idx] += self.get_model_stats(model).shape[0]
return sts_per_model, session_per_model
def center_stats(self, mu):
"""
Center first order statistics.
Arguments
---------
mu : array
Array to center on.
"""
        # integer division: the stats dim is a whole multiple of the stat0 dim
        dim = self.stats.shape[1] // self.stat0.shape[1]
index_map = np.repeat(np.arange(self.stat0.shape[1]), dim)
self.stats = self.stats - (self.stat0[:, index_map] *
mu.astype(np.float64))
def rotate_stats(self, R):
"""
Rotate first-order statistics by a right-product.
Arguments
---------
R : ndarray
Matrix to use for right product on the first order statistics.
"""
self.stats = np.dot(self.stats, R)
def whiten_stats(self, mu, sigma, isSqrInvSigma=False):
"""
Whiten first-order statistics
If sigma.ndim == 1, case of a diagonal covariance.
If sigma.ndim == 2, case of a single Gaussian with full covariance.
If sigma.ndim == 3, case of a full covariance UBM.
Arguments
---------
mu : array
Mean vector to be subtracted from the statistics.
        sigma : ndarray
            Covariance matrix or covariance super-vector.
isSqrInvSigma : bool
True if the input Sigma matrix is the inverse of the square root of a covariance matrix.
"""
if sigma.ndim == 1:
self.center_stats(mu)
self.stats = self.stats / np.sqrt(sigma.astype(np.float64))
elif sigma.ndim == 2:
# Compute the inverse square root of the co-variance matrix Sigma
sqr_inv_sigma = sigma
if not isSqrInvSigma:
# eigen_values, eigen_vectors = scipy.linalg.eigh(sigma)
eigen_values, eigen_vectors = linalg.eigh(sigma)
ind = eigen_values.real.argsort()[::-1]
eigen_values = eigen_values.real[ind]
eigen_vectors = eigen_vectors.real[:, ind]
sqr_inv_eval_sigma = 1 / np.sqrt(eigen_values.real)
sqr_inv_sigma = np.dot(eigen_vectors,
np.diag(sqr_inv_eval_sigma))
else:
pass
# Whitening of the first-order statistics
self.center_stats(mu) # CENTERING
self.rotate_stats(sqr_inv_sigma)
elif sigma.ndim == 3:
# we assume that sigma is a 3D ndarray of size D x n x n
# where D is the number of distributions and n is the dimension of a single distribution
n = self.stats.shape[1] // self.stat0.shape[1]
sess_nb = self.stat0.shape[0]
self.center_stats(mu)
self.stats = (np.einsum("ikj,ikl->ilj",
self.stats.T.reshape(-1, n, sess_nb), sigma)
.reshape(-1, sess_nb).T)
else:
            raise Exception("Wrong dimension of Sigma, must be 1, 2 or 3")
def align_models(self, model_list):
"""
Align models of the current EmbeddingMeta to match a list of models
        provided as an input parameter. The size of the EmbeddingMeta might be
        reduced to match the input list of models.
Arguments
---------
model_list : ndarray of strings
List of models to match.
"""
indx = np.array(
[np.argwhere(self.modelset == v)[0][0] for v in model_list])
self.segset = self.segset[indx]
self.modelset = self.modelset[indx]
self.stat0 = self.stat0[indx, :]
self.stats = self.stats[indx, :]
def align_segments(self, segment_list):
"""
        Align segments of the current EmbeddingMeta to match a list of segments
        provided as an input parameter. The size of the EmbeddingMeta might be
        reduced to match the input list of segments.
Arguments
---------
segment_list: ndarray of strings
list of segments to match
"""
indx = np.array(
[np.argwhere(self.segset == v)[0][0] for v in segment_list])
self.segset = self.segset[indx]
self.modelset = self.modelset[indx]
self.stat0 = self.stat0[indx, :]
self.stats = self.stats[indx, :]
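# A minimal usage sketch (synthetic values; the embedding dim is illustrative):
#   segs = np.array(["utt1", "utt2", "utt3"], dtype="|O")
#   spks = np.array(["spk1", "spk1", "spk2"], dtype="|O")
#   emb = np.random.rand(3, 192)                   # one embedding per session
#   meta = EmbeddingMeta(segset=segs, modelset=spks, stats=emb)
#   meta.norm_stats()                              # unit-normalize each row
#   per_spk, n_sess = meta.sum_stat_per_model()    # stats summed per speaker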
class SpecClustUnorm: class SpecClustUnorm:
""" """
......
This diff has been collapsed.
...@@ -26,14 +26,14 @@ from paddleaudio.compliance.librosa import mfcc ...@@ -26,14 +26,14 @@ from paddleaudio.compliance.librosa import mfcc
class meta_info: class meta_info:
"""the audio meta info in the vector JSONDataset """the audio meta info in the vector JSONDataset
Args: Args:
id (str): the segment name utt_id (str): the segment name
duration (float): segment time duration (float): segment time
wav (str): wav file path wav (str): wav file path
start (int): start point in the original wav file start (int): start point in the original wav file
stop (int): stop point in the original wav file stop (int): stop point in the original wav file
lab_id (str): the record id lab_id (str): the record id
""" """
id: str utt_id: str
duration: float duration: float
wav: str wav: str
start: int start: int
......
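# Illustrative construction of one entry (hypothetical values; assumes
# meta_info is the @dataclass defined in this file):
#   info = meta_info(utt_id="BAC009S0002W0122", duration=2.5,
#                    wav="data/wav/BAC009S0002W0122.wav",
#                    start=0, stop=40000, lab_id="S0002")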
...@@ -42,6 +42,7 @@ base = [ ...@@ -42,6 +42,7 @@ base = [
"loguru", "loguru",
"matplotlib", "matplotlib",
"nara_wpe", "nara_wpe",
"onnxruntime",
"pandas", "pandas",
"paddleaudio", "paddleaudio",
"paddlenlp", "paddlenlp",
...@@ -64,6 +65,7 @@ base = [ ...@@ -64,6 +65,7 @@ base = [
"webrtcvad", "webrtcvad",
"yacs~=0.1.8", "yacs~=0.1.8",
"prettytable", "prettytable",
"zhon",
] ]
server = [ server = [
...@@ -90,7 +92,6 @@ requirements = { ...@@ -90,7 +92,6 @@ requirements = {
"unidecode", "unidecode",
"yq", "yq",
"pre-commit", "pre-commit",
"zhon",
] ]
} }
......
# Examples for SpeechX # Examples for SpeechX
* dev - for speechx developer, using for test.
* ngram - using to build NGram ARPA lm.
* ds2_ol - ds2 streaming test under `aishell-1` test dataset. * ds2_ol - ds2 streaming test under `aishell-1` test dataset.
The entrypoint is `ds2_ol/aishell/run.sh` The entrypoint is `ds2_ol/aishell/run.sh`
## How to run ## How to run
`run.sh` is the entry point. `run.sh` is the entry point.
...@@ -17,9 +15,23 @@ pushd ds2_ol/aishell ...@@ -17,9 +15,23 @@ pushd ds2_ol/aishell
bash run.sh bash run.sh
``` ```
## Display Model with [Netron](https://github.com/lutzroeder/netron) ## Display Model with [Netron](https://github.com/lutzroeder/netron)
``` ```
pip install netron pip install netron
netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20 netron exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel --port 8022 --host 10.21.55.20
``` ```
## For Developers
> Warning: For developers only; make sure you know what you are doing.
* dev - for speechx developers, used for testing.
## Build WFST
> Warning: Use the examples below only when you know what they do.
* text_lm - preprocess text for building the LM
* ngram - used to build the n-gram ARPA LM
* wfst - build the TLG WFST for decoding
...@@ -10,12 +10,18 @@ Other -> 0.00 % N=0 C=0 S=0 D=0 I=0 ...@@ -10,12 +10,18 @@ Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
## CTC Prefix Beam Search w LM ## CTC Prefix Beam Search w LM
LM: zh_giga.no_cna_cmn.prune01244.klm
``` ```
Overall -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
Mandarin -> 7.86 % N=104768 C=96865 S=7573 D=330 I=327
Other -> 0.00 % N=0 C=0 S=0 D=0 I=0
``` ```
## CTC WFST ## CTC WFST
LM: trained on the aishell train-set transcripts
```
Overall -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1819
Mandarin -> 11.14 % N=103017 C=93363 S=9583 D=71 I=1818
Other -> 0.00 % N=0 C=0 S=0 D=0 I=1
``` ```
```
\ No newline at end of file
...@@ -11,4 +11,4 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin ...@@ -11,4 +11,4 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_ALL=C export LC_ALL=C
SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat SPEECHX_BIN=$SPEECHX_EXAMPLES/ds2_ol/decoder:$SPEECHX_EXAMPLES/ds2_ol/feat
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
\ No newline at end of file
...@@ -5,7 +5,10 @@ set -e ...@@ -5,7 +5,10 @@ set -e
. path.sh . path.sh
nj=40 nj=40
stage=0
stop_stage=100
. utils/parse_options.sh
# 1. compile # 1. compile
if [ ! -d ${SPEECHX_EXAMPLES} ]; then if [ ! -d ${SPEECHX_EXAMPLES} ]; then
...@@ -26,102 +29,112 @@ vocb_dir=$ckpt_dir/data/lang_char/ ...@@ -26,102 +29,112 @@ vocb_dir=$ckpt_dir/data/lang_char/
mkdir -p exp mkdir -p exp
exp=$PWD/exp exp=$PWD/exp
aishell_wav_scp=aishell_test.scp if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
if [ ! -d $data/test ]; then aishell_wav_scp=aishell_test.scp
pushd $data if [ ! -d $data/test ]; then
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip pushd $data
unzip aishell_test.zip wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_test.zip
popd unzip aishell_test.zip
popd
realpath $data/test/*/*.wav > $data/wavlist
awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id realpath $data/test/*/*.wav > $data/wavlist
paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp awk -F '/' '{ print $(NF) }' $data/wavlist | awk -F '.' '{ print $1 }' > $data/utt_id
fi paste $data/utt_id $data/wavlist > $data/$aishell_wav_scp
fi
if [ ! -d $ckpt_dir ]; then
mkdir -p $ckpt_dir if [ ! -d $ckpt_dir ]; then
wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz mkdir -p $ckpt_dir
tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir wget -P $ckpt_dir -c https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz
fi tar xzfv $model_dir/asr0_deepspeech2_online_aishell_ckpt_0.2.0.model.tar.gz -C $ckpt_dir
fi
lm=$data/zh_giga.no_cna_cmn.prune01244.klm
if [ ! -f $lm ]; then lm=$data/zh_giga.no_cna_cmn.prune01244.klm
pushd $data if [ ! -f $lm ]; then
wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm pushd $data
popd wget -c https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm
popd
fi
fi fi
# 3. make feature # 3. make feature
text=$data/test/text
label_file=./aishell_result label_file=./aishell_result
wer=./aishell_wer wer=./aishell_wer
export GLOG_logtostderr=1 export GLOG_logtostderr=1
# 3. gen linear feat
cmvn=$PWD/cmvn.ark
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
# 3. gen linear feat
cmvn=$data/cmvn.ark
cmvn-json2kaldi --json_file=$ckpt_dir/data/mean_std.json --cmvn_write_path=$cmvn
./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj ./local/split_data.sh $data $data/$aishell_wav_scp $aishell_wav_scp $nj
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/feat.log \
linear-spectrogram-wo-db-norm-ol \ linear-spectrogram-wo-db-norm-ol \
--wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \ --wav_rspecifier=scp:$data/split${nj}/JOB/${aishell_wav_scp} \
--feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \ --feature_wspecifier=ark,scp:$data/split${nj}/JOB/feat.ark,$data/split${nj}/JOB/feat.scp \
--cmvn_file=$cmvn \ --cmvn_file=$cmvn \
--streaming_chunk=0.36 --streaming_chunk=0.36
fi
text=$data/test/text
# 4. recognizer if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \ # recognizer
ctc-prefix-beam-search-decoder-ol \ utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wolm.log \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \ ctc-prefix-beam-search-decoder-ol \
--model_path=$model_dir/avg_1.jit.pdmodel \ --feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--param_path=$model_dir/avg_1.jit.pdiparams \ --model_path=$model_dir/avg_1.jit.pdmodel \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \ --param_path=$model_dir/avg_1.jit.pdiparams \
--dict_file=$vocb_dir/vocab.txt \ --model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result --dict_file=$vocb_dir/vocab.txt \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result
cat $data/split${nj}/*/result > ${label_file}
utils/compute-wer.py --char=1 --v=1 ${label_file} $text > ${wer} cat $data/split${nj}/*/result > $exp/${label_file}
utils/compute-wer.py --char=1 --v=1 $exp/${label_file} $text > $exp/${wer}
# 4. decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc-prefix-beam-search-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--dict_file=$vocb_dir/vocab.txt \
--lm_path=$lm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
cat $data/split${nj}/*/result_lm > ${label_file}_lm
utils/compute-wer.py --char=1 --v=1 ${label_file}_lm $text > ${wer}_lm
graph_dir=./aishell_graph
if [ ! -d $ ]; then
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip -d aishell_graph.zip
fi fi
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
# decode with lm
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.lm.log \
ctc-prefix-beam-search-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--dict_file=$vocb_dir/vocab.txt \
--lm_path=$lm \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_lm
cat $data/split${nj}/*/result_lm > $exp/${label_file}_lm
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_lm $text > $exp/${wer}_lm
fi
# 5. test TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$graph_dir/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$graph_dir/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
wfst=$data/wfst/
mkdir -p $wfst
if [ ! -f $wfst/aishell_graph.zip ]; then
pushd $wfst
wget -c https://paddlespeech.bj.bcebos.com/s2t/paddle_asr_online/aishell_graph.zip
unzip aishell_graph.zip
popd
fi
cat $data/split${nj}/*/result_tlg > ${label_file}_tlg graph_dir=$wfst/aishell_graph
utils/compute-wer.py --char=1 --v=1 ${label_file}_tlg $text > ${wer}_tlg if [ $stage -le 4 ] && [ $stop_stage -ge 4 ]; then
\ No newline at end of file # TLG decoder
utils/run.pl JOB=1:$nj $data/split${nj}/JOB/recog.wfst.log \
wfst-decoder-ol \
--feature_rspecifier=scp:$data/split${nj}/JOB/feat.scp \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdiparams \
--word_symbol_table=$graph_dir/words.txt \
--model_output_names=softmax_0.tmp_0,tmp_5,concat_0.tmp_0,concat_1.tmp_0 \
--graph_path=$graph_dir/TLG.fst --max_active=7500 \
--acoustic_scale=1.2 \
--result_wspecifier=ark,t:$data/split${nj}/JOB/result_tlg
cat $data/split${nj}/*/result_tlg > $exp/${label_file}_tlg
utils/compute-wer.py --char=1 --v=1 $exp/${label_file}_tlg $text > $exp/${wer}_tlg
fi
\ No newline at end of file
# N-gram LM Training for Mandarin
Quick run:
```
bash run.sh --stage -1
```
## input
input files:
```
data/
├── lexicon.txt
├── text
└── vocab.txt
```
```
==> data/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/lexicon.txt <==
SIL sil
<SPOKEN_NOISE> sil
啊 aa a1
啊 aa a2
啊 aa a4
啊 aa a5
啊啊啊 aa a2 aa a2 aa a2
啊啊啊 aa a5 aa a5 aa a5
坐地 z uo4 d i4
坐实 z uo4 sh ix2
坐视 z uo4 sh ix4
坐稳 z uo4 uu un3
坐拥 z uo4 ii iong1
坐诊 z uo4 zh en3
坐庄 z uo4 zh uang1
坐姿 z uo4 z iy1
==> data/vocab.txt <==
<blank>
<unk>
A
B
C
D
E
<eos>
```
## output
```
data/
├── local
│ ├── dict
│ │ ├── lexicon.txt
│ │ └── units.txt
│ └── lm
│ ├── heldout
│ ├── lm.arpa
│ ├── text
│ ├── text.no_oov
│ ├── train
│ ├── unigram.counts
│ ├── word.counts
│ └── wordlist
```
```
/workspace/srilm/bin/i686-m64/ngram-count
Namespace(bpemodel=None, in_lexicon='data/lexicon.txt', out_lexicon='data/local/dict/lexicon.txt', unit_file='data/vocab.txt')
Ignoring words 矽, which contains oov unit
Ignoring words 傩, which contains oov unit
Ignoring words 堀, which contains oov unit
Ignoring words 莼, which contains oov unit
Ignoring words 菰, which contains oov unit
Ignoring words 摭, which contains oov unit
Ignoring words 帙, which contains oov unit
Ignoring words 迨, which contains oov unit
Ignoring words 孥, which contains oov unit
Ignoring words 瑗, which contains oov unit
...
...
...
file data/local/lm/heldout: 10000 sentences, 89496 words, 0 OOVs
0 zeroprobs, logprob= -270337.9 ppl= 521.2819 ppl1= 1048.745
build LM done.
```
#!/bin/bash
# To be run from one directory above this script.
. ./path.sh
text=data/local/lm/text
lexicon=data/local/dict/lexicon.txt
for f in "$text" "$lexicon"; do
[ ! -f $f ] && echo "$0: No such file $f" && exit 1;
done
# Check SRILM tools
if ! which ngram-count > /dev/null; then
echo "srilm tools are not found, please download it and install it from: "
echo "http://www.speech.sri.com/projects/srilm/download.html"
echo "Then add the tools to your PATH"
exit 1
fi
# This script takes no arguments. It assumes you have already run
# aishell_data_prep.sh.
# It takes as input the files
# data/local/lm/text
# data/local/dict/lexicon.txt
dir=data/local/lm
mkdir -p $dir
cleantext=$dir/text.no_oov
# oov to <SPOKEN_NOISE>
# lexicon line: word char0 ... charn
# text line: utt word0 ... wordn -> line: <SPOKEN_NOISE> word0 ... wordn
cat $text | awk -v lex=$lexicon 'BEGIN{while((getline<lex) >0){ seen[$1]=1; } }
{for(n=1; n<=NF;n++) { if (seen[$n]) { printf("%s ", $n); } else {printf("<SPOKEN_NOISE> ");} } printf("\n");}' \
> $cleantext || exit 1;
# compute word counts, sort in descending order
# line: count word
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | sort | uniq -c | \
sort -nr > $dir/word.counts || exit 1;
# Get counts from acoustic training transcripts, and add one-count
# for each word in the lexicon (but not silence, we don't want it
# in the LM-- we'll add it optionally later).
cat $cleantext | awk '{for(n=2;n<=NF;n++) print $n; }' | \
cat - <(grep -w -v '!SIL' $lexicon | awk '{print $1}') | \
sort | uniq -c | sort -nr > $dir/unigram.counts || exit 1;
# word with <s> </s>
cat $dir/unigram.counts | awk '{print $2}' | cat - <(echo "<s>"; echo "</s>" ) > $dir/wordlist
# hold out to compute ppl
heldout_sent=10000 # Don't change this if you want result to be comparable with kaldi_lm results
mkdir -p $dir
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
head -$heldout_sent > $dir/heldout
cat $cleantext | awk '{for(n=2;n<=NF;n++){ printf $n; if(n<NF) printf " "; else print ""; }}' | \
tail -n +$heldout_sent > $dir/train
ngram-count -text $dir/train -order 3 -limit-vocab -vocab $dir/wordlist -unk \
-map-unk "<UNK>" -kndiscount -interpolate -lm $dir/lm.arpa
ngram -lm $dir/lm.arpa -ppl $dir/heldout
\ No newline at end of file
#!/usr/bin/env python3
import argparse
from collections import Counter
def main(args):
counter = Counter()
with open(args.text, 'r') as fin, open(args.lexicon, 'w') as fout:
for line in fin:
line = line.strip()
if args.has_key:
utt, text = line.split(maxsplit=1)
words = text.split()
else:
words = line.split()
counter.update(words)
for word in counter:
val = " ".join(list(word))
fout.write(f"{word}\t{val}\n")
fout.flush()
if __name__ == '__main__':
parser = argparse.ArgumentParser(
description='text(line:utt1 中国 人) to lexicon(line:中国 中 国).')
parser.add_argument(
'--has_key',
default=True,
help='whether each text line starts with an utt key')
parser.add_argument(
'--text',
required=True,
help='text path. line: utt1 中国 人 or 中国 人')
parser.add_argument(
'--lexicon',
required=True,
help='lexicon path. line:中国 中 国')
args = parser.parse_args()
print(args)
main(args)
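# Example: with --has_key true, an input line "utt1 中国 人" produces the
# lexicon entries "中国 -> 中 国" and "人 -> 人" (tab-separated in the file,
# each word split into its characters).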
# This contains the locations of the binaries required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
corpus=aishell
unit=data/vocab.txt # vocab file, line: char/spm_pice
lexicon=data/lexicon.txt # line: word ph0 ... phn, aishell/resource_aishell/lexicon.txt
text=data/text # line: utt text, aishell/data_aishell/transcript/aishell_transcript_v0.8.txt
. utils/parse_options.sh
data=$PWD/data
mkdir -p $data
if [ $stage -le -1 ] && [ $stop_stage -ge -1 ]; then
if [ ! -f $data/speech.ngram.zh.tar.gz ];then
pushd $data
wget -c http://paddlespeech.bj.bcebos.com/speechx/examples/ngram/zh/speech.ngram.zh.tar.gz
tar xvzf speech.ngram.zh.tar.gz
popd
fi
fi
if [ ! -f $unit ]; then
echo "$0: No such file $unit"
exit 1;
fi
if ! which ngram-count; then
pushd $MAIN_ROOT/tools
make srilm.done
popd
fi
mkdir -p data/local/dict
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# 7.1 Prepare dict
# line: char/spm_pices
cp $unit data/local/dict/units.txt
if [ ! -f $lexicon ];then
local/text_to_lexicon.py --has_key true --text $text --lexicon $lexicon
echo "Generate $lexicon from $text"
fi
# filter by vocab
# line: word ph0 ... phn -> line: word char0 ... charn
utils/fst/prepare_dict.py \
--unit_file $unit \
--in_lexicon ${lexicon} \
--out_lexicon data/local/dict/lexicon.txt
fi
lm=data/local/lm
mkdir -p $lm
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
# 7.2 Train lm
cp $text $lm/text
local/aishell_train_lms.sh
fi
echo "build LM done."
exit 0
../../../../utils/
\ No newline at end of file
# Text Preprocessing for Building the N-gram LM
The output `text` file looks like this:
```
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
```
MAIN_ROOT=`realpath $PWD/../../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
#!/bin/bash
set -eo pipefail
. path.sh
stage=0
stop_stage=100
has_key=true
token_type=word
. utils/parse_options.sh || exit -1;
text=data/text
if [ ! -f $text ]; then
    echo "$0: No such file $text";
    exit -1;
fi
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then
echo "text tn & wordseg preprocess"
rm -rf ${text}.tn
python3 utils/zh_tn.py --has_key $has_key --token_type $token_type ${text} ${text}.tn
fi
\ No newline at end of file
../../../utils/
\ No newline at end of file
# Build TLG WFST
## Input
```
data/local/
├── dict
│ ├── lexicon.txt
│ └── units.txt
└── lm
├── heldout
├── lm.arpa
├── text
├── text.no_oov
├── train
├── unigram.counts
├── word.counts
└── wordlist
```
```
==> data/local/dict/lexicon.txt <==
啊 啊
啊啊啊 啊 啊 啊
阿 阿
阿尔 阿 尔
阿根廷 阿 根 廷
阿九 阿 九
阿克 阿 克
阿拉伯数字 阿 拉 伯 数 字
阿拉法特 阿 拉 法 特
阿拉木图 阿 拉 木 图
==> data/local/dict/units.txt <==
<blank>
<unk>
A
B
C
D
E
F
G
H
==> data/local/lm/heldout <==
而 对 楼市 成交 抑制 作用 最 大 的 限 购
也 成为 地方 政府 的 眼中 钉
自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
各地 政府 便 纷纷 跟进
仅 一 个 多 月 的 时间 里
除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
四十六 个 限 购 城市 当中
四十一 个 已 正式 取消 或 变相 放松 了 限 购
财政 金融 政策 紧随 其后 而来
显示 出 了 极 强 的 威力
==> data/local/lm/lm.arpa <==
\data\
ngram 1=129356
ngram 2=504661
ngram 3=123455
\1-grams:
-1.531278 </s>
-3.828829 <SPOKEN_NOISE> -0.1600094
-6.157292 <UNK>
==> data/local/lm/text <==
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
==> data/local/lm/text.no_oov <==
<SPOKEN_NOISE> 而 对 楼市 成交 抑制 作用 最 大 的 限 购
<SPOKEN_NOISE> 也 成为 地方 政府 的 眼中 钉
<SPOKEN_NOISE> 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
<SPOKEN_NOISE> 各地 政府 便 纷纷 跟进
<SPOKEN_NOISE> 仅 一 个 多 月 的 时间 里
<SPOKEN_NOISE> 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
<SPOKEN_NOISE> 四十六 个 限 购 城市 当中
<SPOKEN_NOISE> 四十一 个 已 正式 取消 或 变相 放松 了 限 购
<SPOKEN_NOISE> 财政 金融 政策 紧随 其后 而来
<SPOKEN_NOISE> 显示 出 了 极 强 的 威力
==> data/local/lm/train <==
汉莎 不 得 不 通过 这样 的 方式 寻求 新 的 发展 点
并 计划 朝云 计算 方面 发展
汉莎 的 基础 设施 部门 拥有 一千四百 名 员工
媒体 就 曾 披露 这笔 交易
虽然 双方 已经 正式 签署 了 外包 协议
但是 这笔 交易 还 需要 得到 反 垄断 部门 的 批准
陈 黎明 一九八九 年 获得 美国 康乃尔 大学 硕士 学位
并 于 二零零三 年 顺利 完成 美国 哈佛 商学 院 高级 管理 课程
曾 在 多家 国际 公司 任职
拥有 业务 开发 商务 及 企业 治理
==> data/local/lm/unigram.counts <==
57487 的
13099 在
11862 一
11397 了
10998 不
9913 是
7952 有
6250 和
6152 个
5422 将
==> data/local/lm/word.counts <==
57486 的
13098 在
11861 一
11396 了
10997 不
9912 是
7951 有
6249 和
6151 个
5421 将
==> data/local/lm/wordlist <==
```
## Output
```
fstaddselfloops 'echo 4234 |' 'echo 123660 |'
Lexicon and Token FSTs compiling succeeded
arpa2fst --read-symbol-table=data/lang_test/words.txt --keep-symbols=true -
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:94) Reading \data\ section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \1-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \2-grams: section.
LOG (arpa2fst[5.5.0~1-5a37]:Read():arpa-file-parser.cc:149) Reading \3-grams: section.
Checking how stochastic G is (the first of these numbers should be small):
fstisstochastic data/lang_test/G.fst
0 -1.14386
fsttablecompose data/lang_test/L.fst data/lang_test/G.fst
fstminimizeencoded
fstdeterminizestar --use-log=true
fsttablecompose data/lang_test/T.fst data/lang_test/LG.fst
Composing decoding graph TLG.fst succeeded
Aishell build TLG done.
```
```
data/
├── lang_test
│ ├── G.fst
│ ├── L.fst
│ ├── LG.fst
│ ├── T.fst
│ ├── TLG.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── local
├── lang
│ ├── L.fst
│ ├── T.fst
│ ├── tokens.txt
│ ├── units.txt
│ └── words.txt
└── tmp
├── disambig.list
├── lexiconp_disambig.txt
├── lexiconp.txt
└── units.list
```
\ No newline at end of file
# This contains the locations of the binaries required for running the examples.
MAIN_ROOT=`realpath $PWD/../../../`
SPEECHX_ROOT=`realpath $MAIN_ROOT/speechx`
export LC_ALL=C
# srilm
export LIBLBFGS=${MAIN_ROOT}/tools/liblbfgs-1.10
export LD_LIBRARY_PATH=${LD_LIBRARY_PATH:-}:${LIBLBFGS}/lib/.libs
export SRILM=${MAIN_ROOT}/tools/srilm
export PATH=${PATH}:${SRILM}/bin:${SRILM}/bin/i686-m64
# Kaldi
export KALDI_ROOT=${MAIN_ROOT}/tools/kaldi
[ -f $KALDI_ROOT/tools/env.sh ] && . $KALDI_ROOT/tools/env.sh
export PATH=$PWD/utils/:$KALDI_ROOT/tools/openfst/bin:$PWD:$PATH
[ ! -f $KALDI_ROOT/tools/config/common_path.sh ] && echo >&2 "The standard file $KALDI_ROOT/tools/config/common_path.sh is not present, cannot use Kaldi!"
[ -f $KALDI_ROOT/tools/config/common_path.sh ] && . $KALDI_ROOT/tools/config/common_path.sh
#!/bin/bash
set -eo pipefail
. path.sh
stage=-1
stop_stage=100
. utils/parse_options.sh
if ! which fstprint ; then
pushd $MAIN_ROOT/tools
make kaldi.done
popd
fi
if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
# build T & L
# utils/fst/compile_lexicon_token_fst.sh <dict-src-dir> <tmp-dir> <lang-dir>
utils/fst/compile_lexicon_token_fst.sh \
data/local/dict data/local/tmp data/local/lang
# build G & LG & TLG
# utils/fst/make_tlg.sh <lm_dir> <src_lang> <tgt_lang>
utils/fst/make_tlg.sh data/local/lm data/local/lang data/lang_test || exit 1;
fi
echo "build TLG done."
exit 0
../../../utils/
\ No newline at end of file
This diff has been collapsed.
#!/bin/bash #!/bin/bash
set -e set -e
# Audio classification # Audio classification
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
paddlespeech cls --input ./cat.wav --topk 10 paddlespeech cls --input ./cat.wav --topk 10
...@@ -28,26 +29,16 @@ paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨 ...@@ -28,26 +29,16 @@ paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨
paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!" paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get." paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
# Speech Translation (only supports Linux) # Speech Translation (only supports Linux)
paddlespeech st --input ./en.wav paddlespeech st --input ./en.wav
# batch process
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
# shell pipeline
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
# stats
paddlespeech stats --task asr
paddlespeech stats --task tts
paddlespeech stats --task cls
# Speaker Verification # Speaker Verification
wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
paddlespeech vector --task spk --input 85236145389.wav paddlespeech vector --task spk --input 85236145389.wav
# batch process
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
paddlespeech vector --task spk --input vec.job paddlespeech vector --task spk --input vec.job
...@@ -55,4 +46,13 @@ echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector - ...@@ -55,4 +46,13 @@ echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector -
rm 85236145389.wav rm 85236145389.wav
rm vec.job rm vec.job
# shell pipeline
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
# stats
paddlespeech stats --task asr
paddlespeech stats --task tts
paddlespeech stats --task cls
paddlespeech stats --task text
paddlespeech stats --task vector
paddlespeech stats --task st
This diff has been collapsed.
This diff has been collapsed.
This diff has been collapsed.
File mode changed from 100644 to 100755
This diff has been collapsed.
File mode changed from 100644 to 100755
File mode changed from 100644 to 100755
This diff has been collapsed.
This diff has been collapsed.