diff --git a/.gitignore b/.gitignore
index ad8e74925d712f617305045bd9264744a9c462e2..639472001a719aca5cb93e851ef1f628fc3cae9b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
.DS_Store
*.pyc
.vscode
-*log
+*.log
*.wav
*.pdmodel
*.pdiparams*
@@ -34,4 +34,6 @@ tools/activate_python.sh
tools/miniconda.sh
tools/CRF++-0.58/
-speechx/fc_patch/
\ No newline at end of file
+speechx/fc_patch/
+
+third_party/ctc_decoders/paddlespeech_ctcdecoders.py
diff --git a/.mergify.yml b/.mergify.yml
index 6dae66d04dd05f627d24b329c0a4fbd1491ea0cb..68b2481015a87a27817cb0aeb279114e7438378f 100644
--- a/.mergify.yml
+++ b/.mergify.yml
@@ -52,7 +52,7 @@ pull_request_rules:
add: ["T2S"]
- name: "auto add label=Audio"
conditions:
- - files~=^audio/
+ - files~=^paddleaudio/
actions:
label:
add: ["Audio"]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 62fead47015b245f91f8cb87fb9c0abad36cfe94..2782b8176de0be645ee219af64ad335052f7f5f3 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,15 @@
# Changelog
+
+Date: 2022-3-22, Author: yt605155624.
+Add features to: CLI:
+ - Support aishell3_hifigan、vctk_hifigan
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1587
+
+Date: 2022-3-09, Author: yt605155624.
+Add features to: T2S:
+ - Add ljspeech hifigan egs.
+ - PRLink: https://github.com/PaddlePaddle/PaddleSpeech/pull/1549
+
Date: 2022-3-08, Author: yt605155624.
Add features to: T2S:
- Add aishell3 hifigan egs.
diff --git a/README.md b/README.md
index ceef15af62c033a6c08d7f7792a73e9249c813e0..1144d3ab52ed7b8f2d6ae4cb7d8f50b6602a4ca2 100644
--- a/README.md
+++ b/README.md
@@ -7,6 +7,7 @@
Quick Start
+ | Quick Start Server
| Documents
| Models List
@@ -178,6 +179,8 @@ Via the easy-to-use, efficient, flexible and scalable implementation, our vision
+- 👏🏻 2022.03.28: PaddleSpeech Server is available for Audio Classification, Automatic Speech Recognition and Text-to-Speech.
+- 👏🏻 2022.03.28: PaddleSpeech CLI is available for Speaker Verification.
- 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available!
- 👏🏻 2021.12.10: PaddleSpeech CLI is available for Audio Classification, Automatic Speech Recognition, Speech Translation (English to Chinese) and Text-to-Speech.
@@ -203,6 +206,11 @@ Developers can have a try of our models with [PaddleSpeech Command Line](./paddl
paddlespeech cls --input input.wav
```
+**Speaker Verification**
+```shell
+paddlespeech vector --task spk --input input_16k.wav
+```
+
**Automatic Speech Recognition**
```shell
paddlespeech asr --lang zh --input input_16k.wav
@@ -242,6 +250,36 @@ For more command lines, please see: [demos](https://github.com/PaddlePaddle/Padd
If you want to try more functions like training and tuning, please have a look at [Speech-to-Text Quick Start](./docs/source/asr/quick_start.md) and [Text-to-Speech Quick Start](./docs/source/tts/quick_start.md).
+
+
+## Quick Start Server
+
+Developers can have a try of our speech server with [PaddleSpeech Server Command Line](./paddlespeech/server/README.md).
+
+**Start server**
+```shell
+paddlespeech_server start --config_file ./paddlespeech/server/conf/application.yaml
+```
+
+**Access Speech Recognition Services**
+```shell
+paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
+```
+
+**Access Text to Speech Services**
+```shell
+paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
+```
+
+**Access Audio Classification Services**
+```shell
+paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav
+```
+
+
+For more information about server command lines, please see: [speech server demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server)
+
+
## Model List
PaddleSpeech supports a series of most popular models. They are summarized in [released models](./docs/source/released_model.md) and attached with available pretrained models.
@@ -458,6 +496,29 @@ PaddleSpeech supports a series of most popular models. They are summarized in [r
+**Speaker Verification**
+
+
+
+
+ Task
+ Dataset
+ Model Type
+ Link
+
+
+
+
+ Speaker Verification
+ VoxCeleb12
+ ECAPA-TDNN
+
+ ecapa-tdnn-voxceleb12
+
+
+
+
+
**Punctuation Restoration**
@@ -499,6 +560,7 @@ Normally, [Speech SoTA](https://paperswithcode.com/area/speech), [Audio SoTA](ht
- [Chinese Rule Based Text Frontend](./docs/source/tts/zh_text_frontend.md)
- [Test Audio Samples](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)
- [Audio Classification](./demos/audio_tagging/README.md)
+ - [Speaker Verification](./demos/speaker_verification/README.md)
- [Speech Translation](./demos/speech_translation/README.md)
- [Released Models](./docs/source/released_model.md)
- [Community](#Community)
diff --git a/README_cn.md b/README_cn.md
index 8ea91e98d42662c3ee3afcab52228d98191c19fc..ab4ce6e6b878626011ac5cbcfb5c82b4b03ef5d6 100644
--- a/README_cn.md
+++ b/README_cn.md
@@ -6,6 +6,7 @@
快速开始
+ | 快速使用服务
| 教程文档
| 模型列表
@@ -179,7 +180,9 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
-- 🤗 2021.12.14: 我们在 Hugging Face Spaces 上的 [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) 以及 [TTS](https://huggingface.co/spaces/akhaliq/paddlespeech) Demos 上线啦!
+- 👏🏻 2022.03.28: PaddleSpeech Server 上线! 覆盖了声音分类、语音识别、以及语音合成。
+- 👏🏻 2022.03.28: PaddleSpeech CLI 上线声纹验证。
+- 🤗 2021.12.14: Our PaddleSpeech [ASR](https://huggingface.co/spaces/KPatrick/PaddleSpeechASR) and [TTS](https://huggingface.co/spaces/KPatrick/PaddleSpeechTTS) Demos on Hugging Face Spaces are available!
- 👏🏻 2021.12.10: PaddleSpeech CLI 上线!覆盖了声音分类、语音识别、语音翻译(英译中)以及语音合成。
### 技术交流群
@@ -202,6 +205,10 @@ from https://github.com/18F/open-source-guide/blob/18f-pages/pages/making-readme
```shell
paddlespeech cls --input input.wav
```
+**声纹识别**
+```shell
+paddlespeech vector --task spk --input input_16k.wav
+```
**语音识别**
```shell
paddlespeech asr --lang zh --input input_16k.wav
@@ -236,6 +243,33 @@ paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
更多命令行命令请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos)
> Note: 如果需要训练或者微调,请查看[语音识别](./docs/source/asr/quick_start.md), [语音合成](./docs/source/tts/quick_start.md)。
+
+## 快速使用服务
+安装完成后,开发者可以通过命令行快速使用服务。
+
+**启动服务**
+```shell
+paddlespeech_server start --config_file ./paddlespeech/server/conf/application.yaml
+```
+
+**访问语音识别服务**
+```shell
+paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
+```
+
+**访问语音合成服务**
+```shell
+paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "您好,欢迎使用百度飞桨语音合成服务。" --output output.wav
+```
+
+**访问音频分类服务**
+```shell
+paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav
+```
+
+更多服务相关的命令行使用信息,请参考 [demos](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/demos/speech_server)
+
+
## 模型列表
PaddleSpeech 支持很多主流的模型,并提供了预训练模型,详情请见[模型列表](./docs/source/released_model.md)。
@@ -453,6 +487,30 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
+
+**声纹识别**
+
+
+
+
+ Task
+ Dataset
+ Model Type
+ Link
+
+
+
+
+ Speaker Verification
+ VoxCeleb12
+ ECAPA-TDNN
+
+ ecapa-tdnn-voxceleb12
+
+
+
+
+
**标点恢复**
@@ -499,6 +557,7 @@ PaddleSpeech 的 **语音合成** 主要包含三个模块:文本前端、声
- [中文文本前端](./docs/source/tts/zh_text_frontend.md)
- [测试语音样本](https://paddlespeech.readthedocs.io/en/latest/tts/demo.html)
- [声音分类](./demos/audio_tagging/README_cn.md)
+ - [声纹识别](./demos/speaker_verification/README_cn.md)
- [语音翻译](./demos/speech_translation/README_cn.md)
- [模型列表](#模型列表)
- [语音识别](#语音识别模型)
@@ -521,6 +580,15 @@ author={PaddlePaddle Authors},
howpublished = {\url{https://github.com/PaddlePaddle/PaddleSpeech}},
year={2021}
}
+
+@inproceedings{zheng2021fused,
+ title={Fused acoustic and text encoding for multimodal bilingual pretraining and speech translation},
+ author={Zheng, Renjie and Chen, Junkun and Ma, Mingbo and Huang, Liang},
+ booktitle={International Conference on Machine Learning},
+ pages={12736--12746},
+ year={2021},
+ organization={PMLR}
+}
```
@@ -568,7 +636,6 @@ year={2021}
## 致谢
- 非常感谢 [yeyupiaoling](https://github.com/yeyupiaoling)/[PPASR](https://github.com/yeyupiaoling/PPASR)/[PaddlePaddle-DeepSpeech](https://github.com/yeyupiaoling/PaddlePaddle-DeepSpeech)/[VoiceprintRecognition-PaddlePaddle](https://github.com/yeyupiaoling/VoiceprintRecognition-PaddlePaddle)/[AudioClassification-PaddlePaddle](https://github.com/yeyupiaoling/AudioClassification-PaddlePaddle) 多年来的关注和建议,以及在诸多问题上的帮助。
-- 非常感谢 [AK391](https://github.com/AK391) 在 Huggingface Spaces 上使用 Gradio 对我们的语音合成功能进行网页版演示。
- 非常感谢 [mymagicpower](https://github.com/mymagicpower) 采用PaddleSpeech 对 ASR 的[短语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_sdk)及[长语音](https://github.com/mymagicpower/AIAS/tree/main/3_audio_sdks/asr_long_audio_sdk)进行 Java 实现。
- 非常感谢 [JiehangXie](https://github.com/JiehangXie)/[PaddleBoBo](https://github.com/JiehangXie/PaddleBoBo) 采用 PaddleSpeech 语音合成功能实现 Virtual Uploader(VUP)/Virtual YouTuber(VTuber) 虚拟主播。
- 非常感谢 [745165806](https://github.com/745165806)/[PaddleSpeechTask](https://github.com/745165806/PaddleSpeechTask) 贡献标点重建相关模型。
diff --git a/dataset/librispeech/librispeech.py b/dataset/librispeech/librispeech.py
index 69f0db599e12d0a482a8d7783eb85ce9e04c744d..65cab2490305762b84a06408b6d302517caea182 100644
--- a/dataset/librispeech/librispeech.py
+++ b/dataset/librispeech/librispeech.py
@@ -20,12 +20,12 @@ of each audio file in the data set.
"""
import argparse
import codecs
-import distutils.util
import io
import json
import os
from multiprocessing.pool import Pool
+import distutils.util
import soundfile
from utils.utility import download
diff --git a/dataset/voxceleb/voxceleb1.py b/dataset/voxceleb/voxceleb1.py
index e50c91bc169541612cc94575b85ba3794f7dbd05..9058620083ab74dfd21f0d33d368d543e41e7744 100644
--- a/dataset/voxceleb/voxceleb1.py
+++ b/dataset/voxceleb/voxceleb1.py
@@ -59,12 +59,19 @@ DEV_TARGET_DATA = "vox1_dev_wav_parta* vox1_dev_wav.zip ae63e55b951748cc486645f5
TEST_LIST = {"vox1_test_wav.zip": "185fdc63c3c739954633d50379a3d102"}
TEST_TARGET_DATA = "vox1_test_wav.zip vox1_test_wav.zip 185fdc63c3c739954633d50379a3d102"
-# kaldi trial
-# this trial file is organized by kaldi according the official file,
-# which is a little different with the official trial veri_test2.txt
-KALDI_BASE_URL = "http://www.openslr.org/resources/49/"
-TRIAL_LIST = {"voxceleb1_test_v2.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7"}
-TRIAL_TARGET_DATA = "voxceleb1_test_v2.txt voxceleb1_test_v2.txt 29fc7cc1c5d59f0816dc15d6e8be60f7"
+# voxceleb trial
+
+TRIAL_BASE_URL = "https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/"
+TRIAL_LIST = {
+ "veri_test.txt": "29fc7cc1c5d59f0816dc15d6e8be60f7", # voxceleb1
+ "veri_test2.txt": "b73110731c9223c1461fe49cb48dddfc", # voxceleb1(cleaned)
+ "list_test_hard.txt": "21c341b6b2168eea2634df0fb4b8fff1", # voxceleb1-H
+ "list_test_hard2.txt":
+ "857790e09d579a68eb2e339a090343c8", # voxceleb1-H(cleaned)
+ "list_test_all.txt": "b9ecf7aa49d4b656aa927a8092844e4a", # voxceleb1-E
+ "list_test_all2.txt":
+ "a53e059deb562ffcfc092bf5d90d9f3a" # voxceleb1-E(cleaned)
+}
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
@@ -82,7 +89,7 @@ args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
- print("Creating manifest %s ..." % manifest_path_prefix)
+ print(f"Creating manifest {manifest_path_prefix} from {data_dir}")
json_lines = []
data_path = os.path.join(data_dir, "wav", "**", "*.wav")
total_sec = 0.0
@@ -114,6 +121,9 @@ def create_manifest(data_dir, manifest_path_prefix):
# voxceleb1 is given explicit in the path
data_dir_name = Path(data_dir).name
manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
+ if not os.path.exists(os.path.dirname(manifest_path_prefix)):
+ os.makedirs(os.path.dirname(manifest_path_prefix))
+
with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
for line in json_lines:
f.write(line + "\n")
@@ -133,11 +143,13 @@ def create_manifest(data_dir, manifest_path_prefix):
def prepare_dataset(base_url, data_list, target_dir, manifest_path,
target_data):
if not os.path.exists(target_dir):
- os.mkdir(target_dir)
+ os.makedirs(target_dir)
# wav directory already exists, it need do nothing
+ # we will download the voxceleb1 data to ${target_dir}/vox1/dev/ or ${target_dir}/vox1/test directory
if not os.path.exists(os.path.join(target_dir, "wav")):
# download all dataset part
+ print("start to download the vox1 dev zip package")
for zip_part in data_list.keys():
download_url = " --no-check-certificate " + base_url + "/" + zip_part
download(
@@ -167,10 +179,22 @@ def prepare_dataset(base_url, data_list, target_dir, manifest_path,
create_manifest(data_dir=target_dir, manifest_path_prefix=manifest_path)
+def prepare_trial(base_url, data_list, target_dir):
+ if not os.path.exists(target_dir):
+ os.makedirs(target_dir)
+
+ for trial, md5sum in data_list.items():
+ target_trial = os.path.join(target_dir, trial)
+ if not os.path.exists(os.path.join(target_dir, trial)):
+ download_url = " --no-check-certificate " + base_url + "/" + trial
+ download(url=download_url, md5sum=md5sum, target_dir=target_dir)
+
+
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
+ # prepare the vox1 dev data
prepare_dataset(
base_url=BASE_URL,
data_list=DEV_LIST,
@@ -178,6 +202,7 @@ def main():
manifest_path=args.manifest_prefix,
target_data=DEV_TARGET_DATA)
+ # prepare the vox1 test data
prepare_dataset(
base_url=BASE_URL,
data_list=TEST_LIST,
@@ -185,6 +210,12 @@ def main():
manifest_path=args.manifest_prefix,
target_data=TEST_TARGET_DATA)
+ # prepare the vox1 trial
+ prepare_trial(
+ base_url=TRIAL_BASE_URL,
+ data_list=TRIAL_LIST,
+ target_dir=os.path.dirname(args.manifest_prefix))
+
print("Manifest prepare done!")
diff --git a/dataset/voxceleb/voxceleb2.py b/dataset/voxceleb/voxceleb2.py
new file mode 100644
index 0000000000000000000000000000000000000000..22a2e2ffe21c1ba22c1db94af773b7a5b9938f54
--- /dev/null
+++ b/dataset/voxceleb/voxceleb2.py
@@ -0,0 +1,164 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prepare VoxCeleb2 dataset
+
+Download and unpack the voxceleb2 data files.
+VoxCeleb2 data is stored in the m4a format,
+so we need to convert the m4a files to wav with the convert.sh script
+"""
+import argparse
+import codecs
+import glob
+import json
+import os
+from pathlib import Path
+
+import soundfile
+
+from utils.utility import download
+from utils.utility import unzip
+
+# by default, all the data will be downloaded into the current directory (./voxceleb2/)
+DATA_HOME = os.path.expanduser('.')
+
+BASE_URL = "--no-check-certificate https://www.robots.ox.ac.uk/~vgg/data/voxceleb/data/"
+
+# dev data
+DEV_DATA_URL = BASE_URL + '/vox2_aac.zip'
+DEV_MD5SUM = "bbc063c46078a602ca71605645c2a402"
+
+# test data
+TEST_DATA_URL = BASE_URL + '/vox2_test_aac.zip'
+TEST_MD5SUM = "0d2b3ea430a821c33263b5ea37ede312"
+
+parser = argparse.ArgumentParser(description=__doc__)
+parser.add_argument(
+ "--target_dir",
+ default=DATA_HOME + "/voxceleb2/",
+ type=str,
+ help="Directory to save the voxceleb1 dataset. (default: %(default)s)")
+parser.add_argument(
+ "--manifest_prefix",
+ default="manifest",
+ type=str,
+ help="Filepath prefix for output manifests. (default: %(default)s)")
+parser.add_argument(
+ "--download",
+ default=False,
+ action="store_true",
+ help="Download the voxceleb2 dataset. (default: %(default)s)")
+parser.add_argument(
+ "--generate",
+ default=False,
+ action="store_true",
+ help="Generate the manifest files. (default: %(default)s)")
+
+args = parser.parse_args()
+
+
+def create_manifest(data_dir, manifest_path_prefix):
+ print("Creating manifest %s ..." % manifest_path_prefix)
+ json_lines = []
+ data_path = os.path.join(data_dir, "**", "*.wav")
+ total_sec = 0.0
+ total_text = 0.0
+ total_num = 0
+ speakers = set()
+ for audio_path in glob.glob(data_path, recursive=True):
+ audio_id = "-".join(audio_path.split("/")[-3:])
+ utt2spk = audio_path.split("/")[-3]
+ duration = soundfile.info(audio_path).duration
+ text = ""
+ json_lines.append(
+ json.dumps(
+ {
+ "utt": audio_id,
+ "utt2spk": str(utt2spk),
+ "feat": audio_path,
+ "feat_shape": (duration, ),
+ "text": text # compatible with asr data format
+ },
+ ensure_ascii=False))
+
+ total_sec += duration
+ total_text += len(text)
+ total_num += 1
+ speakers.add(utt2spk)
+
+ # data_dir_name refers to dev or test
+ # voxceleb2 is given explicitly in the path
+ data_dir_name = Path(data_dir).name
+ manifest_path_prefix = manifest_path_prefix + "." + data_dir_name
+
+ if not os.path.exists(os.path.dirname(manifest_path_prefix)):
+ os.makedirs(os.path.dirname(manifest_path_prefix))
+ with codecs.open(manifest_path_prefix, 'w', encoding='utf-8') as f:
+ for line in json_lines:
+ f.write(line + "\n")
+
+ manifest_dir = os.path.dirname(manifest_path_prefix)
+ meta_path = os.path.join(manifest_dir, "voxceleb2." +
+ data_dir_name) + ".meta"
+ with codecs.open(meta_path, 'w', encoding='utf-8') as f:
+ print(f"{total_num} utts", file=f)
+ print(f"{len(speakers)} speakers", file=f)
+ print(f"{total_sec / (60 * 60)} h", file=f)
+ print(f"{total_text} text", file=f)
+ print(f"{total_text / total_sec} text/sec", file=f)
+ print(f"{total_sec / total_num} sec/utt", file=f)
+
+
+def download_dataset(url, md5sum, target_dir, dataset):
+ if not os.path.exists(target_dir):
+ os.makedirs(target_dir)
+
+ # if the wav directory already exists, there is nothing to do
+ print("target dir {}".format(os.path.join(target_dir, dataset)))
+ # unzipping the dev dataset creates the dev dir and extracts the m4a files into it,
+ # but the test dataset unzips into an aac dir,
+ # so we create ${target_dir}/test and unzip the m4a files into the test dir
+ if not os.path.exists(os.path.join(target_dir, dataset)):
+ filepath = download(url, md5sum, target_dir)
+ if dataset == "test":
+ unzip(filepath, os.path.join(target_dir, "test"))
+
+
+def main():
+ if args.target_dir.startswith('~'):
+ args.target_dir = os.path.expanduser(args.target_dir)
+
+ # download and unpack the vox2-dev data
+ print("download: {}".format(args.download))
+ if args.download:
+ download_dataset(
+ url=DEV_DATA_URL,
+ md5sum=DEV_MD5SUM,
+ target_dir=args.target_dir,
+ dataset="dev")
+
+ download_dataset(
+ url=TEST_DATA_URL,
+ md5sum=TEST_MD5SUM,
+ target_dir=args.target_dir,
+ dataset="test")
+
+ print("VoxCeleb2 download is done!")
+
+ if args.generate:
+ create_manifest(
+ args.target_dir, manifest_path_prefix=args.manifest_prefix)
+
+
+if __name__ == '__main__':
+ main()
diff --git a/demos/README.md b/demos/README.md
index 4482aa191cb7bf8b79d39ee37bb75590c24fe3b1..36e93dbf17d18476f6447066fe3dc0b715d57a1d 100644
--- a/demos/README.md
+++ b/demos/README.md
@@ -4,6 +4,7 @@
The directory containes many speech applications in multi scenarios.
+* audio searching - large-scale audio similarity retrieval
* audio tagging - multi-label tagging of an audio file
* automatic_video_subtitiles - generate subtitles from a video
* metaverse - 2D AR with TTS
diff --git a/demos/README_cn.md b/demos/README_cn.md
index 242b4f070d41e791577c526610a7ae78ea24df3e..add6e25f565adf8f7cf046ba83f2c5c2ae18cd8e 100644
--- a/demos/README_cn.md
+++ b/demos/README_cn.md
@@ -4,6 +4,7 @@
该目录包含基于 PaddleSpeech 开发的不同场景的语音应用 Demo:
+* 声音检索 - 海量音频相似性检索。
* 声音分类 - 基于 AudioSet 的 527 类标签的音频多标签分类。
* 视频字幕生成 - 识别视频中语音的文本,并进行文本后处理。
* 元宇宙 - 基于语音合成的 2D 增强现实。
diff --git a/demos/audio_searching/README.md b/demos/audio_searching/README.md
index 2b417c0eb805d61291c60ababb279e9f9009988e..8a6f386392afb128e657bd47d6be59dd915f4145 100644
--- a/demos/audio_searching/README.md
+++ b/demos/audio_searching/README.md
@@ -3,27 +3,36 @@
# Audio Searching
## Introduction
-As the Internet continues to evolve, unstructured data such as emails, social media photos, live videos, and customer service voice calls have become increasingly common. If we want to process the data on a computer, we need to use embedding technology to transform the data into vector and store, index, and query it
+As the Internet continues to evolve, unstructured data such as emails, social media photos, live videos, and customer service voice calls has become increasingly common. To process such data on a computer, we need to use embedding technology to transform it into vectors and then store, index, and query them.
-However, when there is a large amount of data, such as hundreds of millions of audio tracks, it is more difficult to do a similarity search. The exhaustive method is feasible, but very time consuming. For this scenario, this demo will introduce how to build an audio similarity retrieval system using the open source vector database Milvus
+However, when there is a large amount of data, such as hundreds of millions of audio tracks, it is more difficult to do a similarity search. The exhaustive method is feasible, but very time consuming. For this scenario, this demo will introduce how to build an audio similarity retrieval system using the open source vector database Milvus.
-Audio retrieval (speech, music, speaker, etc.) enables querying and finding similar sounds (or the same speaker) in a large amount of audio data. The audio similarity retrieval system can be used to identify similar sound effects, minimize intellectual property infringement, quickly retrieve the voice print library, and help enterprises control fraud and identity theft. Audio retrieval also plays an important role in the classification and statistical analysis of audio data
+Audio retrieval (speech, music, speaker, etc.) enables querying and finding similar sounds (or the same speaker) in a large amount of audio data. The audio similarity retrieval system can be used to identify similar sound effects, minimize intellectual property infringement, quickly search a voiceprint library, and help enterprises control fraud and identity theft. Audio retrieval also plays an important role in the classification and statistical analysis of audio data.
-In this demo, you will learn how to build an audio retrieval system to retrieve similar sound snippets. The uploaded audio clips are converted into vector data using paddlespeech-based pre-training models (audio classification model, speaker recognition model, etc.) and stored in Milvus. Milvus automatically generates a unique ID for each vector, then stores the ID and the corresponding audio information (audio ID, audio speaker ID, etc.) in MySQL to complete the library construction. During retrieval, users upload test audio to obtain vector, and then conduct vector similarity search in Milvus. The retrieval result returned by Milvus is vector ID, and the corresponding audio information can be queried in MySQL by ID
+In this demo, you will learn how to build an audio retrieval system to retrieve similar sound snippets. The uploaded audio clips are converted into vector data using pretrained models based on PaddleSpeech (audio classification model, speaker recognition model, etc.) and stored in Milvus. Milvus automatically generates a unique ID for each vector, then stores the ID and the corresponding audio information (audio ID, audio speaker ID, etc.) in MySQL to complete the library construction. During retrieval, users upload test audio to obtain its vector and then conduct a vector similarity search in Milvus. The retrieval result returned by Milvus is a set of vector IDs, and the corresponding audio information can be queried in MySQL by ID.

-Note:this demo uses the [CN-Celeb](http://openslr.org/82/) dataset of at least 650,000 audio entries and 3000 speakers to build the audio vector library, which is then retrieved using a preset distance calculation. The dataset can also use other, Adjust as needed, e.g. Librispeech, VoxCeleb, UrbanSound, GloVe, MNIST, etc
+Note: this demo uses the [CN-Celeb](http://openslr.org/82/) dataset, with at least 650,000 audio entries from 3,000 speakers, to build the audio vector library, which is then searched using a preset distance metric. Other datasets, e.g. LibriSpeech, VoxCeleb, UrbanSound, GloVe, MNIST, etc., can also be used and adjusted as needed.
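+
+The following is a minimal, illustrative sketch of that flow (it is not the demo's actual `src/` code): search Milvus for the nearest vectors, then look up the matching audio information in MySQL by the returned IDs. The collection name (`audio_table`), the L2 metric, and the default ports follow `src/config.py`; the vector field name, the MySQL credentials, and the MySQL table layout are assumptions for illustration only.
+
+```python
+import pymysql
+from pymilvus import Collection, connections
+
+# connect to the Milvus and MySQL services started by docker-compose
+connections.connect(host="127.0.0.1", port="19530")
+collection = Collection("audio_table")  # DEFAULT_TABLE in src/config.py
+collection.load()
+
+query = [[0.0] * 192]  # one 192-dim embedding produced by the speaker model
+results = collection.search(
+    data=query,
+    anns_field="embedding",  # assumed vector field name
+    param={"metric_type": "L2", "params": {"nprobe": 16}},
+    limit=10)
+
+# credentials, database name and table layout below are placeholders
+conn = pymysql.connect(host="127.0.0.1", port=3306, user="root", password="123456", db="audio_db")
+with conn.cursor() as cursor:
+    for hit in results[0]:
+        # map each Milvus vector ID back to its audio path
+        cursor.execute("SELECT audio_path FROM audio_table WHERE milvus_id = %s", (hit.id,))
+        print(hit.id, hit.distance, cursor.fetchone())
+```
+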
## Usage
-### 1. Prepare MySQL and Milvus services by docker-compose
+### 1. Prepare PaddleSpeech
+Audio vector extraction relies on models trained with PaddleSpeech, so please make sure PaddleSpeech is installed before running. For the installation steps, see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+
+You can choose one of the easy, medium, and hard ways to install PaddleSpeech.
+
+### 2. Prepare MySQL and Milvus services by docker-compose
The audio similarity search system requires Milvus, MySQL services. We can start these containers with one click through [docker-compose.yaml](./docker-compose.yaml), so please make sure you have [installed Docker Engine](https://docs.docker.com/engine/install/) and [Docker Compose](https://docs.docker.com/compose/install/) before running. then
```bash
+## Enter the audio_searching directory for the following example
+cd ~/PaddleSpeech/demos/audio_searching/
+
+## Then start the related services within the container
docker-compose -f docker-compose.yaml up -d
```
-Then you will see the that all containers are created:
+You will see that all the containers are created:
```bash
Creating network "quick_deploy_app_net" with driver "bridge"
@@ -42,10 +51,10 @@ b2bcf279e599 milvusdb/milvus:v2.0.1 "/tini -- milvus run…" 22 hours ago Up
d8ef4c84e25c mysql:5.7 "docker-entrypoint.s…" 22 hours ago Up 22 hours 0.0.0.0:3306->3306/tcp, 33060/tcp audio-mysql
8fb501edb4f3 quay.io/coreos/etcd:v3.5.0 "etcd -advertise-cli…" 22 hours ago Up 22 hours 2379-2380/tcp milvus-etcd
ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…" 22 hours ago Up 22 hours (healthy) 9000/tcp milvus-minio
-15c84a506754 iregistry.baidu-int.com/paddlespeech/audio-search-client:1.0 "/bin/bash -c '/usr/…" 22 hours ago Up 22 hours (healthy) 0.0.0.0:8068->80/tcp audio-webclient
+15c84a506754 paddlepaddle/paddlespeech-audio-search-client:2.3 "/bin/bash -c '/usr/…" 22 hours ago Up 22 hours (healthy) 0.0.0.0:8068->80/tcp audio-webclient
```
-### 2. Start API Server
+### 3. Start API Server
Then to start the system server, and it provides HTTP backend services.
- Install the Python packages
@@ -53,95 +62,153 @@ Then to start the system server, and it provides HTTP backend services.
```bash
pip install -r requirements.txt
```
-- Set configuration
+- Set the configuration (when running locally, you can skip this step)
```bash
+ ## Method 1: Modify the source file
vim src/config.py
+
+ ## Method 2: Modify the environment variables, as shown below
+ export MILVUS_HOST=127.0.0.1
+ export MYSQL_HOST=127.0.0.1
```
- Modify the parameters according to your own environment. Here listing some parameters that need to be set, for more information please refer to [config.py](./src/config.py).
+ Listed here are some parameters that need to be set; for more information, please refer to [config.py](./src/config.py).
- | **Parameter** | **Description** | **Default setting** |
- | ---------------- | ----------------------------------------------------- | ------------------- |
- | MILVUS_HOST | The IP address of Milvus, you can get it by ifconfig. If running everything on one machine, most likely 127.0.0.1 | 127.0.0.1 |
- | MILVUS_PORT | Port of Milvus. | 19530 |
- | VECTOR_DIMENSION | Dimension of the vectors. | 2048 |
- | MYSQL_HOST | The IP address of Mysql. | 127.0.0.1 |
- | MYSQL_PORT | Port of Milvus. | 3306 |
- | DEFAULT_TABLE | The milvus and mysql default collection name. | audio_table |
+ | **Parameter**    | **Description**           | **Default setting** |
+ | ---------------- | ------------------------- | ------------------- |
+ | MILVUS_HOST      | The IP address of Milvus; you can get it by ifconfig. If running everything on one machine, most likely 127.0.0.1 | 127.0.0.1 |
+ | MILVUS_PORT      | Port of Milvus.           | 19530 |
+ | VECTOR_DIMENSION | Dimension of the vectors. | 192 |
+ | MYSQL_HOST       | The IP address of MySQL.  | 127.0.0.1 |
+ | MYSQL_PORT       | Port of MySQL.            | 3306 |
+ | DEFAULT_TABLE    | The default collection/table name in Milvus and MySQL. | audio_table |
- Run the code
Then start the server with Fastapi.
```bash
- export PYTHONPATH=$PYTHONPATH:./src
+ export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
python src/main.py
```
Then you will see the Application is started:
```bash
- INFO: Started server process [3949]
- 2022-03-07 17:39:14,864 | INFO | server.py | serve | 75 | Started server process [3949]
+ INFO: Started server process [13352]
+ 2022-03-26 22:45:30,838 | INFO | server.py | serve | 75 | Started server process [13352]
INFO: Waiting for application startup.
- 2022-03-07 17:39:14,865 | INFO | on.py | startup | 45 | Waiting for application startup.
+ 2022-03-26 22:45:30,839 | INFO | on.py | startup | 45 | Waiting for application startup.
INFO: Application startup complete.
- 2022-03-07 17:39:14,866 | INFO | on.py | startup | 59 | Application startup complete.
+ 2022-03-26 22:45:30,839 | INFO | on.py | startup | 59 | Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)
- 2022-03-07 17:39:14,867 | INFO | server.py | _log_started_message | 206 | Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)
+ 2022-03-26 22:45:30,840 | INFO | server.py | _log_started_message | 206 | Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)
```
-### 3. Usage
+### 4. Usage
- Prepare data
```bash
wget -c https://www.openslr.org/resources/82/cn-celeb_v2.tar.gz && tar -xvf cn-celeb_v2.tar.gz
```
- Note: If you want to build a quick demo, you can use ./src/test_main.py:download_audio_data function, it downloads 20 audio files , Subsequent results show this collection as an example
+ **Note**: If you want to build a quick demo, you can use the ./src/test_main.py:download_audio_data function, which downloads 20 audio files; the subsequent results are shown on this collection as an example.
+
+- Prepare model (skip this step if you use the default model)
+ ```bash
+ ## Modify model configuration parameters. Currently, only ecapatdnn_voxceleb12 is supported, and multiple types will be supported in the future
+ vim ./src/encode.py
+ ```
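+
+ As a reference, the following is a minimal sketch (not part of the demo code) of how an embedding can be extracted with the `VectorExecutor` used by `src/encode.py`; the audio path is just an example:
+ ```python
+ import numpy as np
+ from paddlespeech.cli import VectorExecutor
+
+ vector_executor = VectorExecutor()
+ # extract a 192-dim speaker embedding and L2-normalize it, as src/encode.py does
+ embedding = vector_executor(
+     audio_file="./example_audio/test.wav", model="ecapatdnn_voxceleb12")
+ embedding = embedding / np.linalg.norm(embedding)
+ print(embedding.shape)  # (192,)
+ ```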
- - scripts test (recommend!)
+- Scripts test (Recommended)
- The internal process is downloading data, loading the Paddlespeech model, extracting embedding, storing library, retrieving and deleting library
+ The internal process is: download the data, load the PaddleSpeech model, extract embeddings, build the library, search it, and finally drop the library
```bash
python ./src/test_main.py
```
Output:
```bash
- Checkpoint path: %your model path%
+ Downloading https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz ...
+ ...
+ Unpacking ./example_audio.tar.gz ...
+ [2022-03-26 22:50:54,987] [ INFO] - checking the aduio file format......
+ [2022-03-26 22:50:54,987] [ INFO] - The sample rate is 16000
+ [2022-03-26 22:50:54,987] [ INFO] - The audio file format is right
+ [2022-03-26 22:50:54,988] [ INFO] - device type: cpu
+ [2022-03-26 22:50:54,988] [ INFO] - load the pretrained model: ecapatdnn_voxceleb12-16k
+ [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz
+ ...
+ [2022-03-26 22:51:17,285] [ INFO] - start to dynamic import the model class
+ [2022-03-26 22:51:17,285] [ INFO] - model name ecapatdnn
+ [2022-03-26 22:51:23,864] [ INFO] - start to set the model parameters to model
+ [2022-03-26 22:54:08,115] [ INFO] - create the model instance success
+ [2022-03-26 22:54:08,116] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_
+ searching/example_audio/knife_hit_iron3.wav
+ [2022-03-26 22:54:08,116] [ INFO] - load the audio sample points, shape is: (11012,)
+ [2022-03-26 22:54:08,150] [ INFO] - extract the audio feat, shape is: (80, 69)
+ [2022-03-26 22:54:08,152] [ INFO] - feats shape: [1, 80, 69]
+ [2022-03-26 22:54:08,154] [ INFO] - audio extract the feat success
+ [2022-03-26 22:54:08,155] [ INFO] - start to do backbone network model forward
+ [2022-03-26 22:54:08,155] [ INFO] - feats shape:[1, 80, 69], lengths shape: [1]
+ [2022-03-26 22:54:08,433] [ INFO] - embedding size: (192,)
Extracting feature from audio No. 1 , 20 audios in total
+ [2022-03-26 22:54:08,435] [ INFO] - checking the aduio file format......
+ [2022-03-26 22:54:08,435] [ INFO] - The sample rate is 16000
+ [2022-03-26 22:54:08,436] [ INFO] - The audio file format is right
+ [2022-03-26 22:54:08,436] [ INFO] - device type: cpu
+ [2022-03-26 22:54:08,436] [ INFO] - Model has been initialized
+ [2022-03-26 22:54:08,436] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/sword_wielding.wav
+ [2022-03-26 22:54:08,436] [ INFO] - load the audio sample points, shape is: (6391,)
+ [2022-03-26 22:54:08,452] [ INFO] - extract the audio feat, shape is: (80, 40)
+ [2022-03-26 22:54:08,454] [ INFO] - feats shape: [1, 80, 40]
+ [2022-03-26 22:54:08,454] [ INFO] - audio extract the feat success
+ [2022-03-26 22:54:08,454] [ INFO] - start to do backbone network model forward
+ [2022-03-26 22:54:08,455] [ INFO] - feats shape:[1, 80, 40], lengths shape: [1]
+ [2022-03-26 22:54:08,633] [ INFO] - embedding size: (192,)
Extracting feature from audio No. 2 , 20 audios in total
...
- 2022-03-09 17:22:13,870 | INFO | main.py | load_audios | 85 | Successfully loaded data, total count: 20
- 2022-03-09 17:22:13,898 | INFO | main.py | count_audio | 147 | Successfully count the number of data!
- 2022-03-09 17:22:13,918 | INFO | main.py | audio_path | 57 | Successfully load audio: ./example_audio/test.wav
+ 2022-03-26 22:54:15,892 | INFO | main.py | load_audios | 85 | Successfully loaded data, total count: 20
+ 2022-03-26 22:54:15,908 | INFO | main.py | count_audio | 148 | Successfully count the number of data!
+ [2022-03-26 22:54:15,916] [ INFO] - checking the aduio file format......
+ [2022-03-26 22:54:15,916] [ INFO] - The sample rate is 16000
+ [2022-03-26 22:54:15,916] [ INFO] - The audio file format is right
+ [2022-03-26 22:54:15,916] [ INFO] - device type: cpu
+ [2022-03-26 22:54:15,916] [ INFO] - Model has been initialized
+ [2022-03-26 22:54:15,916] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/test.wav
+ [2022-03-26 22:54:15,917] [ INFO] - load the audio sample points, shape is: (8456,)
+ [2022-03-26 22:54:15,923] [ INFO] - extract the audio feat, shape is: (80, 53)
+ [2022-03-26 22:54:15,924] [ INFO] - feats shape: [1, 80, 53]
+ [2022-03-26 22:54:15,924] [ INFO] - audio extract the feat success
+ [2022-03-26 22:54:15,924] [ INFO] - start to do backbone network model forward
+ [2022-03-26 22:54:15,924] [ INFO] - feats shape:[1, 80, 53], lengths shape: [1]
+ [2022-03-26 22:54:16,051] [ INFO] - embedding size: (192,)
...
- 2022-03-09 17:22:32,580 | INFO | main.py | search_local_audio | 131 | search result http://testserver/data?audio_path=./example_audio/test.wav, distance 0.0
- 2022-03-09 17:22:32,580 | INFO | main.py | search_local_audio | 131 | search result http://testserver/data?audio_path=./example_audio/knife_chopping.wav, distance 0.021805256605148315
- 2022-03-09 17:22:32,580 | INFO | main.py | search_local_audio | 131 | search result http://testserver/data?audio_path=./example_audio/knife_cut_into_flesh.wav, distance 0.052762262523174286
+ 2022-03-26 22:54:16,086 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/test.wav, score 100.0
+ 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_chopping.wav, score 29.182177782058716
+ 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_cut_into_body.wav, score 22.73637056350708
...
- 2022-03-09 17:22:32,582 | INFO | main.py | search_local_audio | 135 | Successfully searched similar audio!
- 2022-03-09 17:22:33,658 | INFO | main.py | drop_tables | 159 | Successfully drop tables in Milvus and MySQL!
+ 2022-03-26 22:54:16,088 | INFO | main.py | search_local_audio | 136 | Successfully searched similar audio!
+ 2022-03-26 22:54:17,164 | INFO | main.py | drop_tables | 160 | Successfully drop tables in Milvus and MySQL!
```
-- GUI test (optional)
+- GUI test (Optional)
- Navigate to 127.0.0.1:8068 in your browser to access the front-end interface
+ Navigate to 127.0.0.1:8068 in your browser to access the front-end interface.
- Note: If the browser and the service are not on the same machine, then the IP needs to be changed to the IP of the machine where the service is located, and the corresponding API_URL in docker-compose.yaml needs to be changed and the service can be restarted
+ **Note**: If the browser and the service are not on the same machine, the IP needs to be changed to the IP of the machine where the service is located, the corresponding API_URL in docker-compose.yaml needs to be changed accordingly, and docker-compose needs to be run again for the changes to take effect.
- Insert data
- Download the data and decompress it to a path named /home/speech/data. Then enter /home/speech/data in the address bar of the upload page to upload the data
+ Download the data on the server and decompress it into a directory, for example /home/speech/data/. Then enter /home/speech/data/ in the address bar of the upload page to upload the data.

- Search for similar audio
- Select the magnifying glass icon on the left side of the interface. Then, press the "Default Target Audio File" button and upload a .wav sound file you'd like to search. Results will be displayed
+ Select the magnifying glass icon on the left side of the interface. Then, press the "Default Target Audio File" button and upload, from the client, a .wav sound file you'd like to search. The results will be displayed.

-### 4.Result
+### 5. Result
machine configuration:
- OS: CentOS release 7.6
@@ -157,15 +224,12 @@ recall and elapsed time statistics are shown in the following figure:

-The retrieval framework based on Milvus takes about 2.9 milliseconds to retrieve on the premise of 90% recall rate, and it takes about 500 milliseconds for feature extraction (testing audio takes about 5 seconds), that is, a single audio test takes about 503 milliseconds in total, which can meet most application scenarios
+The Milvus-based retrieval framework takes about 2.9 milliseconds to retrieve results at a 90% recall rate, and feature extraction takes about 500 milliseconds (for a test audio of about 5 seconds), i.e. a single audio query takes about 503 milliseconds in total, which can meet most application scenarios.
-### 5.Pretrained Models
+### 6. Pretrained Models
Here is a list of pretrained models released by PaddleSpeech :
| Model | Sample Rate
| :--- | :---:
| ecapa_tdnn | 16000
-| panns_cnn6| 32000
-| panns_cnn10| 32000
-| panns_cnn14| 32000
diff --git a/demos/audio_searching/README_cn.md b/demos/audio_searching/README_cn.md
index d822c00df0a13d0483940818fb9e5a97af613ada..0d0f42a0f69fb1e365370b28027674424ff1031c 100644
--- a/demos/audio_searching/README_cn.md
+++ b/demos/audio_searching/README_cn.md
@@ -4,27 +4,36 @@
# 音频相似性检索
## 介绍
-随着互联网不断发展,电子邮件、社交媒体照片、直播视频、客服语音等非结构化数据已经变得越来越普遍。如果想要使用计算机来处理这些数据,需要使用 embedding 技术将这些数据转化为向量 vector,然后进行存储、建索引、并查询
+随着互联网不断发展,电子邮件、社交媒体照片、直播视频、客服语音等非结构化数据已经变得越来越普遍。如果想要使用计算机来处理这些数据,需要使用 embedding 技术将这些数据转化为向量 vector,然后进行存储、建索引、并查询。
-但是,当数据量很大,比如上亿条音频要做相似度搜索,就比较困难了。穷举法固然可行,但非常耗时。针对这种场景,该demo 将介绍如何使用开源向量数据库 Milvus 搭建音频相似度检索系统
+但是,当数据量很大,比如上亿条音频要做相似度搜索,就比较困难了。穷举法固然可行,但非常耗时。针对这种场景,该 demo 将介绍如何使用开源向量数据库 Milvus 搭建音频相似度检索系统。
-音频检索(如演讲、音乐、说话人等检索)实现了在海量音频数据中查询并找出相似声音(或相同说话人)片段。音频相似性检索系统可用于识别相似的音效、最大限度减少知识产权侵权等,还可以快速的检索声纹库、帮助企业控制欺诈和身份盗用等。在音频数据的分类和统计分析中,音频检索也发挥着重要作用
+音频检索(如演讲、音乐、说话人等检索)实现了在海量音频数据中查询并找出相似声音(或相同说话人)片段。音频相似性检索系统可用于识别相似的音效、最大限度减少知识产权侵权等,还可以快速的检索声纹库、帮助企业控制欺诈和身份盗用等。在音频数据的分类和统计分析中,音频检索也发挥着重要作用。
-在本 demo 中,你将学会如何构建一个音频检索系统,用来检索相似的声音片段。使用基于 PaddleSpeech 预训练模型(音频分类模型,说话人识别模型等)将上传的音频片段转换为向量数据,并存储在 Milvus 中。Milvus 自动为每个向量生成唯一的 ID,然后将 ID 和 相应的音频信息(音频id,音频的说话人id等等)存储在 MySQL,这样就完成建库的工作。用户在检索时,上传测试音频,得到向量,然后在 Milvus 中进行向量相似度搜索,Milvus 返回的检索结果为向量 ID,通过 ID 在 MySQL 内部查询相应的音频信息即可
+在本 demo 中,你将学会如何构建一个音频检索系统,用来检索相似的声音片段。使用基于 PaddleSpeech 预训练模型(音频分类模型,说话人识别模型等)将上传的音频片段转换为向量数据,并存储在 Milvus 中。Milvus 自动为每个向量生成唯一的 ID,然后将 ID 和 相应的音频信息(音频id,音频的说话人id等等)存储在 MySQL,这样就完成建库的工作。用户在检索时,上传测试音频,得到向量,然后在 Milvus 中进行向量相似度搜索,Milvus 返回的检索结果为向量 ID,通过 ID 在 MySQL 内部查询相应的音频信息即可。

-注:该 demo 使用 [CN-Celeb](http://openslr.org/82/) 数据集,包括至少 650000 条音频,3000 个说话人,来建立音频向量库(音频特征,或音频说话人特征),然后通过预设的距离计算方式进行音频(或说话人)检索,这里面数据集也可以使用其他的,根据需要调整,如Librispeech,VoxCeleb,UrbanSound,GloVe,MNIST等
+注:该 demo 使用 [CN-Celeb](http://openslr.org/82/) 数据集,包括至少 650000 条音频,3000 个说话人,来建立音频向量库(音频特征,或音频说话人特征),然后通过预设的距离计算方式进行音频(或说话人)检索,这里面数据集也可以使用其他的,根据需要调整,如Librispeech,VoxCeleb,UrbanSound,GloVe,MNIST等。
## 使用方法
-### 1. MySQL 和 Milvus 安装
-音频相似度搜索系统需要用到 Milvus, MySQL 服务。 我们可以通过 [docker-compose.yaml](./docker-compose.yaml) 一键启动这些容器,所以请确保在运行之前已经安装了 [Docker Engine](https://docs.docker.com/engine/install/) 和 [Docker Compose](https://docs.docker.com/compose/install/)。 即
+### 1. PaddleSpeech 安装
+音频向量的提取需要用到基于 PaddleSpeech 训练的模型,所以请确保在运行之前已经安装了 PaddleSpeech,具体安装步骤,详见[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
+
+你可以从 easy,medium,hard 三种方式中选择一种方式安装。
+
+### 2. MySQL 和 Milvus 安装
+音频相似性的检索需要用到 Milvus, MySQL 服务。 我们可以通过 [docker-compose.yaml](./docker-compose.yaml) 一键启动这些容器,所以请确保在运行之前已经安装了 [Docker Engine](https://docs.docker.com/engine/install/) 和 [Docker Compose](https://docs.docker.com/compose/install/)。 即
```bash
+## 先进入到 audio_searching 目录,如下示例
+cd ~/PaddleSpeech/demos/audio_searching/
+
+## 然后启动容器内的相关服务
docker-compose -f docker-compose.yaml up -d
```
-然后你会看到所有的容器都被创建:
+你会看到所有的容器都被创建:
```bash
Creating network "quick_deploy_app_net" with driver "bridge"
@@ -43,63 +52,74 @@ b2bcf279e599 milvusdb/milvus:v2.0.1 "/tini -- milvus run…" 22 hours ago Up
d8ef4c84e25c mysql:5.7 "docker-entrypoint.s…" 22 hours ago Up 22 hours 0.0.0.0:3306->3306/tcp, 33060/tcp audio-mysql
8fb501edb4f3 quay.io/coreos/etcd:v3.5.0 "etcd -advertise-cli…" 22 hours ago Up 22 hours 2379-2380/tcp milvus-etcd
ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…" 22 hours ago Up 22 hours (healthy) 9000/tcp milvus-minio
-15c84a506754 iregistry.baidu-int.com/paddlespeech/audio-search-client:1.0 "/bin/bash -c '/usr/…" 22 hours ago Up 22 hours (healthy) 0.0.0.0:8068->80/tcp audio-webclient
+15c84a506754 paddlepaddle/paddlespeech-audio-search-client:2.3 "/bin/bash -c '/usr/…" 22 hours ago Up 22 hours (healthy) 0.0.0.0:8068->80/tcp audio-webclient
```
-### 2. 配置并启动 API 服务
-启动系统服务程序,它会提供基于 Http 后端服务
+### 3. 配置并启动 API 服务
+启动系统服务程序,它会提供基于 HTTP 后端服务。
- 安装服务依赖的 python 基础包
```bash
pip install -r requirements.txt
```
-- 修改配置
+- 修改配置(本地运行情况下,一般不用修改,可以跳过该步骤)
```bash
+ ## 方法一:修改源码文件
vim src/config.py
+
+ ## 方法二:修改环境变量,如下所示
+ export MILVUS_HOST=127.0.0.1
+ export MYSQL_HOST=127.0.0.1
```
- 请根据实际环境进行修改。 这里列出了一些需要设置的参数,更多信息请参考 [config.py](./src/config.py)
+ 这里列出了一些需要设置的参数,更多信息请参考 [config.py](./src/config.py)
- | **Parameter** | **Description** | **Default setting** |
- | ---------------- | ----------------------------------------------------- | ------------------- |
- | MILVUS_HOST | The IP address of Milvus, you can get it by ifconfig. If running everything on one machine, most likely 127.0.0.1 | 127.0.0.1 |
- | MILVUS_PORT | Port of Milvus. | 19530 |
- | VECTOR_DIMENSION | Dimension of the vectors. | 2048 |
- | MYSQL_HOST | The IP address of Mysql. | 127.0.0.1 |
- | MYSQL_PORT | Port of Milvus. | 3306 |
- | DEFAULT_TABLE | The milvus and mysql default collection name. | audio_table |
+ | **参数** | **描述** | **默认设置** |
+ | ---------------- | -------------------- | ------------------- |
+ | MILVUS_HOST | Milvus 服务的 IP 地址 | 127.0.0.1 |
+ | MILVUS_PORT | Milvus 服务的端口号 | 19530 |
+ | VECTOR_DIMENSION | 特征向量的维度 | 192 |
+ | MYSQL_HOST | Mysql 服务的 IP 地址 | 127.0.0.1 |
+ | MYSQL_PORT | Mysql 服务的端口号 | 3306 |
+ | DEFAULT_TABLE | 默认存储的表名 | audio_table |
- 运行程序
启动用 Fastapi 构建的服务
```bash
- export PYTHONPATH=$PYTHONPATH:./src
+ export PYTHONPATH=$PYTHONPATH:./src:../../paddleaudio
python src/main.py
```
然后你会看到应用程序启动:
```bash
- INFO: Started server process [3949]
- 2022-03-07 17:39:14,864 | INFO | server.py | serve | 75 | Started server process [3949]
+ INFO: Started server process [13352]
+ 2022-03-26 22:45:30,838 | INFO | server.py | serve | 75 | Started server process [13352]
INFO: Waiting for application startup.
- 2022-03-07 17:39:14,865 | INFO | on.py | startup | 45 | Waiting for application startup.
+ 2022-03-26 22:45:30,839 | INFO | on.py | startup | 45 | Waiting for application startup.
INFO: Application startup complete.
- 2022-03-07 17:39:14,866 | INFO | on.py | startup | 59 | Application startup complete.
+ 2022-03-26 22:45:30,839 | INFO | on.py | startup | 59 | Application startup complete.
INFO: Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)
- 2022-03-07 17:39:14,867 | INFO | server.py | _log_started_message | 206 | Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)
+ 2022-03-26 22:45:30,840 | INFO | server.py | _log_started_message | 206 | Uvicorn running on http://0.0.0.0:8002 (Press CTRL+C to quit)
```
-### 3. 测试方法
+### 4. 测试方法
- 准备数据
```bash
wget -c https://www.openslr.org/resources/82/cn-celeb_v2.tar.gz && tar -xvf cn-celeb_v2.tar.gz
```
- 注:如果希望快速搭建 demo,可以采用 ./src/test_main.py:download_audio_data 内部的 20 条音频,另外后续结果展示以该集合为例
+ **注**:如果希望快速搭建 demo,可以采用 ./src/test_main.py:download_audio_data 内部的 20 条音频,另外后续结果展示以该集合为例
+
+- 准备模型(如果使用默认模型,可以跳过此步骤)
+ ```bash
+ ## 修改模型配置参数,目前 model 仅支持 ecapatdnn_voxceleb12,后续将支持多种类型
+ vim ./src/encode.py
+ ```
- 脚本测试(推荐)
@@ -110,40 +130,88 @@ ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…"
输出:
```bash
- Checkpoint path: %your model path%
+ Downloading https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz ...
+ ...
+ Unpacking ./example_audio.tar.gz ...
+ [2022-03-26 22:50:54,987] [ INFO] - checking the aduio file format......
+ [2022-03-26 22:50:54,987] [ INFO] - The sample rate is 16000
+ [2022-03-26 22:50:54,987] [ INFO] - The audio file format is right
+ [2022-03-26 22:50:54,988] [ INFO] - device type: cpu
+ [2022-03-26 22:50:54,988] [ INFO] - load the pretrained model: ecapatdnn_voxceleb12-16k
+ [2022-03-26 22:50:54,990] [ INFO] - Downloading sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz from https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_0.tar.gz
+ ...
+ [2022-03-26 22:51:17,285] [ INFO] - start to dynamic import the model class
+ [2022-03-26 22:51:17,285] [ INFO] - model name ecapatdnn
+ [2022-03-26 22:51:23,864] [ INFO] - start to set the model parameters to model
+ [2022-03-26 22:54:08,115] [ INFO] - create the model instance success
+ [2022-03-26 22:54:08,116] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_
+ searching/example_audio/knife_hit_iron3.wav
+ [2022-03-26 22:54:08,116] [ INFO] - load the audio sample points, shape is: (11012,)
+ [2022-03-26 22:54:08,150] [ INFO] - extract the audio feat, shape is: (80, 69)
+ [2022-03-26 22:54:08,152] [ INFO] - feats shape: [1, 80, 69]
+ [2022-03-26 22:54:08,154] [ INFO] - audio extract the feat success
+ [2022-03-26 22:54:08,155] [ INFO] - start to do backbone network model forward
+ [2022-03-26 22:54:08,155] [ INFO] - feats shape:[1, 80, 69], lengths shape: [1]
+ [2022-03-26 22:54:08,433] [ INFO] - embedding size: (192,)
Extracting feature from audio No. 1 , 20 audios in total
+ [2022-03-26 22:54:08,435] [ INFO] - checking the aduio file format......
+ [2022-03-26 22:54:08,435] [ INFO] - The sample rate is 16000
+ [2022-03-26 22:54:08,436] [ INFO] - The audio file format is right
+ [2022-03-26 22:54:08,436] [ INFO] - device type: cpu
+ [2022-03-26 22:54:08,436] [ INFO] - Model has been initialized
+ [2022-03-26 22:54:08,436] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/sword_wielding.wav
+ [2022-03-26 22:54:08,436] [ INFO] - load the audio sample points, shape is: (6391,)
+ [2022-03-26 22:54:08,452] [ INFO] - extract the audio feat, shape is: (80, 40)
+ [2022-03-26 22:54:08,454] [ INFO] - feats shape: [1, 80, 40]
+ [2022-03-26 22:54:08,454] [ INFO] - audio extract the feat success
+ [2022-03-26 22:54:08,454] [ INFO] - start to do backbone network model forward
+ [2022-03-26 22:54:08,455] [ INFO] - feats shape:[1, 80, 40], lengths shape: [1]
+ [2022-03-26 22:54:08,633] [ INFO] - embedding size: (192,)
Extracting feature from audio No. 2 , 20 audios in total
...
- 2022-03-09 17:22:13,870 | INFO | main.py | load_audios | 85 | Successfully loaded data, total count: 20
- 2022-03-09 17:22:13,898 | INFO | main.py | count_audio | 147 | Successfully count the number of data!
- 2022-03-09 17:22:13,918 | INFO | main.py | audio_path | 57 | Successfully load audio: ./example_audio/test.wav
+ 2022-03-26 22:54:15,892 | INFO | main.py | load_audios | 85 | Successfully loaded data, total count: 20
+ 2022-03-26 22:54:15,908 | INFO | main.py | count_audio | 148 | Successfully count the number of data!
+ [2022-03-26 22:54:15,916] [ INFO] - checking the aduio file format......
+ [2022-03-26 22:54:15,916] [ INFO] - The sample rate is 16000
+ [2022-03-26 22:54:15,916] [ INFO] - The audio file format is right
+ [2022-03-26 22:54:15,916] [ INFO] - device type: cpu
+ [2022-03-26 22:54:15,916] [ INFO] - Model has been initialized
+ [2022-03-26 22:54:15,916] [ INFO] - Preprocess audio file: /home/zhaoqingen/PaddleSpeech/demos/audio_searching/example_audio/test.wav
+ [2022-03-26 22:54:15,917] [ INFO] - load the audio sample points, shape is: (8456,)
+ [2022-03-26 22:54:15,923] [ INFO] - extract the audio feat, shape is: (80, 53)
+ [2022-03-26 22:54:15,924] [ INFO] - feats shape: [1, 80, 53]
+ [2022-03-26 22:54:15,924] [ INFO] - audio extract the feat success
+ [2022-03-26 22:54:15,924] [ INFO] - start to do backbone network model forward
+ [2022-03-26 22:54:15,924] [ INFO] - feats shape:[1, 80, 53], lengths shape: [1]
+ [2022-03-26 22:54:16,051] [ INFO] - embedding size: (192,)
...
- 2022-03-09 17:22:32,580 | INFO | main.py | search_local_audio | 131 | search result http://testserver/data?audio_path=./example_audio/test.wav, distance 0.0
- 2022-03-09 17:22:32,580 | INFO | main.py | search_local_audio | 131 | search result http://testserver/data?audio_path=./example_audio/knife_chopping.wav, distance 0.021805256605148315
- 2022-03-09 17:22:32,580 | INFO | main.py | search_local_audio | 131 | search result http://testserver/data?audio_path=./example_audio/knife_cut_into_flesh.wav, distance 0.052762262523174286
+ 2022-03-26 22:54:16,086 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/test.wav, score 100.0
+ 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_chopping.wav, score 29.182177782058716
+ 2022-03-26 22:54:16,087 | INFO | main.py | search_local_audio | 132 | search result http://testserver/data?audio_path=./example_audio/knife_cut_into_body.wav, score 22.73637056350708
...
- 2022-03-09 17:22:32,582 | INFO | main.py | search_local_audio | 135 | Successfully searched similar audio!
- 2022-03-09 17:22:33,658 | INFO | main.py | drop_tables | 159 | Successfully drop tables in Milvus and MySQL!
+ 2022-03-26 22:54:16,088 | INFO | main.py | search_local_audio | 136 | Successfully searched similar audio!
+ 2022-03-26 22:54:17,164 | INFO | main.py | drop_tables | 160 | Successfully drop tables in Milvus and MySQL!
```
+
- 前端测试(可选)
在浏览器中输入 127.0.0.1:8068 访问前端页面
- 注:如果浏览器和服务不在同一台机器上,那么 IP 需要修改成服务所在的机器 IP,并且docker-compose.yaml 中相应的 API_URL 也要修改,并重新起服务即可
+ **注**:如果浏览器和服务不在同一台机器上,那么 IP 需要修改成服务所在的机器 IP,并且 docker-compose.yaml 中相应的 API_URL 也要修改,然后重新执行 docker-compose.yaml 文件,使修改生效。
- 上传音频
- 下载数据并解压到一文件夹,假设为 /home/speech/data,那么在上传页面地址栏输入 /home/speech/data 进行数据上传
+ 在服务端下载数据并解压到一文件夹,假设为 /home/speech/data/,那么在上传页面地址栏输入 /home/speech/data/ 进行数据上传

- 检索相似音频
- 选择左上角放大镜,点击 “Default Target Audio File” 按钮,上传测试音频,接着你将看到检索结果
+ 选择左上角放大镜,点击 “Default Target Audio File” 按钮,从客户端上传测试音频,接着你将看到检索结果

-### 4. 结果
+### 5. 结果
机器配置:
- 操作系统: CentOS release 7.6
@@ -158,15 +226,12 @@ ffce340b3790 minio/minio:RELEASE.2020-12-03T00-03-10Z "/usr/bin/docker-ent…"

-基于 milvus 的检索框架在召回率 90% 的前提下,检索耗时约 2.9 毫秒,加上特征提取(Embedding)耗时约 500毫秒(测试音频时长约 5秒),即单条音频测试总共耗时约 503 毫秒,可以满足大多数应用场景
+基于 Milvus 的检索框架在召回率 90% 的前提下,检索耗时约 2.9 毫秒,加上特征提取(Embedding)耗时约 500 毫秒(测试音频时长约 5 秒),即单条音频测试总共耗时约 503 毫秒,可以满足大多数应用场景。
-### 5. 预训练模型
+### 6. 预训练模型
以下是 PaddleSpeech 提供的预训练模型列表:
| 模型 | 采样率
| :--- | :---:
| ecapa_tdnn| 16000
-| panns_cnn6| 32000
-| panns_cnn10| 32000
-| panns_cnn14| 32000
diff --git a/demos/audio_searching/docker-compose.yaml b/demos/audio_searching/docker-compose.yaml
index 8916e76fdc88356d7057b9928e658a88d93bed90..16ac054d6f3162e2f46d478ef9fc89209bdb21a5 100644
--- a/demos/audio_searching/docker-compose.yaml
+++ b/demos/audio_searching/docker-compose.yaml
@@ -64,7 +64,7 @@ services:
webclient:
container_name: audio-webclient
- image: qingen1/paddlespeech-audio-search-client:2.3
+ image: paddlepaddle/paddlespeech-audio-search-client:2.3
networks:
app_net:
ipv4_address: 172.16.23.13
diff --git a/demos/audio_searching/img/insert.png b/demos/audio_searching/img/insert.png
index b9e766bda96f7f3046a6eafae5cfb58f7fc7a378..a01015e4ee786ac27bd54e1572f0ce41bae8f635 100644
Binary files a/demos/audio_searching/img/insert.png and b/demos/audio_searching/img/insert.png differ
diff --git a/demos/audio_searching/img/search.png b/demos/audio_searching/img/search.png
index 26bcd9bddc202b0c129bfae432d5391b9a641d81..cccc7fb92c91369c3804f00133ac310b4b8abedc 100644
Binary files a/demos/audio_searching/img/search.png and b/demos/audio_searching/img/search.png differ
diff --git a/demos/audio_searching/requirements.txt b/demos/audio_searching/requirements.txt
index 9e73361b47327783d58741646d5051fabaebf226..057c6ab927803540d58046e5f6fb0889e0fb3a5e 100644
--- a/demos/audio_searching/requirements.txt
+++ b/demos/audio_searching/requirements.txt
@@ -1,12 +1,13 @@
-soundfile==0.10.3.post1
-librosa==0.8.0
-numpy
-pymysql
-fastapi
-uvicorn
diskcache==5.2.1
+dtaidistance==2.3.1
+fastapi
+librosa==0.8.0
+numpy==1.21.0
+pydantic
pymilvus==2.0.1
+pymysql
python-multipart
-typing
+soundfile==0.10.3.post1
starlette
-pydantic
\ No newline at end of file
+typing
+uvicorn
\ No newline at end of file
diff --git a/demos/audio_searching/src/config.py b/demos/audio_searching/src/config.py
index 72a8fb4beadb6fdd3df7801d13d83a42678219d0..3d6d3d43b24d63194f0becc9de363cf7466d4478 100644
--- a/demos/audio_searching/src/config.py
+++ b/demos/audio_searching/src/config.py
@@ -11,13 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-
import os
############### Milvus Configuration ###############
MILVUS_HOST = os.getenv("MILVUS_HOST", "127.0.0.1")
MILVUS_PORT = int(os.getenv("MILVUS_PORT", "19530"))
-VECTOR_DIMENSION = int(os.getenv("VECTOR_DIMENSION", "2048"))
+VECTOR_DIMENSION = int(os.getenv("VECTOR_DIMENSION", "192"))
INDEX_FILE_SIZE = int(os.getenv("INDEX_FILE_SIZE", "1024"))
METRIC_TYPE = os.getenv("METRIC_TYPE", "L2")
DEFAULT_TABLE = os.getenv("DEFAULT_TABLE", "audio_table")
diff --git a/demos/audio_searching/src/encode.py b/demos/audio_searching/src/encode.py
index eba5c48c0d19b2fd5e8190c92ff79c2c716b1ae8..f67184c295b77555b6c31955cf882f739739d0be 100644
--- a/demos/audio_searching/src/encode.py
+++ b/demos/audio_searching/src/encode.py
@@ -11,11 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import os
-
-import librosa
import numpy as np
+
from logs import LOGGER
+from paddlespeech.cli import VectorExecutor
+
+vector_executor = VectorExecutor()
def get_audio_embedding(path):
@@ -23,16 +24,10 @@ def get_audio_embedding(path):
Use vpr_inference to generate embedding of audio
"""
try:
- RESAMPLE_RATE = 16000
- audio, _ = librosa.load(path, sr=RESAMPLE_RATE, mono=True)
-
- # TODO add infer/python interface to get embedding, now fake it by rand
- # vpr = ECAPATDNN(checkpoint_path=None, device='cuda')
- # embedding = vpr.inference(audio)
- np.random.seed(hash(os.path.basename(path)) % 1000000)
- embedding = np.random.rand(1, 2048)
+ embedding = vector_executor(
+ audio_file=path, model='ecapatdnn_voxceleb12')
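+ # normalize to unit length so that L2 distance in Milvus tracks cosine similarity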
embedding = embedding / np.linalg.norm(embedding)
- embedding = embedding.tolist()[0]
+ embedding = embedding.tolist()
return embedding
except Exception as e:
LOGGER.error(f"Error with embedding:{e}")
diff --git a/demos/audio_searching/src/logs.py b/demos/audio_searching/src/logs.py
index ba3ed069c6428797353b1adcdfb0f5b18b02a8ad..465eb682a4fb1b97422c54f3db1854498abae7c6 100644
--- a/demos/audio_searching/src/logs.py
+++ b/demos/audio_searching/src/logs.py
@@ -11,7 +11,6 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import codecs
import datetime
import logging
import os
@@ -124,7 +123,7 @@ class MultiprocessHandler(logging.FileHandler):
logging.FileHandler.emit(self, record)
except (KeyboardInterrupt, SystemExit):
raise
- except:
+ except Exception as e:
self.handleError(record)
diff --git a/demos/audio_searching/src/operations/load.py b/demos/audio_searching/src/operations/load.py
index 7a295bf344827bdc8f6825e85e2da2563a344d71..80b6375fa134e0d6502febebe95a8da3109c2863 100644
--- a/demos/audio_searching/src/operations/load.py
+++ b/demos/audio_searching/src/operations/load.py
@@ -26,9 +26,8 @@ def get_audios(path):
"""
supported_formats = [".wav", ".mp3", ".ogg", ".flac", ".m4a"]
return [
- item
- for sublist in [[os.path.join(dir, file) for file in files]
- for dir, _, files in list(os.walk(path))]
+ item for sublist in [[os.path.join(dir, file) for file in files]
+ for dir, _, files in list(os.walk(path))]
for item in sublist if os.path.splitext(item)[1] in supported_formats
]
diff --git a/demos/audio_searching/src/test_main.py b/demos/audio_searching/src/test_main.py
index 331208ff159bf95f07c51307854d44334b999e0c..32030bae7f56f72633f9e19a45eb07440ff3f9f3 100644
--- a/demos/audio_searching/src/test_main.py
+++ b/demos/audio_searching/src/test_main.py
@@ -11,12 +11,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
-import zipfile
-
-import gdown
from fastapi.testclient import TestClient
from main import app
+from utils.utility import download
+from utils.utility import unpack
+
client = TestClient(app)
@@ -24,11 +24,11 @@ def download_audio_data():
"""
download audio data
"""
- url = 'https://drive.google.com/uc?id=1bKu21JWBfcZBuEuzFEvPoAX6PmRrgnUp'
- gdown.download(url)
-
- with zipfile.ZipFile('example_audio.zip', 'r') as zip_ref:
- zip_ref.extractall('./example_audio')
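+ # fetch the example audio archive (md5-checked) and unpack it into the working directory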
+ url = "https://paddlespeech.bj.bcebos.com/vector/audio/example_audio.tar.gz"
+ md5sum = "52ac69316c1aa1fdef84da7dd2c67b39"
+ target_dir = "./"
+ filepath = download(url, md5sum, target_dir)
+ unpack(filepath, target_dir, True)
def test_drop():
diff --git a/demos/speaker_verification/README.md b/demos/speaker_verification/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..8739d402da97a576e5c1349fd01913e3c399911e
--- /dev/null
+++ b/demos/speaker_verification/README.md
@@ -0,0 +1,158 @@
+([简体中文](./README_cn.md)|English)
+# Speaker Verification
+
+## Introduction
+
+Speaker Verification refers to the task of extracting a speaker embedding from an audio clip.
+
+This demo is an implementation that extracts a speaker embedding from a specific audio file. It can be done with a single command or a few lines of Python using `PaddleSpeech`.
+
+## Usage
+### 1. Installation
+see [installation](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install.md).
+
+You can choose one way from easy, medium and hard to install paddlespeech.
+
+### 2. Prepare Input File
+The input of this demo should be a WAV file (`.wav`), and the sample rate must be the same as that of the model.
+
+Here are sample files for this demo that can be downloaded:
+```bash
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+```
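+
+If your own recording does not match the model's sample rate (16 kHz for the default model), you can resample it first. A minimal sketch using `sox` (an assumption: `sox` is installed and `input.wav` is your own file):
+```bash
+sox input.wav -r 16000 -c 1 input_16k.wav
+```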
+
+### 3. Usage
+- Command Line(Recommended)
+ ```bash
+ paddlespeech vector --task spk --input 85236145389.wav
+
+ echo -e "demo1 85236145389.wav" > vec.job
+ paddlespeech vector --task spk --input vec.job
+
+ echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
+ ```
+
+ Usage:
+ ```bash
+ paddlespeech vector --help
+ ```
+ Arguments:
+ - `input`(required): Audio file to recognize.
+ - `model`: Model type of vector task. Default: `ecapatdnn_voxceleb12`.
+ - `sample_rate`: Sample rate of the model. Default: `16000`.
+ - `config`: Config of vector task. Use pretrained model when it is None. Default: `None`.
+ - `ckpt_path`: Model checkpoint. Use pretrained model when it is None. Default: `None`.
+ - `device`: Choose device to execute model inference. Default: default device of paddlepaddle in current environment.
+
+ Output:
+
+ ```bash
+ demo [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268
+ -3.04878 1.611095 10.127234 -10.534177 -15.821609
+ 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228
+ -11.343508 2.3385992 -8.719341 14.213509 15.404744
+ -0.39327756 6.338786 2.688887 8.7104025 17.469526
+ -8.77959 7.0576906 4.648855 -1.3089896 -23.294737
+ 8.013747 13.891729 -9.926753 5.655307 -5.9422326
+ -22.842539 0.6293588 -18.46266 -10.811862 9.8192625
+ 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942
+ 1.7594414 -0.6485091 4.485623 2.0207152 7.264915
+ -6.40137 23.63524 2.9711294 -22.708025 9.93719
+ 20.354511 -10.324688 -0.700492 -8.783211 -5.27593
+ 15.999649 3.3004563 12.747926 15.429879 4.7849145
+ 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628
+ 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124
+ -9.224193 14.568347 -10.568833 4.982321 -4.342062
+ 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362
+ -6.680575 0.4757669 -5.035051 -6.7964664 16.865469
+ -11.54324 7.681869 0.44475392 9.708182 -8.932846
+ 0.4123232 -4.361452 1.3948607 9.511665 0.11667654
+ 2.9079323 6.049952 9.275183 -18.078873 6.2983274
+ -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815
+ 4.010979 11.000591 -2.8873312 7.1352735 -16.79663
+ 18.495346 -14.293832 7.89578 2.2714825 22.976387
+ -4.875734 -3.0836344 -2.9999814 13.751918 6.448228
+ -11.924197 2.171869 2.0423572 -6.173772 10.778437
+ 25.77281 -4.9495463 14.57806 0.3044315 2.6132357
+ -7.591999 -2.076944 9.025118 1.7834753 -3.1799617
+ -4.9401326 23.465864 5.1685796 -9.018578 9.037825
+ -4.4150195 6.859591 -12.274467 -0.88911164 5.186309
+ -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652
+ -12.397416 -12.719869 -1.395601 2.1150916 5.7381287
+ -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127
+ 8.731719 -20.778936 -11.495662 5.8033476 -4.752041
+ 10.833007 -6.717991 4.504732 13.4244375 1.1306485
+ 7.3435574 1.400918 14.704036 -9.501399 7.2315617
+ -6.417456 1.3333273 11.872697 -0.30664724 8.8845
+ 6.5569253 4.7948146 0.03662816 -8.704245 6.224871
+ -3.2701402 -11.508579 ]
+ ```
+
+- Python API
+ ```python
+ import paddle
+ from paddlespeech.cli import VectorExecutor
+
+ vector_executor = VectorExecutor()
+ audio_emb = vector_executor(
+ model='ecapatdnn_voxceleb12',
+ sample_rate=16000,
+ config=None,
+ ckpt_path=None,
+ audio_file='./85236145389.wav',
+ force_yes=False,
+ device=paddle.get_device())
+ print('Audio embedding Result: \n{}'.format(audio_emb))
+ ```
+
+ Output:
+ ```bash
+ # Vector Result:
+ [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268
+ -3.04878 1.611095 10.127234 -10.534177 -15.821609
+ 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228
+ -11.343508 2.3385992 -8.719341 14.213509 15.404744
+ -0.39327756 6.338786 2.688887 8.7104025 17.469526
+ -8.77959 7.0576906 4.648855 -1.3089896 -23.294737
+ 8.013747 13.891729 -9.926753 5.655307 -5.9422326
+ -22.842539 0.6293588 -18.46266 -10.811862 9.8192625
+ 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942
+ 1.7594414 -0.6485091 4.485623 2.0207152 7.264915
+ -6.40137 23.63524 2.9711294 -22.708025 9.93719
+ 20.354511 -10.324688 -0.700492 -8.783211 -5.27593
+ 15.999649 3.3004563 12.747926 15.429879 4.7849145
+ 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628
+ 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124
+ -9.224193 14.568347 -10.568833 4.982321 -4.342062
+ 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362
+ -6.680575 0.4757669 -5.035051 -6.7964664 16.865469
+ -11.54324 7.681869 0.44475392 9.708182 -8.932846
+ 0.4123232 -4.361452 1.3948607 9.511665 0.11667654
+ 2.9079323 6.049952 9.275183 -18.078873 6.2983274
+ -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815
+ 4.010979 11.000591 -2.8873312 7.1352735 -16.79663
+ 18.495346 -14.293832 7.89578 2.2714825 22.976387
+ -4.875734 -3.0836344 -2.9999814 13.751918 6.448228
+ -11.924197 2.171869 2.0423572 -6.173772 10.778437
+ 25.77281 -4.9495463 14.57806 0.3044315 2.6132357
+ -7.591999 -2.076944 9.025118 1.7834753 -3.1799617
+ -4.9401326 23.465864 5.1685796 -9.018578 9.037825
+ -4.4150195 6.859591 -12.274467 -0.88911164 5.186309
+ -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652
+ -12.397416 -12.719869 -1.395601 2.1150916 5.7381287
+ -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127
+ 8.731719 -20.778936 -11.495662 5.8033476 -4.752041
+ 10.833007 -6.717991 4.504732 13.4244375 1.1306485
+ 7.3435574 1.400918 14.704036 -9.501399 7.2315617
+ -6.417456 1.3333273 11.872697 -0.30664724 8.8845
+ 6.5569253 4.7948146 0.03662816 -8.704245 6.224871
+ -3.2701402 -11.508579 ]
+ ```
+
+### 4. Pretrained Models
+
+Here is a list of pretrained models released by PaddleSpeech that can be used by command and python API:
+
+| Model | Sample Rate
+| :--- | :---: |
+| ecapatdnn_voxceleb12 | 16k
diff --git a/demos/speaker_verification/README_cn.md b/demos/speaker_verification/README_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe8949b3ca6d9de77e5095d6bc55844133b73f52
--- /dev/null
+++ b/demos/speaker_verification/README_cn.md
@@ -0,0 +1,155 @@
+(简体中文|[English](./README.md))
+
+# 声纹识别
+## 介绍
+声纹识别是一项用计算机程序自动提取说话人特征的技术。
+
+这个 demo 是一个从给定音频文件提取说话人特征的实现,它可以通过使用 `PaddleSpeech` 的单个命令或 python 中的几行代码来实现。
+
+## 使用方法
+### 1. 安装
+请看[安装文档](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/docs/source/install_cn.md)。
+
+你可以从 easy,medium,hard 三种方式中选择一种方式安装。
+
+### 2. 准备输入
+这个 demo 的输入应该是一个 WAV 文件(`.wav`),并且采样率必须与模型的采样率相同。
+
+可以下载此 demo 的示例音频:
+```bash
+# 该音频的内容是数字串 85236145389
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+```
+### 3. 使用方法
+- 命令行 (推荐使用)
+ ```bash
+ paddlespeech vector --task spk --input 85236145389.wav
+
+ echo -e "demo1 85236145389.wav" > vec.job
+ paddlespeech vector --task spk --input vec.job
+
+ echo -e "demo2 85236145389.wav \n demo3 85236145389.wav" | paddlespeech vector --task spk
+ ```
+
+ 使用方法:
+ ```bash
+ paddlespeech vector --help
+ ```
+ 参数:
+ - `input`(必须输入):用于识别的音频文件。
+ - `model`:声纹任务的模型,默认值:`ecapatdnn_voxceleb12`。
+ - `sample_rate`:音频采样率,默认值:`16000`。
+ - `config`:声纹任务的参数文件,若不设置则使用预训练模型中的默认配置,默认值:`None`。
+ - `ckpt_path`:模型参数文件,若不设置则下载预训练模型使用,默认值:`None`。
+ - `device`:执行预测的设备,默认值:当前系统下 paddlepaddle 的默认 device。
+
+ 输出:
+ ```bash
+ demo [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268
+ -3.04878 1.611095 10.127234 -10.534177 -15.821609
+ 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228
+ -11.343508 2.3385992 -8.719341 14.213509 15.404744
+ -0.39327756 6.338786 2.688887 8.7104025 17.469526
+ -8.77959 7.0576906 4.648855 -1.3089896 -23.294737
+ 8.013747 13.891729 -9.926753 5.655307 -5.9422326
+ -22.842539 0.6293588 -18.46266 -10.811862 9.8192625
+ 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942
+ 1.7594414 -0.6485091 4.485623 2.0207152 7.264915
+ -6.40137 23.63524 2.9711294 -22.708025 9.93719
+ 20.354511 -10.324688 -0.700492 -8.783211 -5.27593
+ 15.999649 3.3004563 12.747926 15.429879 4.7849145
+ 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628
+ 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124
+ -9.224193 14.568347 -10.568833 4.982321 -4.342062
+ 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362
+ -6.680575 0.4757669 -5.035051 -6.7964664 16.865469
+ -11.54324 7.681869 0.44475392 9.708182 -8.932846
+ 0.4123232 -4.361452 1.3948607 9.511665 0.11667654
+ 2.9079323 6.049952 9.275183 -18.078873 6.2983274
+ -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815
+ 4.010979 11.000591 -2.8873312 7.1352735 -16.79663
+ 18.495346 -14.293832 7.89578 2.2714825 22.976387
+ -4.875734 -3.0836344 -2.9999814 13.751918 6.448228
+ -11.924197 2.171869 2.0423572 -6.173772 10.778437
+ 25.77281 -4.9495463 14.57806 0.3044315 2.6132357
+ -7.591999 -2.076944 9.025118 1.7834753 -3.1799617
+ -4.9401326 23.465864 5.1685796 -9.018578 9.037825
+ -4.4150195 6.859591 -12.274467 -0.88911164 5.186309
+ -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652
+ -12.397416 -12.719869 -1.395601 2.1150916 5.7381287
+ -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127
+ 8.731719 -20.778936 -11.495662 5.8033476 -4.752041
+ 10.833007 -6.717991 4.504732 13.4244375 1.1306485
+ 7.3435574 1.400918 14.704036 -9.501399 7.2315617
+ -6.417456 1.3333273 11.872697 -0.30664724 8.8845
+ 6.5569253 4.7948146 0.03662816 -8.704245 6.224871
+ -3.2701402 -11.508579 ]
+ ```
+
+- Python API
+ ```python
+ import paddle
+ from paddlespeech.cli import VectorExecutor
+
+ vector_executor = VectorExecutor()
+ audio_emb = vector_executor(
+ model='ecapatdnn_voxceleb12',
+ sample_rate=16000,
+ config=None, # Set `config` and `ckpt_path` to None to use pretrained model.
+ ckpt_path=None,
+ audio_file='./85236145389.wav',
+ force_yes=False,
+ device=paddle.get_device())
+ print('Audio embedding Result: \n{}'.format(audio_emb))
+ ```
+
+ 输出:
+ ```bash
+ # Vector Result:
+ [ -5.749211 9.505463 -8.200284 -5.2075014 5.3940268
+ -3.04878 1.611095 10.127234 -10.534177 -15.821609
+ 1.2032688 -0.35080156 1.2629458 -12.643498 -2.5758228
+ -11.343508 2.3385992 -8.719341 14.213509 15.404744
+ -0.39327756 6.338786 2.688887 8.7104025 17.469526
+ -8.77959 7.0576906 4.648855 -1.3089896 -23.294737
+ 8.013747 13.891729 -9.926753 5.655307 -5.9422326
+ -22.842539 0.6293588 -18.46266 -10.811862 9.8192625
+ 3.0070958 3.8072643 -2.3861165 3.0821571 -14.739942
+ 1.7594414 -0.6485091 4.485623 2.0207152 7.264915
+ -6.40137 23.63524 2.9711294 -22.708025 9.93719
+ 20.354511 -10.324688 -0.700492 -8.783211 -5.27593
+ 15.999649 3.3004563 12.747926 15.429879 4.7849145
+ 5.6699696 -2.3826702 10.605882 3.9112158 3.1500628
+ 15.859915 -2.1832209 -23.908653 -6.4799504 -4.5365124
+ -9.224193 14.568347 -10.568833 4.982321 -4.342062
+ 0.0914714 12.645902 -5.74285 -3.2141201 -2.7173362
+ -6.680575 0.4757669 -5.035051 -6.7964664 16.865469
+ -11.54324 7.681869 0.44475392 9.708182 -8.932846
+ 0.4123232 -4.361452 1.3948607 9.511665 0.11667654
+ 2.9079323 6.049952 9.275183 -18.078873 6.2983274
+ -0.7500531 -2.725033 -7.6027865 3.3404543 2.990815
+ 4.010979 11.000591 -2.8873312 7.1352735 -16.79663
+ 18.495346 -14.293832 7.89578 2.2714825 22.976387
+ -4.875734 -3.0836344 -2.9999814 13.751918 6.448228
+ -11.924197 2.171869 2.0423572 -6.173772 10.778437
+ 25.77281 -4.9495463 14.57806 0.3044315 2.6132357
+ -7.591999 -2.076944 9.025118 1.7834753 -3.1799617
+ -4.9401326 23.465864 5.1685796 -9.018578 9.037825
+ -4.4150195 6.859591 -12.274467 -0.88911164 5.186309
+ -3.9988663 -13.638606 -9.925445 -0.06329413 -3.6709652
+ -12.397416 -12.719869 -1.395601 2.1150916 5.7381287
+ -4.4691963 -3.82819 -0.84233856 -1.1604277 -13.490127
+ 8.731719 -20.778936 -11.495662 5.8033476 -4.752041
+ 10.833007 -6.717991 4.504732 13.4244375 1.1306485
+ 7.3435574 1.400918 14.704036 -9.501399 7.2315617
+ -6.417456 1.3333273 11.872697 -0.30664724 8.8845
+ 6.5569253 4.7948146 0.03662816 -8.704245 6.224871
+ -3.2701402 -11.508579 ]
+ ```
+
+### 4. 预训练模型
+以下是 PaddleSpeech 提供的可以被命令行和 python API 使用的预训练模型列表:
+
+| 模型 | 采样率
+| :--- | :---: |
+| ecapatdnn_voxceleb12 | 16k
diff --git a/demos/speaker_verification/run.sh b/demos/speaker_verification/run.sh
new file mode 100644
index 0000000000000000000000000000000000000000..856886d333cd30f983576875e809ed2016a51f50
--- /dev/null
+++ b/demos/speaker_verification/run.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+
+# speaker verification
+paddlespeech vector --task spk --input ./85236145389.wav
\ No newline at end of file
diff --git a/demos/speech_server/README.md b/demos/speech_server/README.md
index 10489e7131408ac8c074797f543e8e0edefa289e..0323d3983ab58f40285f81f135dedf2f9f019b7e 100644
--- a/demos/speech_server/README.md
+++ b/demos/speech_server/README.md
@@ -15,8 +15,8 @@ You can choose one way from meduim and hard to install paddlespeech.
### 2. Prepare config File
The configuration file can be found in `conf/application.yaml` .
-Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of _.
-At present, the speech tasks integrated by the service include: asr (speech recognition) and tts (speech synthesis).
+Among them, `engine_list` indicates the speech engine that will be included in the service to be started, in the format of `<speech task>_<engine type>`.
+At present, the speech tasks integrated by the service include: asr (speech recognition), tts (text to speech) and cls (audio classification).
Currently the engine type supports two forms: python and inference (Paddle Inference)
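+
+For example, with all three python engines enabled, the entry might look like `engine_list: ['asr_python', 'tts_python', 'cls_python']` (illustrative values; check the shipped `conf/application.yaml` for the exact defaults).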
diff --git a/demos/speech_server/README_cn.md b/demos/speech_server/README_cn.md
index 2bd8af6c91f88045cad2aed643ebe524148f6184..687b51f10aca14936b20f6d6667d13644049c380 100644
--- a/demos/speech_server/README_cn.md
+++ b/demos/speech_server/README_cn.md
@@ -17,7 +17,7 @@
### 2. 准备配置文件
配置文件可参见 `conf/application.yaml` 。
其中,`engine_list`表示即将启动的服务将会包含的语音引擎,格式为 <语音任务>_<引擎类型>。
-目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)。
+目前服务集成的语音任务有: asr(语音识别)、tts(语音合成)以及 cls(音频分类)。
目前引擎类型支持两种形式:python 及 inference (Paddle Inference)
diff --git a/docs/source/released_model.md b/docs/source/released_model.md
index 62986da03d10cc29dd8ba76565feab0555cf3ba7..9a423e03ecf685dc853119be2c69b9219ea1536a 100644
--- a/docs/source/released_model.md
+++ b/docs/source/released_model.md
@@ -8,7 +8,8 @@ Acoustic Model | Training Data | Token-based | Size | Descriptions | CER | WER |
:-------------:| :------------:| :-----: | -----: | :-----: |:-----:| :-----: | :-----: | :-----:
[Ds2 Online Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 345 MB | 2 Conv + 5 LSTM layers with only forward direction | 0.080 |-| 151 h | [D2 Online Aishell ASR0](../../examples/aishell/asr0)
[Ds2 Offline Aishell ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_aishell_ckpt_0.1.1.model.tar.gz)| Aishell Dataset | Char-based | 306 MB | 2 Conv + 3 bidirectional GRU layers| 0.064 |-| 151 h | [Ds2 Offline Aishell ASR0](../../examples/aishell/asr0)
-[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 284 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.056 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
+[Conformer Online Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_chunk_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0565 |-| 151 h | [Conformer Online Aishell ASR1](../../examples/aishell/asr1)
+[Conformer Offline Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_conformer_aishell_ckpt_0.1.2.model.tar.gz) | Aishell Dataset | Char-based | 189 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0483 |-| 151 h | [Conformer Offline Aishell ASR1](../../examples/aishell/asr1)
[Transformer Aishell ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/aishell/asr1/asr1_transformer_aishell_ckpt_0.1.1.model.tar.gz) | Aishell Dataset | Char-based | 128 MB | Encoder:Transformer, Decoder:Transformer, Decoding method: Attention rescoring | 0.0523 || 151 h | [Transformer Aishell ASR1](../../examples/aishell/asr1)
[Ds2 Offline Librispeech ASR0 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr0/asr0_deepspeech2_librispeech_ckpt_0.1.1.model.tar.gz)| Librispeech Dataset | Char-based | 518 MB | 2 Conv + 3 bidirectional LSTM layers| - |0.0725| 960 h | [Ds2 Offline Librispeech ASR0](../../examples/librispeech/asr0)
[Conformer Librispeech ASR1 Model](https://paddlespeech.bj.bcebos.com/s2t/librispeech/asr1/asr1_conformer_librispeech_ckpt_0.1.1.model.tar.gz) | Librispeech Dataset | subword-based | 191 MB | Encoder:Conformer, Decoder:Transformer, Decoding method: Attention rescoring |-| 0.0337 | 960 h | [Conformer Librispeech ASR1](../../examples/librispeech/asr1)
@@ -54,8 +55,9 @@ Parallel WaveGAN| VCTK |[PWGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeec
|Multi Band MelGAN | CSMSC |[MB MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc3) | [mb_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_ckpt_0.1.1.zip) [mb_melgan_baker_finetune_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_baker_finetune_ckpt_0.5.zip)|[mb_melgan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/mb_melgan/mb_melgan_csmsc_static_0.1.1.zip) |8.2MB|
Style MelGAN | CSMSC |[Style MelGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc4)|[style_melgan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/style_melgan/style_melgan_csmsc_ckpt_0.1.1.zip)| | |
HiFiGAN | CSMSC |[HiFiGAN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc5)|[hifigan_csmsc_ckpt_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_ckpt_0.1.1.zip)|[hifigan_csmsc_static_0.1.1.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_csmsc_static_0.1.1.zip)|50MB|
+HiFiGAN | LJSpeech |[HiFiGAN-ljspeech](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/ljspeech/voc5)|[hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip)|||
HiFiGAN | AISHELL-3 |[HiFiGAN-aishell3](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|||
-HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_aishell3_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip)|||
+HiFiGAN | VCTK |[HiFiGAN-vctk](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/voc5)|[hifigan_vctk_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_vctk_ckpt_0.2.0.zip)|||
WaveRNN | CSMSC |[WaveRNN-csmsc](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/csmsc/voc6)|[wavernn_csmsc_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_ckpt_0.2.0.zip)|[wavernn_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/wavernn/wavernn_csmsc_static_0.2.0.zip)|18MB|
@@ -74,6 +76,12 @@ Model Type | Dataset| Example Link | Pretrained Models | Static Models
PANN | Audioset| [audioset_tagging_cnn](https://github.com/qiuqiangkong/audioset_tagging_cnn) | [panns_cnn6.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn6.pdparams), [panns_cnn10.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn10.pdparams), [panns_cnn14.pdparams](https://bj.bcebos.com/paddleaudio/models/panns_cnn14.pdparams) | [panns_cnn6_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn6_static.tar.gz)(18M), [panns_cnn10_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn10_static.tar.gz)(19M), [panns_cnn14_static.tar.gz](https://paddlespeech.bj.bcebos.com/cls/inference_model/panns_cnn14_static.tar.gz)(289M)
PANN | ESC-50 |[pann-esc50](../../examples/esc50/cls0)|[esc50_cnn6.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn6.tar.gz), [esc50_cnn10.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn10.tar.gz), [esc50_cnn14.tar.gz](https://paddlespeech.bj.bcebos.com/cls/esc50/esc50_cnn14.tar.gz)
+## Speaker Verification Models
+
+Model Type | Dataset| Example Link | Pretrained Models | Static Models
+:-------------:| :------------:| :-----: | :-----: | :-----:
+ECAPA-TDNN | VoxCeleb | [voxceleb_ecapatdnn](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/voxceleb/sv0) | [ecapatdnn.tar.gz](https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_1.tar.gz) | -
+
## Punctuation Restoration Models
Model Type | Dataset| Example Link | Pretrained Models
:-------------:| :------------:| :-----: | :-----:
diff --git a/examples/aishell/asr1/README.md b/examples/aishell/asr1/README.md
index 1226a4f4ec8518f8a3fe88967e0494fe2cc1eddc..5277a31ebab0392d5a2857aaab0f47973b511fc6 100644
--- a/examples/aishell/asr1/README.md
+++ b/examples/aishell/asr1/README.md
@@ -168,30 +168,7 @@ bash local/data.sh --stage -1 --stop_stage -1
bash local/data.sh --stage 2 --stop_stage 2
CUDA_VISIBLE_DEVICES= ./local/test.sh conf/transformer.yaml exp/transformer/checkpoints/avg_20
```
-The performance of the released models are shown below:
-### Conformer
-| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
-| --------- | ------ | ------------------- | ---------------- | -------- | ---------------------- | ---- | -------- |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
-### Chunk Conformer
-Need set `decoding.decoding_chunk_size=16` when decoding.
-| Model | Params | Config | Augmentation | Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
-| --------- | ------ | ------------------------- | ---------------- | -------- | ---------------------- | ------------------------ | ---- | -------- |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | - | 0.061939 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
-
-### Transformer
-| Model | Params | Config | Augmentation | Test set | Decode method | Loss | CER |
-| ----------- | ------ | --------------------- | ------------ | -------- | ---------------------- | ----------------- | -------- |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention | 3.858648955821991 | 0.057293 |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_greedy_search | 3.858648955821991 | 0.061837 |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | ctc_prefix_beam_search | 3.858648955821991 | 0.061685 |
-| transformer | 31.95M | conf/transformer.yaml | spec_aug | test | attention_rescoring | 3.858648955821991 | 0.053844 |
+[The performance of the released models](https://github.com/PaddlePaddle/PaddleSpeech/blob/develop/examples/aishell/asr1/RESULTS.md)
## Stage 4: CTC Alignment
If you want to get the alignment between the audio and the text, you can use the ctc alignment. The code of this stage is shown below:
```bash
diff --git a/examples/aishell/asr1/RESULTS.md b/examples/aishell/asr1/RESULTS.md
index b68d69924d6d3db4306b6d866a3201e74e1536d6..73cd57bda18a910cb6b4bd4d26e179308341e1d1 100644
--- a/examples/aishell/asr1/RESULTS.md
+++ b/examples/aishell/asr1/RESULTS.md
@@ -1,24 +1,27 @@
# Aishell
## Conformer
-
-| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
-| --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention | - | 0.059858 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_greedy_search | - | 0.062311 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | - | 0.062196 |
-| conformer | 47.07M | conf/conformer.yaml | spec_aug + shift | test | attention_rescoring | - | 0.054694 |
+paddle version: 2.2.2
+paddlespeech version: 0.1.2
+| Model | Params | Config | Augmentation| Test set | Decode method | Loss | CER |
+| --- | --- | --- | --- | --- | --- | --- | --- |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention | - | 0.0548 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | ctc_greedy_search | - | 0.05127 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug| test | ctc_prefix_beam_search | - | 0.05131 |
+| conformer | 47.07M | conf/conformer.yaml | spec_aug | test | attention_rescoring | - | 0.04829 |
## Chunk Conformer
+paddle version: 2.2.2
+paddlespeech version: 0.1.2
You need to set `decoding.decoding_chunk_size=16` when decoding.
| Model | Params | Config | Augmentation| Test set | Decode method | Chunk Size & Left Chunks | Loss | CER |
| --- | --- | --- | --- | --- | --- | --- | --- | --- |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention | 16, -1 | - | 0.061939 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_greedy_search | 16, -1 | - | 0.070806 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | ctc_prefix_beam_search | 16, -1 | - | 0.070739 |
-| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug + shift | test | attention_rescoring | 16, -1 | - | 0.059400 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention | 16, -1 | - | 0.0573884 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_greedy_search | 16, -1 | - | 0.06599091 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | ctc_prefix_beam_search | 16, -1 | - | 0.065991 |
+| conformer | 47.06M | conf/chunk_conformer.yaml | spec_aug | test | attention_rescoring | 16, -1 | - | 0.056502 |
## Transformer
diff --git a/examples/aishell/asr1/conf/chunk_conformer.yaml b/examples/aishell/asr1/conf/chunk_conformer.yaml
index 68e852ba77770cd0de9b4c33e93ee3ed777fe674..9f70e4c576d06f06f13635c8c19b2ca46842e841 100644
--- a/examples/aishell/asr1/conf/chunk_conformer.yaml
+++ b/examples/aishell/asr1/conf/chunk_conformer.yaml
@@ -39,6 +39,7 @@ model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
+ init_type: 'kaiming_uniform'
###########################################
# Data #
@@ -61,28 +62,29 @@ feat_dim: 80
stride_ms: 10.0
window_ms: 25.0
sortagrad: 0 # Feed samples from shortest to longest ; -1: enabled for all epochs, 0: disabled, other: enabled for 'other' epochs
-batch_size: 64
+batch_size: 32
maxlen_in: 512 # if input length > maxlen-in, batchsize is automatically reduced
maxlen_out: 150 # if output length > maxlen-out, batchsize is automatically reduced
minibatches: 0 # for debug
batch_count: auto
-batch_bins: 0
+batch_bins: 0
batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
-num_workers: 0
+num_workers: 2
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
-n_epoch: 240
-accum_grad: 2
+n_epoch: 180
+accum_grad: 1
global_grad_clip: 5.0
+dist_sampler: True
optim: adam
optim_conf:
- lr: 0.002
+ lr: 0.001
weight_decay: 1.0e-6
scheduler: warmuplr
scheduler_conf:
@@ -92,4 +94,3 @@ log_interval: 100
checkpoint:
kbest_n: 50
latest_n: 5
-
diff --git a/examples/aishell/asr1/conf/conformer.yaml b/examples/aishell/asr1/conf/conformer.yaml
index 775a4527d49925e6f0aaf73a2d9b6f7bc37657da..a150a04d55671edf25e5871b1695bcad14710367 100644
--- a/examples/aishell/asr1/conf/conformer.yaml
+++ b/examples/aishell/asr1/conf/conformer.yaml
@@ -37,6 +37,7 @@ model_conf:
ctc_weight: 0.3
lsm_weight: 0.1 # label smoothing option
length_normalized_loss: false
+ init_type: 'kaiming_uniform'
###########################################
# Data #
@@ -75,6 +76,7 @@ num_encs: 1
n_epoch: 240
accum_grad: 2
global_grad_clip: 5.0
+dist_sampler: True
optim: adam
optim_conf:
lr: 0.002
diff --git a/examples/aishell/asr1/conf/preprocess.yaml b/examples/aishell/asr1/conf/preprocess.yaml
index f7f4c58d5228beea1b6462355bb5970d6f070ad4..d3992cb9fc3e46b2a8779d4e276223e1477570af 100644
--- a/examples/aishell/asr1/conf/preprocess.yaml
+++ b/examples/aishell/asr1/conf/preprocess.yaml
@@ -23,7 +23,3 @@ process:
n_mask: 2
inplace: true
replace_with_zero: false
-
-
-
-
diff --git a/examples/aishell/asr1/conf/transformer.yaml b/examples/aishell/asr1/conf/transformer.yaml
index 9d2946537b44ed55f59dbebc09de2ef7571324bf..9e08ea0ec79168fb969cb3b13a54be60e94157af 100644
--- a/examples/aishell/asr1/conf/transformer.yaml
+++ b/examples/aishell/asr1/conf/transformer.yaml
@@ -61,16 +61,17 @@ batch_frames_in: 0
batch_frames_out: 0
batch_frames_inout: 0
preprocess_config: conf/preprocess.yaml
-num_workers: 0
+num_workers: 2
subsampling_factor: 1
num_encs: 1
###########################################
# Training #
###########################################
-n_epoch: 240
+n_epoch: 30
accum_grad: 2
global_grad_clip: 5.0
+dist_sampler: False
optim: adam
optim_conf:
lr: 0.002
diff --git a/examples/ami/sd0/local/ami_prepare.py b/examples/ami/sd0/local/ami_prepare.py
index 01582dbdd3350f3acc2d2c2942fefc55d99c5189..1f02afe0004834000408ed8b592740ae8bd6c8b9 100644
--- a/examples/ami/sd0/local/ami_prepare.py
+++ b/examples/ami/sd0/local/ami_prepare.py
@@ -18,18 +18,17 @@ Download: http://groups.inf.ed.ac.uk/ami/download/
Prepares metadata files (JSON) from manual annotations "segments/" using RTTM format (Oracle VAD).
"""
-
import argparse
import glob
import json
import logging
import os
import xml.etree.ElementTree as et
-from distutils.util import strtobool
from ami_splits import get_AMI_split
from dataio import load_pkl
from dataio import save_pkl
+from distutils.util import strtobool
logger = logging.getLogger(__name__)
SAMPLERATE = 16000
diff --git a/examples/csmsc/tts3/README.md b/examples/csmsc/tts3/README.md
index 7b803526f9144e91d336f07eea4fa75320c645a3..ae8f7af607253861f96e5c59ac23f8e7c0d69c0e 100644
--- a/examples/csmsc/tts3/README.md
+++ b/examples/csmsc/tts3/README.md
@@ -226,8 +226,11 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path}
Pretrained FastSpeech2 model with no silence in the edge of audios:
- [fastspeech2_nosil_baker_ckpt_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_ckpt_0.4.zip)
- [fastspeech2_conformer_baker_ckpt_0.5.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_conformer_baker_ckpt_0.5.zip)
+- [fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_cnndecoder_csmsc_ckpt_1.0.0.zip)
-The static model can be downloaded here [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip).
+The static model can be downloaded here:
+- [fastspeech2_nosil_baker_static_0.4.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_nosil_baker_static_0.4.zip)
+- [fastspeech2_csmsc_static_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/fastspeech2/fastspeech2_csmsc_static_0.2.0.zip)
Model | Step | eval/loss | eval/l1_loss | eval/duration_loss | eval/pitch_loss| eval/energy_loss
:-------------:| :------------:| :-----: | :-----: | :--------: |:--------:|:---------:
diff --git a/examples/csmsc/tts3/conf/cnndecoder.yaml b/examples/csmsc/tts3/conf/cnndecoder.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..8b46fea44254bf4b11ae6f9f5a389924e2ae68c0
--- /dev/null
+++ b/examples/csmsc/tts3/conf/cnndecoder.yaml
@@ -0,0 +1,107 @@
+# use CNN decoder
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+
+fs: 24000 # sr
+n_fft: 2048 # FFT size (samples).
+n_shift: 300 # Hop size (samples). 12.5ms
+win_length: 1200 # Window length (samples). 50ms
+ # If set to null, it will be the same as fft_size.
+window: "hann" # Window function.
+
+# Only used for feats_type != raw
+
+fmin: 80 # Minimum frequency of Mel basis.
+fmax: 7600 # Maximum frequency of Mel basis.
+n_mels: 80 # The number of mel basis.
+
+# Only used for the model using pitch features (e.g. FastSpeech2)
+f0min: 80 # Minimum f0 for pitch extraction.
+f0max: 400 # Maximum f0 for pitch extraction.
+
+
+###########################################################
+# DATA SETTING #
+###########################################################
+batch_size: 64
+num_workers: 4
+
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+model:
+ adim: 384 # attention dimension
+ aheads: 2 # number of attention heads
+ elayers: 4 # number of encoder layers
+ eunits: 1536 # number of encoder ff units
+ dlayers: 4 # number of decoder layers
+ dunits: 1536 # number of decoder ff units
+ positionwise_layer_type: conv1d # type of position-wise layer
+ positionwise_conv_kernel_size: 3 # kernel size of position wise conv layer
+ duration_predictor_layers: 2 # number of layers of duration predictor
+ duration_predictor_chans: 256 # number of channels of duration predictor
+ duration_predictor_kernel_size: 3 # filter size of duration predictor
+ postnet_layers: 5 # number of layers of postnet
+ postnet_filts: 5 # filter size of conv layers in postnet
+ postnet_chans: 256 # number of channels of conv layers in postnet
+ use_scaled_pos_enc: True # whether to use scaled positional encoding
+ encoder_normalize_before: True # whether to perform layer normalization before the input
+ decoder_normalize_before: True # whether to perform layer normalization before the input
+ reduction_factor: 1 # reduction factor
+ encoder_type: transformer # encoder type
+ decoder_type: cnndecoder # decoder type
+ init_type: xavier_uniform # initialization type
+ init_enc_alpha: 1.0 # initial value of alpha of encoder scaled position encoding
+ init_dec_alpha: 1.0 # initial value of alpha of decoder scaled position encoding
+ transformer_enc_dropout_rate: 0.2 # dropout rate for transformer encoder layer
+ transformer_enc_positional_dropout_rate: 0.2 # dropout rate for transformer encoder positional encoding
+ transformer_enc_attn_dropout_rate: 0.2 # dropout rate for transformer encoder attention layer
+ cnn_dec_dropout_rate: 0.2 # dropout rate for cnn decoder layer
+ cnn_postnet_dropout_rate: 0.2
+ cnn_postnet_resblock_kernel_sizes: [256, 256] # kernel sizes for residual block of cnn_postnet
+ cnn_postnet_kernel_size: 5 # kernel size of cnn_postnet
+ cnn_decoder_embedding_dim: 256
+ pitch_predictor_layers: 5 # number of conv layers in pitch predictor
+ pitch_predictor_chans: 256 # number of channels of conv layers in pitch predictor
+ pitch_predictor_kernel_size: 5 # kernel size of conv layers in pitch predictor
+ pitch_predictor_dropout: 0.5 # dropout rate in pitch predictor
+ pitch_embed_kernel_size: 1 # kernel size of conv embedding layer for pitch
+ pitch_embed_dropout: 0.0 # dropout rate after conv embedding layer for pitch
+ stop_gradient_from_pitch_predictor: True # whether to stop the gradient from pitch predictor to encoder
+ energy_predictor_layers: 2 # number of conv layers in energy predictor
+ energy_predictor_chans: 256 # number of channels of conv layers in energy predictor
+ energy_predictor_kernel_size: 3 # kernel size of conv layers in energy predictor
+ energy_predictor_dropout: 0.5 # dropout rate in energy predictor
+ energy_embed_kernel_size: 1 # kernel size of conv embedding layer for energy
+ energy_embed_dropout: 0.0 # dropout rate after conv embedding layer for energy
+ stop_gradient_from_energy_predictor: False # whether to stop the gradient from energy predictor to encoder
+
+
+
+###########################################################
+# UPDATER SETTING #
+###########################################################
+updater:
+ use_masking: True # whether to apply masking for padded part in loss calculation
+
+
+###########################################################
+# OPTIMIZER SETTING #
+###########################################################
+optimizer:
+ optim: adam # optimizer type
+ learning_rate: 0.001 # learning rate
+
+###########################################################
+# TRAINING SETTING #
+###########################################################
+max_epoch: 1000
+num_snapshots: 5
+
+
+###########################################################
+# OTHER SETTING #
+###########################################################
+seed: 10086
diff --git a/examples/csmsc/tts3/local/synthesize_streaming.sh b/examples/csmsc/tts3/local/synthesize_streaming.sh
new file mode 100755
index 0000000000000000000000000000000000000000..7606c23857fd76d958f8b4757345badf4fb1b9c8
--- /dev/null
+++ b/examples/csmsc/tts3/local/synthesize_streaming.sh
@@ -0,0 +1,92 @@
+#!/bin/bash
+
+config_path=$1
+train_output_path=$2
+ckpt_name=$3
+
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_streaming.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_csmsc \
+ --voc_config=pwg_baker_ckpt_0.4/pwg_default.yaml \
+ --voc_ckpt=pwg_baker_ckpt_0.4/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_baker_ckpt_0.4/pwg_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e_streaming \
+ --phones_dict=dump/phone_id_map.txt \
+ --am_streaming=True
+fi
+
+# for more GAN Vocoders
+# multi band melgan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_streaming.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=mb_melgan_csmsc \
+ --voc_config=mb_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=mb_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1000000.pdz\
+ --voc_stat=mb_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e_streaming \
+ --phones_dict=dump/phone_id_map.txt \
+ --am_streaming=True
+fi
+
+# the pretrained models haven't been released yet
+# style melgan
+# style melgan's Dygraph to Static Graph is not ready yet
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_streaming.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=style_melgan_csmsc \
+ --voc_config=style_melgan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=style_melgan_csmsc_ckpt_0.1.1/snapshot_iter_1500000.pdz \
+ --voc_stat=style_melgan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e_streaming \
+ --phones_dict=dump/phone_id_map.txt \
+ --am_streaming=True
+fi
+
+# hifigan
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ echo "in hifigan syn_e2e"
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_streaming.py \
+ --am=fastspeech2_csmsc \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_csmsc \
+ --voc_config=hifigan_csmsc_ckpt_0.1.1/default.yaml \
+ --voc_ckpt=hifigan_csmsc_ckpt_0.1.1/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_csmsc_ckpt_0.1.1/feats_stats.npy \
+ --lang=zh \
+ --text=${BIN_DIR}/../sentences.txt \
+ --output_dir=${train_output_path}/test_e2e_streaming \
+ --phones_dict=dump/phone_id_map.txt \
+ --am_streaming=True
+fi
diff --git a/examples/csmsc/tts3/run_cnndecoder.sh b/examples/csmsc/tts3/run_cnndecoder.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5cccef01610515d18799e2af5d490f22266c0cb3
--- /dev/null
+++ b/examples/csmsc/tts3/run_cnndecoder.sh
@@ -0,0 +1,48 @@
+#!/bin/bash
+
+set -e
+source path.sh
+
+gpus=0,1
+stage=0
+stop_stage=100
+
+conf_path=conf/cnndecoder.yaml
+train_output_path=exp/cnndecoder
+ckpt_name=snapshot_iter_153.pdz
+
+# with the following command, you can choose the stage range you want to run
+# such as `./run.sh --stage 0 --stop-stage 0`
+# this cannot be mixed with `$1`, `$2` ...
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # prepare data
+ ./local/preprocess.sh ${conf_path} || exit -1
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train model, all `ckpt` under `train_output_path/checkpoints/` dir
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/train.sh ${conf_path} ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+ # synthesize, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+ # synthesize_e2e, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+ # inference with static model
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/inference.sh ${train_output_path} || exit -1
+fi
+
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+ # streaming synthesize_e2e, vocoder is pwgan
+ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_streaming.sh ${conf_path} ${train_output_path} ${ckpt_name} || exit -1
+fi
+
diff --git a/examples/ljspeech/tts3/local/synthesize.sh b/examples/ljspeech/tts3/local/synthesize.sh
index f150d158f6832cecfd0eff5028d5dd716f3d52f8..6dc34274c5862f928584b0e8fc513f488f5760d5 100755
--- a/examples/ljspeech/tts3/local/synthesize.sh
+++ b/examples/ljspeech/tts3/local/synthesize.sh
@@ -4,17 +4,42 @@ config_path=$1
train_output_path=$2
ckpt_name=$3
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize.py \
- --am=fastspeech2_ljspeech \
- --am_config=${config_path} \
- --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
- --am_stat=dump/train/speech_stats.npy \
- --voc=pwgan_ljspeech \
- --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
- --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
- --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
- --test_metadata=dump/test/norm/metadata.jsonl \
- --output_dir=${train_output_path}/test \
- --phones_dict=dump/phone_id_map.txt
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_ljspeech \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_ljspeech \
+ --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+ --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize.py \
+ --am=fastspeech2_ljspeech \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_ljspeech \
+ --voc_config=hifigan_ljspeech_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
+ --test_metadata=dump/test/norm/metadata.jsonl \
+ --output_dir=${train_output_path}/test \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
diff --git a/examples/ljspeech/tts3/local/synthesize_e2e.sh b/examples/ljspeech/tts3/local/synthesize_e2e.sh
index 0b0cb5741938d0455ec6244d70c083a97c89be0d..36865f7f169d12f9767819f1a8912e7349065df1 100755
--- a/examples/ljspeech/tts3/local/synthesize_e2e.sh
+++ b/examples/ljspeech/tts3/local/synthesize_e2e.sh
@@ -4,19 +4,45 @@ config_path=$1
train_output_path=$2
ckpt_name=$3
-FLAGS_allocator_strategy=naive_best_fit \
-FLAGS_fraction_of_gpu_memory_to_use=0.01 \
-python3 ${BIN_DIR}/../synthesize_e2e.py \
- --am=fastspeech2_ljspeech \
- --am_config=${config_path} \
- --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
- --am_stat=dump/train/speech_stats.npy \
- --voc=pwgan_ljspeech \
- --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
- --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
- --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
- --lang=en \
- --text=${BIN_DIR}/../sentences_en.txt \
- --output_dir=${train_output_path}/test_e2e \
- --inference_dir=${train_output_path}/inference \
- --phones_dict=dump/phone_id_map.txt
\ No newline at end of file
+stage=0
+stop_stage=0
+
+# pwgan
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_ljspeech \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=pwgan_ljspeech \
+ --voc_config=pwg_ljspeech_ckpt_0.5/pwg_default.yaml \
+ --voc_ckpt=pwg_ljspeech_ckpt_0.5/pwg_snapshot_iter_400000.pdz \
+ --voc_stat=pwg_ljspeech_ckpt_0.5/pwg_stats.npy \
+ --lang=en \
+ --text=${BIN_DIR}/../sentences_en.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --inference_dir=${train_output_path}/inference \
+ --phones_dict=dump/phone_id_map.txt
+fi
+
+# hifigan
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ FLAGS_allocator_strategy=naive_best_fit \
+ FLAGS_fraction_of_gpu_memory_to_use=0.01 \
+ python3 ${BIN_DIR}/../synthesize_e2e.py \
+ --am=fastspeech2_ljspeech \
+ --am_config=${config_path} \
+ --am_ckpt=${train_output_path}/checkpoints/${ckpt_name} \
+ --am_stat=dump/train/speech_stats.npy \
+ --voc=hifigan_ljspeech \
+ --voc_config=hifigan_ljspeech_ckpt_0.2.0/default.yaml \
+ --voc_ckpt=hifigan_ljspeech_ckpt_0.2.0/snapshot_iter_2500000.pdz \
+ --voc_stat=hifigan_ljspeech_ckpt_0.2.0/feats_stats.npy \
+ --lang=en \
+ --text=${BIN_DIR}/../sentences_en.txt \
+ --output_dir=${train_output_path}/test_e2e \
+ --inference_dir=${train_output_path}/inference \
+ --phones_dict=dump/phone_id_map.txt
+fi
diff --git a/examples/ljspeech/voc5/README.md b/examples/ljspeech/voc5/README.md
index 2108294284552ef2f8dec83959aad6c5779dc52e..9fbb9f74615bd9eef4e54f93542b3a836e9fb000 100644
--- a/examples/ljspeech/voc5/README.md
+++ b/examples/ljspeech/voc5/README.md
@@ -127,6 +127,21 @@ optional arguments:
5. `--ngpu` is the number of gpus to use, if ngpu == 0, use cpu.
## Pretrained Model
+The pretrained model can be downloaded here [hifigan_ljspeech_ckpt_0.2.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip).
+
+
+Model | Step | eval/generator_loss | eval/mel_loss| eval/feature_matching_loss
+:-------------:| :------------:| :-----: | :-----: | :--------:
+default| 1(gpu) x 2500000|24.492|0.115|7.227
+
+The HiFiGAN checkpoint contains the files listed below.
+
+```text
+hifigan_ljspeech_ckpt_0.2.0
+├── default.yaml # default config used to train hifigan
+├── feats_stats.npy # statistics used to normalize spectrogram when training hifigan
+└── snapshot_iter_2500000.pdz # generator parameters of hifigan
+```
## Acknowledgement
diff --git a/examples/voxceleb/README.md b/examples/voxceleb/README.md
index 2c8ad1386023fc621fe0673c856719c6f899f3f4..42f8903e4c4f764e887695d8171c7cf7b72339f8 100644
--- a/examples/voxceleb/README.md
+++ b/examples/voxceleb/README.md
sv0 - speaker verification with softmax backend etc, all python code
sv1 - dependence on kaldi, speaker verification with plda/sc backend,
for more info refer to the sv1/readme.txt
+
+
+## VoxCeleb2 preparation
+
+VoxCeleb2 audio files are released in m4a format. All the VoxCeleb2 m4a audio files must be converted into wav files before feeding them into PaddleSpeech.
+Please follow these steps to prepare the dataset correctly:
+
+1. Download Voxceleb2.
+You can find download instructions here: http://www.robots.ox.ac.uk/~vgg/data/voxceleb/
+
+2. Convert .m4a to wav
+VoxCeleb2 stores files with the m4a audio format. To use them in PaddleSpeech, you have to convert all the m4a audio files into wav files.
+
+``` shell
+ffmpeg -y -i %s -ac 1 -vn -acodec pcm_s16le -ar 16000 %s
+```
+
+You can do the conversion using ffmpeg (see https://gist.github.com/seungwonpark/4f273739beef2691cd53b5c39629d830). This operation might take several hours and only needs to be done once.
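+
+A minimal batch-conversion sketch (illustrative; `voxceleb2_root` below is a placeholder for your VoxCeleb2 download directory):
+```bash
+voxceleb2_root=/path/to/voxceleb2
+find ${voxceleb2_root} -name "*.m4a" | while read -r m4a; do
+    ffmpeg -y -i "${m4a}" -ac 1 -vn -acodec pcm_s16le -ar 16000 "${m4a%.m4a}.wav"
+done
+```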
+
+3. Put all the wav files in a folder called `wav`. You should have something like `voxceleb2/wav/id*/*.wav` (e.g, `voxceleb2/wav/id00012/21Uxsk56VDQ/00001.wav`)
+
+
+## voxceleb dataset summary
+
+
+|dataset | vox1 - dev | vox1 - test |vox2 - dev| vox2 - test|
+|---------|-----------|------------|-----------|----------|
+|spks | 1211 |40 | 5994 | 118|
+|utts | 148642 | 4874 | 1092009 |36273|
+| time(h) | 340.4 | 11.2 | 2360.2 |79.9 |
+
+
+## trial summary
+
+| trial | filename | nums | positive | negative |
+|--------|-----------|--------|-------|------|
+| VoxCeleb1 | veri_test.txt | 37720 | 18860 | 18860 |
+| VoxCeleb1(cleaned) | veri_test2.txt | 37611 | 18802 | 18809 |
+| VoxCeleb1-H | list_test_hard.txt | 552536 | 276270 | 276266 |
+|VoxCeleb1-H(cleaned) |list_test_hard2.txt | 550894 | 275488 | 275406 |
+|VoxCeleb1-E | list_test_all.txt | 581480 | 290743 | 290737 |
+|VoxCeleb1-E(cleaned) | list_test_all2.txt |579818 |289921 |289897 |
diff --git a/examples/voxceleb/sv0/RESULT.md b/examples/voxceleb/sv0/RESULT.md
new file mode 100644
index 0000000000000000000000000000000000000000..c37bcecef9b4276adcd7eb05b14893c48c3bdf96
--- /dev/null
+++ b/examples/voxceleb/sv0/RESULT.md
@@ -0,0 +1,7 @@
+# VoxCeleb
+
+## ECAPA-TDNN
+
+| Model | Number of Params | Release | Config | dim | Test set | Cosine | Cosine + S-Norm |
+| --- | --- | --- | --- | --- | --- | --- | ---- |
+| ECAPA-TDNN | 85M | 0.1.1 | conf/ecapa_tdnn.yaml |192 | test | 1.15 | 1.06 |
diff --git a/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..e58dca82dcceb600b807bdc68c55894f7e88e834
--- /dev/null
+++ b/examples/voxceleb/sv0/conf/ecapa_tdnn.yaml
@@ -0,0 +1,52 @@
+###########################################
+# Data #
+###########################################
+# we should explicitly specify the wav path of vox2 audio data converted from m4a
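+# e.g. vox2_base_path: /path/to/voxceleb2/wav   (illustrative; set it to your converted wav directory)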
+vox2_base_path:
+augment: True
+batch_size: 16
+num_workers: 2
+num_speakers: 7205 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
+shuffle: True
+random_chunk: True
+
+###########################################################
+# FEATURE EXTRACTION SETTING #
+###########################################################
+# currently, we only support fbank
+sr: 16000 # sample rate
+n_mels: 80
+window_size: 400 #25ms, sample rate 16000, 25 * 16000 / 1000 = 400
+hop_size: 160 #10ms, sample rate 16000, 10 * 16000 / 1000 = 160
+
+###########################################################
+# MODEL SETTING #
+###########################################################
+# currently, we only support ecapa-tdnn in ecapa_tdnn.yaml
+# if you want to use another model, please choose another configuration yaml file
+model:
+ input_size: 80
+ # "channels": [512, 512, 512, 512, 1536],
+ channels: [1024, 1024, 1024, 1024, 3072]
+ kernel_sizes: [5, 3, 3, 3, 1]
+ dilations: [1, 2, 3, 4, 1]
+ attention_channels: 128
+ lin_neurons: 192
+
+###########################################
+# Training #
+###########################################
+seed: 1986 # following the SpeechBrain configuration
+epochs: 10
+save_interval: 1
+log_interval: 1
+learning_rate: 1e-8
+
+
+###########################################
+# Testing #
+###########################################
+global_embedding_norm: True
+embedding_mean_norm: True
+embedding_std_norm: False
+
diff --git a/examples/voxceleb/sv0/local/data.sh b/examples/voxceleb/sv0/local/data.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a3ff1c486bd6030eda1a455c6dbf74431e7003d6
--- /dev/null
+++ b/examples/voxceleb/sv0/local/data.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+stage=1
+stop_stage=100
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+if [ $# -ne 2 ] ; then
+ echo "Usage: $0 [options] ";
+ echo "e.g.: $0 ./data/ conf/ecapa_tdnn.yaml"
+ echo "Options: "
+ echo " --stage # Used to run a partially-completed data process from somewhere in the middle."
+ echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle"
+ exit 1;
+fi
+
+dir=$1
+conf_path=$2
+mkdir -p ${dir}
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+    # we should use local/convert.sh to convert m4a to wav
+ python3 local/data_prepare.py \
+ --data-dir ${dir} \
+ --config ${conf_path}
+fi
+
+TARGET_DIR=${MAIN_ROOT}/dataset
+mkdir -p ${TARGET_DIR}
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # download data, generate manifests
+ python3 ${TARGET_DIR}/voxceleb/voxceleb1.py \
+ --manifest_prefix="data/vox1/manifest" \
+ --target_dir="${TARGET_DIR}/voxceleb/vox1/"
+
+ if [ $? -ne 0 ]; then
+ echo "Prepare voxceleb failed. Terminated."
+ exit 1
+ fi
+
+ # for dataset in train dev test; do
+ # mv data/manifest.${dataset} data/manifest.${dataset}.raw
+ # done
+fi
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/data_prepare.py b/examples/voxceleb/sv0/local/data_prepare.py
new file mode 100644
index 0000000000000000000000000000000000000000..03d054004f0f5f98103a78b5becdc8f0a8bda357
--- /dev/null
+++ b/examples/voxceleb/sv0/local/data_prepare.py
@@ -0,0 +1,70 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+
+import paddle
+from yacs.config import CfgNode
+
+from paddleaudio.datasets.voxceleb import VoxCeleb
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.augment import build_augment_pipeline
+from paddlespeech.vector.training.seeding import seed_everything
+
+logger = Log(__name__).getlog()
+
+
+def main(args, config):
+
+    # stage 0: set the cpu device; all data preparation will be done in cpu mode
+ paddle.set_device("cpu")
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ # stage 1: generate the voxceleb csv file
+    # Note: this step may raise a C++ exception, but the program will execute fine,
+    # so we ignore the exception
+ # we explicitly pass the vox2 base path to data prepare and generate the audio info
+ logger.info("start to generate the voxceleb dataset info")
+ train_dataset = VoxCeleb(
+ 'train', target_dir=args.data_dir, vox2_base_path=config.vox2_base_path)
+
+ # stage 2: generate the augment noise csv file
+ if config.augment:
+ logger.info("start to generate the augment dataset info")
+ augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
+
+
+if __name__ == "__main__":
+ # yapf: disable
+ parser = argparse.ArgumentParser(__doc__)
+ parser.add_argument("--data-dir",
+ default="./data/",
+ type=str,
+ help="data directory")
+ parser.add_argument("--config",
+ default=None,
+ type=str,
+ help="configuration file")
+ args = parser.parse_args()
+ # yapf: enable
+
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+
+ config.freeze()
+ print(config)
+
+ main(args, config)
diff --git a/examples/voxceleb/sv0/local/emb.sh b/examples/voxceleb/sv0/local/emb.sh
new file mode 100755
index 0000000000000000000000000000000000000000..31d79e52d036c7f7f0073403033253c60b5d6e62
--- /dev/null
+++ b/examples/voxceleb/sv0/local/emb.sh
@@ -0,0 +1,51 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. ./path.sh
+
+stage=0
+stop_stage=100
+exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory
+conf_path=conf/ecapa_tdnn.yaml
+audio_path="demo/voxceleb/00001.wav"
+use_gpu=true
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+if [ $# -ne 0 ] ; then
+ echo "Usage: $0 [options]";
+ echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
+ echo "Options: "
+ echo " --use-gpu # specify is gpu is to be used for training"
+ echo " --stage # Used to run a partially-completed data process from somewhere in the middle."
+ echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle"
+ echo " --exp-dir # experiment directorh, where is has the model.pdparams"
+ echo " --conf-path # configuration file for extracting the embedding"
+ echo " --audio-path # audio-path, which will be processed to extract the embedding"
+ exit 1;
+fi
+
+# set the test device
+device="cpu"
+if ${use_gpu}; then
+ device="gpu"
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # extract the audio embedding
+ python3 ${BIN_DIR}/extract_emb.py --device ${device} \
+ --config ${conf_path} \
+ --audio-path ${audio_path} --load-checkpoint ${exp_dir}
+fi
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/local/test.sh b/examples/voxceleb/sv0/local/test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..4460a165ace5ad12a10977f97432937623b185fd
--- /dev/null
+++ b/examples/voxceleb/sv0/local/test.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stage=1
+stop_stage=100
+use_gpu=true # if true, we run on GPU.
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+if [ $# -ne 3 ] ; then
+ echo "Usage: $0 [options] ";
+ echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
+ echo "Options: "
+ echo " --use-gpu # specify is gpu is to be used for training"
+ echo " --stage # Used to run a partially-completed data process from somewhere in the middle."
+ echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle"
+ exit 1;
+fi
+
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # test the model and compute the eer metrics
+ python3 ${BIN_DIR}/test.py \
+ --data-dir ${dir} \
+ --load-checkpoint ${exp_dir} \
+ --config ${conf_path}
+fi
diff --git a/examples/voxceleb/sv0/local/train.sh b/examples/voxceleb/sv0/local/train.sh
new file mode 100755
index 0000000000000000000000000000000000000000..5477d0a34dff3ec546146d772793136b586705f5
--- /dev/null
+++ b/examples/voxceleb/sv0/local/train.sh
@@ -0,0 +1,61 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+stage=0
+stop_stage=100
+use_gpu=true # if true, we run on GPU.
+
+. ${MAIN_ROOT}/utils/parse_options.sh || exit -1;
+
+if [ $# -ne 3 ] ; then
+ echo "Usage: $0 [options] ";
+ echo "e.g.: $0 ./data/ exp/voxceleb12/ conf/ecapa_tdnn.yaml"
+ echo "Options: "
+ echo " --use-gpu # specify is gpu is to be used for training"
+ echo " --stage # Used to run a partially-completed data process from somewhere in the middle."
+ echo " --stop-stage # Used to run a partially-completed data process stop stage in the middle"
+ exit 1;
+fi
+
+dir=$1
+exp_dir=$2
+conf_path=$3
+
+# get the gpu nums for training
+ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
+echo "using $ngpu gpus..."
+
+# setting training device
+device="cpu"
+if ${use_gpu}; then
+ device="gpu"
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # train the speaker identification task with voxceleb data
+    # and the trained model parameters will be stored in ${exp_dir}/model.pdparams as a soft link
+ # Note: we will store the log file in exp/log directory
+ python3 -m paddle.distributed.launch --gpus=$CUDA_VISIBLE_DEVICES \
+ ${BIN_DIR}/train.py --device ${device} --checkpoint-dir ${exp_dir} \
+ --data-dir ${dir} --config ${conf_path}
+
+fi
+
+if [ $? -ne 0 ]; then
+ echo "Failed in training!"
+ exit 1
+fi
+
+exit 0
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/path.sh b/examples/voxceleb/sv0/path.sh
new file mode 100755
index 0000000000000000000000000000000000000000..2be098e04ec2dc8e2b88111d1cf713f7b7978677
--- /dev/null
+++ b/examples/voxceleb/sv0/path.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+export MAIN_ROOT=`realpath ${PWD}/../../../`
+
+export PATH=${MAIN_ROOT}:${MAIN_ROOT}/utils:${PATH}
+export LC_ALL=C
+
+export PYTHONDONTWRITEBYTECODE=1
+# Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PYTHONPATH=${MAIN_ROOT}:${PYTHONPATH}
+
+export LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib/
+
+MODEL=ecapa_tdnn
+export BIN_DIR=${MAIN_ROOT}/paddlespeech/vector/exps/${MODEL}
\ No newline at end of file
diff --git a/examples/voxceleb/sv0/run.sh b/examples/voxceleb/sv0/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..bbc9e3dbb66b4dc18a1fc54d0bd0808f01169ea8
--- /dev/null
+++ b/examples/voxceleb/sv0/run.sh
@@ -0,0 +1,69 @@
+#!/bin/bash
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+. ./path.sh
+set -e
+
+#######################################################################
+# stage 0: data prepare, including voxceleb1 download and generating {train,dev,enroll,test}.csv
+#          voxceleb2 data is in m4a format, so you need to convert the m4a files to wav yourself, as described in README.md, using the script local/convert.sh
+# stage 1: train the speaker identification model
+# stage 2: test speaker identification
+# stage 3: extract the training embedding to train the LDA and PLDA
+######################################################################
+
+# we can set the variable PPAUDIO_HOME to specify the root directory of the downloaded vox1 and vox2 datasets
+# default the dataset will be stored in the ~/.paddleaudio/
+# the vox2 dataset is stored in m4a format, we need to convert the audio from m4a to wav yourself
+# and put all of them to ${PPAUDIO_HOME}/datasets/vox2
+# we will find the wav from ${PPAUDIO_HOME}/datasets/vox1/wav and ${PPAUDIO_HOME}/datasets/vox2/wav
+# export PPAUDIO_HOME=
+stage=0
+stop_stage=50
+
+# data directory
+# if we set the variable ${dir}, the wav info will be stored in this directory
+# otherwise, the wav info will be stored in the vox1 and vox2 directories respectively
+# note: the vox2 wav path must contain audio that has already been converted from m4a to wav
+dir=data/ # data info directory
+
+exp_dir=exp/ecapa-tdnn-vox12-big/ # experiment directory
+conf_path=conf/ecapa_tdnn.yaml
+gpus=0,1,2,3
+
+source ${MAIN_ROOT}/utils/parse_options.sh || exit 1;
+
+mkdir -p ${exp_dir}
+
+if [ $stage -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+ # stage 0: data prepare for vox1 and vox2, vox2 must be converted from m4a to wav
+    bash ./local/data.sh ${dir} ${conf_path} || exit -1;
+fi
+
+if [ $stage -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+ # stage 1: train the speaker identification model
+ CUDA_VISIBLE_DEVICES=${gpus} bash ./local/train.sh ${dir} ${exp_dir} ${conf_path}
+fi
+
+if [ $stage -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    # stage 2: get the speaker verification scores with the cosine function
+    # now we only support using cosine to compute the scores
+ CUDA_VISIBLE_DEVICES=0 bash ./local/test.sh ${dir} ${exp_dir} ${conf_path}
+fi
+
+# if [ $stage -le 3 ]; then
+# # stage 3: extract the training embedding to train the LDA and PLDA
+# # todo: extract the training embedding
+# fi
diff --git a/examples/voxceleb/sv0/utils b/examples/voxceleb/sv0/utils
new file mode 120000
index 0000000000000000000000000000000000000000..256f914abcaa47d966c44878b88a300437f110fb
--- /dev/null
+++ b/examples/voxceleb/sv0/utils
@@ -0,0 +1 @@
+../../../utils/
\ No newline at end of file
diff --git a/paddleaudio/.gitignore b/paddleaudio/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..1c930053d56ac426518959387705c79f8a394a7c
--- /dev/null
+++ b/paddleaudio/.gitignore
@@ -0,0 +1,2 @@
+.eggs
+*.wav
diff --git a/paddleaudio/README.md b/paddleaudio/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..697c017394044e862883fe7f9589d00c77127868
--- /dev/null
+++ b/paddleaudio/README.md
@@ -0,0 +1,7 @@
+# PaddleAudio
+
+PaddleAudio is an audio library for PaddlePaddle.
+
+## Install
+
+`pip install .`
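+
+## Quick start
+
+A minimal usage sketch, assuming the package has been installed and a wav file named `input.wav` exists in the current directory:
+
+```python
+from paddleaudio.backends import load
+
+# load returns the decoded waveform together with its sample rate
+waveform, sr = load("input.wav")
+print(waveform.shape, sr)
+```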
diff --git a/paddleaudio/docs/Makefile b/paddleaudio/docs/Makefile
new file mode 100644
index 0000000000000000000000000000000000000000..69fe55ecfa9aade66e1412aef0ee7d04a9bcde86
--- /dev/null
+++ b/paddleaudio/docs/Makefile
@@ -0,0 +1,19 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+SOURCEDIR = source
+BUILDDIR = build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+ @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+ @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/paddleaudio/docs/README.md b/paddleaudio/docs/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..20626f52bfced5b52b8fa014a0a540ed69ece8a7
--- /dev/null
+++ b/paddleaudio/docs/README.md
@@ -0,0 +1,24 @@
+# Build docs for PaddleAudio
+
+Execute the following steps in the **current directory**.
+
+## 1. Install
+
+`pip install Sphinx sphinx_rtd_theme`
+
+
+## 2. Generate API docs
+
+Generate API docs from doc string.
+
+`sphinx-apidoc -fMeT -o source ../paddleaudio ../paddleaudio/utils --templatedir source/_templates`
+
+
+## 3. Build
+
+`sphinx-build source _html`
+
+
+## 4. Preview
+
+Open `_html/index.html` for page preview.
diff --git a/paddleaudio/docs/images/paddle.png b/paddleaudio/docs/images/paddle.png
new file mode 100644
index 0000000000000000000000000000000000000000..bc1135abfab7aa48f29392da4bca614f688314af
Binary files /dev/null and b/paddleaudio/docs/images/paddle.png differ
diff --git a/paddleaudio/docs/make.bat b/paddleaudio/docs/make.bat
new file mode 100644
index 0000000000000000000000000000000000000000..543c6b13b473ff3c586d5d97ae418d267ee795c4
--- /dev/null
+++ b/paddleaudio/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+ set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=source
+set BUILDDIR=build
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+ echo.
+ echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+ echo.installed, then set the SPHINXBUILD environment variable to point
+ echo.to the full path of the 'sphinx-build' executable. Alternatively you
+ echo.may add the Sphinx directory to PATH.
+ echo.
+ echo.If you don't have Sphinx installed, grab it from
+ echo.http://sphinx-doc.org/
+ exit /b 1
+)
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
+
+:end
+popd
diff --git a/paddleaudio/docs/source/_static/custom.css b/paddleaudio/docs/source/_static/custom.css
new file mode 100644
index 0000000000000000000000000000000000000000..bb65c51a928cff8318a676af91dd4e320b723990
--- /dev/null
+++ b/paddleaudio/docs/source/_static/custom.css
@@ -0,0 +1,5 @@
+.wy-nav-content {
+ max-width: 80%;
+}
+.table table{ background:#b9b9b9}
+.table table td{ background:#FFF; }
diff --git a/paddleaudio/docs/source/_templates/module.rst_t b/paddleaudio/docs/source/_templates/module.rst_t
new file mode 100644
index 0000000000000000000000000000000000000000..d9a50e6b9752a1b04ef1317c33075e8c19fc97cd
--- /dev/null
+++ b/paddleaudio/docs/source/_templates/module.rst_t
@@ -0,0 +1,9 @@
+{%- if show_headings %}
+{{- basename | e | heading }}
+
+{% endif -%}
+.. automodule:: {{ qualname }}
+{%- for option in automodule_options %}
+ :{{ option }}:
+{%- endfor %}
+
diff --git a/paddleaudio/docs/source/_templates/package.rst_t b/paddleaudio/docs/source/_templates/package.rst_t
new file mode 100644
index 0000000000000000000000000000000000000000..7239c11b73a3b7cd8436436046ef1cd3a272d0e6
--- /dev/null
+++ b/paddleaudio/docs/source/_templates/package.rst_t
@@ -0,0 +1,57 @@
+{%- macro automodule(modname, options) -%}
+.. automodule:: {{ modname }}
+{%- for option in options %}
+ :{{ option }}:
+{%- endfor %}
+{%- endmacro %}
+
+{%- macro toctree(docnames) -%}
+.. toctree::
+ :maxdepth: {{ maxdepth }}
+{% for docname in docnames %}
+ {{ docname }}
+{%- endfor %}
+{%- endmacro %}
+
+{%- if is_namespace %}
+{{- [pkgname, "namespace"] | join(" ") | e | heading }}
+{% else %}
+{{- pkgname | e | heading }}
+{% endif %}
+
+{%- if is_namespace %}
+.. py:module:: {{ pkgname }}
+{% endif %}
+
+{%- if modulefirst and not is_namespace %}
+{{ automodule(pkgname, automodule_options) }}
+{% endif %}
+
+{%- if subpackages %}
+Subpackages
+-----------
+
+{{ toctree(subpackages) }}
+{% endif %}
+
+{%- if submodules %}
+Submodules
+----------
+{% if separatemodules %}
+{{ toctree(submodules) }}
+{% else %}
+{%- for submodule in submodules %}
+{% if show_headings %}
+{{- submodule | e | heading(2) }}
+{% endif %}
+{{ automodule(submodule, automodule_options) }}
+{% endfor %}
+{%- endif %}
+{%- endif %}
+
+{%- if not modulefirst and not is_namespace %}
+Module contents
+---------------
+
+{{ automodule(pkgname, automodule_options) }}
+{% endif %}
diff --git a/paddleaudio/docs/source/_templates/toc.rst_t b/paddleaudio/docs/source/_templates/toc.rst_t
new file mode 100644
index 0000000000000000000000000000000000000000..f0877eeb2f85324a48eb63d793a536a8cfdb4a00
--- /dev/null
+++ b/paddleaudio/docs/source/_templates/toc.rst_t
@@ -0,0 +1,8 @@
+{{ header | heading }}
+
+.. toctree::
+ :maxdepth: {{ maxdepth }}
+{% for docname in docnames %}
+ {{ docname }}
+{%- endfor %}
+
diff --git a/paddleaudio/docs/source/conf.py b/paddleaudio/docs/source/conf.py
new file mode 100644
index 0000000000000000000000000000000000000000..09c4f312fc4afff86b358050a35ec635c7b873b9
--- /dev/null
+++ b/paddleaudio/docs/source/conf.py
@@ -0,0 +1,181 @@
+# -*- coding: utf-8 -*-
+#
+# Configuration file for the Sphinx documentation builder.
+#
+# This file does only contain a selection of the most common options. For a
+# full list see the documentation:
+# http://www.sphinx-doc.org/en/master/config
+# -- Path setup --------------------------------------------------------------
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+import os
+import sys
+sys.path.insert(0, os.path.abspath('../..'))
+
+# -- Project information -----------------------------------------------------
+
+project = 'PaddleAudio'
+copyright = '2022, PaddlePaddle'
+author = 'PaddlePaddle'
+
+# The short X.Y version
+version = ''
+# The full version, including alpha/beta/rc tags
+release = '0.2.0'
+
+# -- General configuration ---------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#
+# needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+ 'sphinx.ext.autodoc',
+ 'sphinx.ext.intersphinx',
+ 'sphinx.ext.mathjax',
+ 'sphinx.ext.viewcode',
+ 'sphinx.ext.napoleon',
+]
+
+napoleon_google_docstring = True
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+#
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The master toctree document.
+master_doc = 'index'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+# This pattern also affects html_static_path and html_extra_path.
+exclude_patterns = []
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = None
+
+# -- Options for HTML output -------------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+#
+
+import sphinx_rtd_theme
+html_theme = 'sphinx_rtd_theme'
+html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+smartquotes = False
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#
+# html_theme_options = {}
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+html_logo = '../images/paddle.png'
+html_css_files = [
+ 'custom.css',
+]
+
+# Custom sidebar templates, must be a dictionary that maps document names
+# to template names.
+#
+# The default sidebars (for documents that don't match any pattern) are
+# defined by theme itself. Builtin themes are using these templates by
+# default: ``['localtoc.html', 'relations.html', 'sourcelink.html',
+# 'searchbox.html']``.
+#
+# html_sidebars = {}
+
+# -- Options for HTMLHelp output ---------------------------------------------
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PaddleAudiodoc'
+
+# -- Options for LaTeX output ------------------------------------------------
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #
+ # 'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #
+ # 'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #
+ # 'preamble': '',
+
+ # Latex figure (float) alignment
+ #
+ # 'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, 'PaddleAudio.tex', 'PaddleAudio Documentation', 'PaddlePaddle',
+ 'manual'),
+]
+
+# -- Options for manual page output ------------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [(master_doc, 'paddleaudio', 'PaddleAudio Documentation', [author],
+ 1)]
+
+# -- Options for Texinfo output ----------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'PaddleAudio', 'PaddleAudio Documentation', author,
+     'PaddleAudio', 'An audio library for PaddlePaddle.', 'Miscellaneous'),
+]
+
+# -- Options for Epub output -------------------------------------------------
+
+# Bibliographic Dublin Core info.
+epub_title = project
+
+# The unique identifier of the text. This can be a ISBN number
+# or the project homepage.
+#
+# epub_identifier = ''
+
+# A unique identification for the text.
+#
+# epub_uid = ''
+
+# A list of files that should not be packed into the epub file.
+epub_exclude_files = ['search.html']
+
+# -- Extension configuration -------------------------------------------------
+
+# -- Options for intersphinx extension ---------------------------------------
+
+# Example configuration for intersphinx: refer to the Python standard library.
+intersphinx_mapping = {'https://docs.python.org/': None}
diff --git a/paddleaudio/docs/source/index.rst b/paddleaudio/docs/source/index.rst
new file mode 100644
index 0000000000000000000000000000000000000000..26963308eeab3cfce089614300ac9e5baf71607d
--- /dev/null
+++ b/paddleaudio/docs/source/index.rst
@@ -0,0 +1,22 @@
+.. PaddleAudio documentation master file, created by
+ sphinx-quickstart on Tue Mar 22 15:57:16 2022.
+ You can adapt this file completely to your liking, but it should at least
+ contain the root `toctree` directive.
+
+Welcome to PaddleAudio's documentation!
+=======================================
+
+.. toctree::
+ :maxdepth: 1
+
+ Index
+
+
+API References
+--------------
+
+.. toctree::
+ :maxdepth: 2
+ :titlesonly:
+
+ paddleaudio
\ No newline at end of file
diff --git a/paddleaudio/paddleaudio/compliance/__init__.py b/paddleaudio/paddleaudio/compliance/__init__.py
index 97043fd7ba6885aac81cad5a49924c23c67d4d47..c08f9ab11ea7b6e71eb62f095b9404e4d4331e91 100644
--- a/paddleaudio/paddleaudio/compliance/__init__.py
+++ b/paddleaudio/paddleaudio/compliance/__init__.py
@@ -11,3 +11,5 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from . import kaldi
+from . import librosa
diff --git a/paddleaudio/paddleaudio/datasets/__init__.py b/paddleaudio/paddleaudio/datasets/__init__.py
index 5c5f036949c3553edd4569e7769334d0741d3de4..ebd4af984f697a8fe73c7a87f4d8362a95915c42 100644
--- a/paddleaudio/paddleaudio/datasets/__init__.py
+++ b/paddleaudio/paddleaudio/datasets/__init__.py
@@ -13,5 +13,7 @@
# limitations under the License.
from .esc50 import ESC50
from .gtzan import GTZAN
+from .rirs_noises import OpenRIRNoise
from .tess import TESS
from .urban_sound import UrbanSound8K
+from .voxceleb import VoxCeleb
diff --git a/paddleaudio/paddleaudio/datasets/rirs_noises.py b/paddleaudio/paddleaudio/datasets/rirs_noises.py
new file mode 100644
index 0000000000000000000000000000000000000000..68639a60487143b88251b9722a92e9a4180dcd03
--- /dev/null
+++ b/paddleaudio/paddleaudio/datasets/rirs_noises.py
@@ -0,0 +1,201 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import csv
+import os
+import random
+from typing import List
+
+from paddle.io import Dataset
+from tqdm import tqdm
+
+from ..backends import load as load_audio
+from ..backends import save as save_wav
+from ..utils import DATA_HOME
+from ..utils.download import download_and_decompress
+from .dataset import feat_funcs
+
+__all__ = ['OpenRIRNoise']
+
+
+class OpenRIRNoise(Dataset):
+ archieves = [
+ {
+ 'url': 'http://www.openslr.org/resources/28/rirs_noises.zip',
+ 'md5': 'e6f48e257286e05de56413b4779d8ffb',
+ },
+ ]
+
+ sample_rate = 16000
+ meta_info = collections.namedtuple('META_INFO', ('id', 'duration', 'wav'))
+ base_path = os.path.join(DATA_HOME, 'open_rir_noise')
+ wav_path = os.path.join(base_path, 'RIRS_NOISES')
+ csv_path = os.path.join(base_path, 'csv')
+ subsets = ['rir', 'noise']
+
+ def __init__(self,
+ subset: str='rir',
+ feat_type: str='raw',
+ target_dir=None,
+ random_chunk: bool=True,
+ chunk_duration: float=3.0,
+ seed: int=0,
+ **kwargs):
+
+ assert subset in self.subsets, \
+ 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
+
+ self.subset = subset
+ self.feat_type = feat_type
+ self.feat_config = kwargs
+ self.random_chunk = random_chunk
+ self.chunk_duration = chunk_duration
+
+ OpenRIRNoise.csv_path = os.path.join(
+ target_dir, "open_rir_noise",
+ "csv") if target_dir else self.csv_path
+ self._data = self._get_data()
+ super(OpenRIRNoise, self).__init__()
+
+ # Set up a seed to reproduce training or predicting result.
+ # random.seed(seed)
+
+ def _get_data(self):
+ # Download audio files.
+ print(f"rirs noises base path: {self.base_path}")
+ if not os.path.isdir(self.base_path):
+ download_and_decompress(
+ self.archieves, self.base_path, decompress=True)
+ else:
+ print(
+ f"{self.base_path} already exists, we will not download and decompress again"
+ )
+
+ # Data preparation.
+ print(f"prepare the csv to {self.csv_path}")
+ if not os.path.isdir(self.csv_path):
+ os.makedirs(self.csv_path)
+ self.prepare_data()
+
+ data = []
+ with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
+ for line in rf.readlines()[1:]:
+ audio_id, duration, wav = line.strip().split(',')
+ data.append(self.meta_info(audio_id, float(duration), wav))
+
+ random.shuffle(data)
+ return data
+
+ def _convert_to_record(self, idx: int):
+ sample = self._data[idx]
+
+ record = {}
+ # To show all fields in a namedtuple: `type(sample)._fields`
+ for field in type(sample)._fields:
+ record[field] = getattr(sample, field)
+
+ waveform, sr = load_audio(record['wav'])
+
+ assert self.feat_type in feat_funcs.keys(), \
+ f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
+ feat_func = feat_funcs[self.feat_type]
+ feat = feat_func(
+ waveform, sr=sr, **self.feat_config) if feat_func else waveform
+
+ record.update({'feat': feat})
+ return record
+
+ @staticmethod
+ def _get_chunks(seg_dur, audio_id, audio_duration):
+        num_chunks = int(audio_duration / seg_dur)  # all in seconds
+
+ chunk_lst = [
+ audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
+ for i in range(num_chunks)
+ ]
+ return chunk_lst
+
+ def _get_audio_info(self, wav_file: str,
+ split_chunks: bool) -> List[List[str]]:
+ waveform, sr = load_audio(wav_file)
+ audio_id = wav_file.split("/open_rir_noise/")[-1].split(".")[0]
+ audio_duration = waveform.shape[0] / sr
+
+ ret = []
+ if split_chunks and audio_duration > self.chunk_duration: # Split into pieces of self.chunk_duration seconds.
+ uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
+ audio_duration)
+
+ for idx, chunk in enumerate(uniq_chunks_list):
+ s, e = chunk.split("_")[-2:] # Timestamps of start and end
+ start_sample = int(float(s) * sr)
+ end_sample = int(float(e) * sr)
+ new_wav_file = os.path.join(self.base_path,
+ audio_id + f'_chunk_{idx+1:02}.wav')
+ save_wav(waveform[start_sample:end_sample], sr, new_wav_file)
+ # id, duration, new_wav
+ ret.append([chunk, self.chunk_duration, new_wav_file])
+ else: # Keep whole audio.
+ ret.append([audio_id, audio_duration, wav_file])
+ return ret
+
+ def generate_csv(self,
+ wav_files: List[str],
+ output_file: str,
+ split_chunks: bool=True):
+ print(f'Generating csv: {output_file}')
+ header = ["id", "duration", "wav"]
+
+ infos = list(
+ tqdm(
+ map(self._get_audio_info, wav_files, [split_chunks] * len(
+ wav_files)),
+ total=len(wav_files)))
+
+ csv_lines = []
+ for info in infos:
+ csv_lines.extend(info)
+
+ with open(output_file, mode="w") as csv_f:
+ csv_writer = csv.writer(
+ csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+ csv_writer.writerow(header)
+ for line in csv_lines:
+ csv_writer.writerow(line)
+
+ def prepare_data(self):
+ rir_list = os.path.join(self.wav_path, "real_rirs_isotropic_noises",
+ "rir_list")
+ rir_files = []
+ with open(rir_list, 'r') as f:
+ for line in f.readlines():
+ rir_file = line.strip().split(' ')[-1]
+ rir_files.append(os.path.join(self.base_path, rir_file))
+
+ noise_list = os.path.join(self.wav_path, "pointsource_noises",
+ "noise_list")
+ noise_files = []
+ with open(noise_list, 'r') as f:
+ for line in f.readlines():
+ noise_file = line.strip().split(' ')[-1]
+ noise_files.append(os.path.join(self.base_path, noise_file))
+
+ self.generate_csv(rir_files, os.path.join(self.csv_path, 'rir.csv'))
+ self.generate_csv(noise_files, os.path.join(self.csv_path, 'noise.csv'))
+
+ def __getitem__(self, idx):
+ return self._convert_to_record(idx)
+
+ def __len__(self):
+ return len(self._data)
diff --git a/paddleaudio/paddleaudio/datasets/voxceleb.py b/paddleaudio/paddleaudio/datasets/voxceleb.py
new file mode 100644
index 0000000000000000000000000000000000000000..3f72b5f2eb0d0ab5fa6b6e67b9cc48a207583117
--- /dev/null
+++ b/paddleaudio/paddleaudio/datasets/voxceleb.py
@@ -0,0 +1,356 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+import csv
+import glob
+import os
+import random
+from multiprocessing import cpu_count
+from typing import List
+
+from paddle.io import Dataset
+from pathos.multiprocessing import Pool
+from tqdm import tqdm
+
+from ..backends import load as load_audio
+from ..utils import DATA_HOME
+from ..utils import decompress
+from ..utils.download import download_and_decompress
+from .dataset import feat_funcs
+
+__all__ = ['VoxCeleb']
+
+
+class VoxCeleb(Dataset):
+ source_url = 'https://thor.robots.ox.ac.uk/~vgg/data/voxceleb/vox1a/'
+ archieves_audio_dev = [
+ {
+ 'url': source_url + 'vox1_dev_wav_partaa',
+ 'md5': 'e395d020928bc15670b570a21695ed96',
+ },
+ {
+ 'url': source_url + 'vox1_dev_wav_partab',
+ 'md5': 'bbfaaccefab65d82b21903e81a8a8020',
+ },
+ {
+ 'url': source_url + 'vox1_dev_wav_partac',
+ 'md5': '017d579a2a96a077f40042ec33e51512',
+ },
+ {
+ 'url': source_url + 'vox1_dev_wav_partad',
+ 'md5': '7bb1e9f70fddc7a678fa998ea8b3ba19',
+ },
+ ]
+ archieves_audio_test = [
+ {
+ 'url': source_url + 'vox1_test_wav.zip',
+ 'md5': '185fdc63c3c739954633d50379a3d102',
+ },
+ ]
+ archieves_meta = [
+ {
+ 'url':
+ 'https://www.robots.ox.ac.uk/~vgg/data/voxceleb/meta/veri_test2.txt',
+ 'md5':
+ 'b73110731c9223c1461fe49cb48dddfc',
+ },
+ ]
+
+ num_speakers = 1211 # 1211 vox1, 5994 vox2, 7205 vox1+2, test speakers: 41
+ sample_rate = 16000
+ meta_info = collections.namedtuple(
+ 'META_INFO', ('id', 'duration', 'wav', 'start', 'stop', 'spk_id'))
+ base_path = os.path.join(DATA_HOME, 'vox1')
+ wav_path = os.path.join(base_path, 'wav')
+ meta_path = os.path.join(base_path, 'meta')
+ veri_test_file = os.path.join(meta_path, 'veri_test2.txt')
+ csv_path = os.path.join(base_path, 'csv')
+ subsets = ['train', 'dev', 'enroll', 'test']
+
+ def __init__(
+ self,
+ subset: str='train',
+ feat_type: str='raw',
+ random_chunk: bool=True,
+ chunk_duration: float=3.0, # seconds
+ split_ratio: float=0.9, # train split ratio
+ seed: int=0,
+ target_dir: str=None,
+ vox2_base_path=None,
+ **kwargs):
+ """VoxCeleb data prepare and get the specific dataset audio info
+
+ Args:
+ subset (str, optional): dataset name, such as train, dev, enroll or test. Defaults to 'train'.
+            feat_type (str, optional): feat type, such as raw, melspectrogram (fbank) or mfcc. Defaults to 'raw'.
+            random_chunk (bool, optional): randomly select a chunk from the audio. Defaults to True.
+            chunk_duration (float, optional): chunk duration if random_chunk flag is set. Defaults to 3.0.
+            target_dir (str, optional): data dir, audio info will be stored in this directory. Defaults to None.
+            vox2_base_path (str, optional): vox2 directory. vox2 data must be converted from m4a to wav. Defaults to None.
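+
+        Example (mirrors the call in examples/voxceleb/sv0/local/data_prepare.py):
+            train_dataset = VoxCeleb('train', target_dir='./data', vox2_base_path=None)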
+ """
+ assert subset in self.subsets, \
+ 'Dataset subset must be one in {}, but got {}'.format(self.subsets, subset)
+
+ self.subset = subset
+ self.spk_id2label = {}
+ self.feat_type = feat_type
+ self.feat_config = kwargs
+ self.random_chunk = random_chunk
+ self.chunk_duration = chunk_duration
+ self.split_ratio = split_ratio
+ self.target_dir = target_dir if target_dir else VoxCeleb.base_path
+ self.vox2_base_path = vox2_base_path
+
+        # if we set the target dir, the vox data info will be stored in the target dir instead of the base path
+ VoxCeleb.csv_path = os.path.join(
+ target_dir, "voxceleb", 'csv') if target_dir else VoxCeleb.csv_path
+ VoxCeleb.meta_path = os.path.join(
+ target_dir, "voxceleb",
+ 'meta') if target_dir else VoxCeleb.meta_path
+ VoxCeleb.veri_test_file = os.path.join(VoxCeleb.meta_path,
+ 'veri_test2.txt')
+ # self._data = self._get_data()[:1000] # KP: Small dataset test.
+ self._data = self._get_data()
+ super(VoxCeleb, self).__init__()
+
+ # Set up a seed to reproduce training or predicting result.
+ # random.seed(seed)
+
+ def _get_data(self):
+ # Download audio files.
+ # We need the users to decompress all vox1/dev/wav and vox1/test/wav/ to vox1/wav/ dir
+ # so, we check the vox1/wav dir status
+ print(f"wav base path: {self.wav_path}")
+ if not os.path.isdir(self.wav_path):
+ print("start to download the voxceleb1 dataset")
+ download_and_decompress( # multi-zip parts concatenate to vox1_dev_wav.zip
+ self.archieves_audio_dev,
+ self.base_path,
+ decompress=False)
+ download_and_decompress( # download the vox1_test_wav.zip and unzip
+ self.archieves_audio_test,
+ self.base_path,
+ decompress=True)
+
+ # Download all parts and concatenate the files into one zip file.
+ dev_zipfile = os.path.join(self.base_path, 'vox1_dev_wav.zip')
+ print(f'Concatenating all parts to: {dev_zipfile}')
+ os.system(
+ f'cat {os.path.join(self.base_path, "vox1_dev_wav_parta*")} > {dev_zipfile}'
+ )
+
+ # Extract all audio files of dev and test set.
+ decompress(dev_zipfile, self.base_path)
+
+ # Download meta files.
+ if not os.path.isdir(self.meta_path):
+ print("prepare the meta data")
+ download_and_decompress(
+ self.archieves_meta, self.meta_path, decompress=False)
+
+ # Data preparation.
+ if not os.path.isdir(self.csv_path):
+ os.makedirs(self.csv_path)
+ self.prepare_data()
+
+ data = []
+ print(
+ f"read the {self.subset} from {os.path.join(self.csv_path, f'{self.subset}.csv')}"
+ )
+ with open(os.path.join(self.csv_path, f'{self.subset}.csv'), 'r') as rf:
+ for line in rf.readlines()[1:]:
+ audio_id, duration, wav, start, stop, spk_id = line.strip(
+ ).split(',')
+ data.append(
+ self.meta_info(audio_id,
+ float(duration), wav,
+ int(start), int(stop), spk_id))
+
+ with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'r') as f:
+ for line in f.readlines():
+ spk_id, label = line.strip().split(' ')
+ self.spk_id2label[spk_id] = int(label)
+
+ return data
+
+ def _convert_to_record(self, idx: int):
+ sample = self._data[idx]
+
+ record = {}
+ # To show all fields in a namedtuple: `type(sample)._fields`
+ for field in type(sample)._fields:
+ record[field] = getattr(sample, field)
+
+ waveform, sr = load_audio(record['wav'])
+
+        # randomly select a chunk of audio samples from the audio
+ if self.random_chunk:
+ num_wav_samples = waveform.shape[0]
+ num_chunk_samples = int(self.chunk_duration * sr)
+ start = random.randint(0, num_wav_samples - num_chunk_samples - 1)
+ stop = start + num_chunk_samples
+ else:
+ start = record['start']
+ stop = record['stop']
+
+ waveform = waveform[start:stop]
+
+ assert self.feat_type in feat_funcs.keys(), \
+ f"Unknown feat_type: {self.feat_type}, it must be one in {list(feat_funcs.keys())}"
+ feat_func = feat_funcs[self.feat_type]
+ feat = feat_func(
+ waveform, sr=sr, **self.feat_config) if feat_func else waveform
+
+ record.update({'feat': feat})
+ if self.subset in ['train',
+ 'dev']: # Labels are available in train and dev.
+ record.update({'label': self.spk_id2label[record['spk_id']]})
+
+ return record
+
+ @staticmethod
+ def _get_chunks(seg_dur, audio_id, audio_duration):
+        num_chunks = int(audio_duration / seg_dur)  # all in seconds
+
+ chunk_lst = [
+ audio_id + "_" + str(i * seg_dur) + "_" + str(i * seg_dur + seg_dur)
+ for i in range(num_chunks)
+ ]
+ return chunk_lst
+
+ def _get_audio_info(self, wav_file: str,
+ split_chunks: bool) -> List[List[str]]:
+ waveform, sr = load_audio(wav_file)
+ spk_id, sess_id, utt_id = wav_file.split("/")[-3:]
+ audio_id = '-'.join([spk_id, sess_id, utt_id.split(".")[0]])
+ audio_duration = waveform.shape[0] / sr
+
+ ret = []
+ if split_chunks: # Split into pieces of self.chunk_duration seconds.
+ uniq_chunks_list = self._get_chunks(self.chunk_duration, audio_id,
+ audio_duration)
+
+ for chunk in uniq_chunks_list:
+ s, e = chunk.split("_")[-2:] # Timestamps of start and end
+ start_sample = int(float(s) * sr)
+ end_sample = int(float(e) * sr)
+ # id, duration, wav, start, stop, spk_id
+ ret.append([
+ chunk, audio_duration, wav_file, start_sample, end_sample,
+ spk_id
+ ])
+ else: # Keep whole audio.
+ ret.append([
+ audio_id, audio_duration, wav_file, 0, waveform.shape[0], spk_id
+ ])
+ return ret
+
+ def generate_csv(self,
+ wav_files: List[str],
+ output_file: str,
+ split_chunks: bool=True):
+ print(f'Generating csv: {output_file}')
+ header = ["ID", "duration", "wav", "start", "stop", "spk_id"]
+        # Note: this may raise a C++ exception, but the program will execute fine,
+        # so we can ignore the exception
+ with Pool(cpu_count()) as p:
+ infos = list(
+ tqdm(
+ p.imap(lambda x: self._get_audio_info(x, split_chunks),
+ wav_files),
+ total=len(wav_files)))
+
+ csv_lines = []
+ for info in infos:
+ csv_lines.extend(info)
+
+ with open(output_file, mode="w") as csv_f:
+ csv_writer = csv.writer(
+ csv_f, delimiter=",", quotechar='"', quoting=csv.QUOTE_MINIMAL)
+ csv_writer.writerow(header)
+ for line in csv_lines:
+ csv_writer.writerow(line)
+
+ def prepare_data(self):
+ # Audio of speakers in veri_test_file should not be included in training set.
+ print("start to prepare the data csv file")
+ enroll_files = set()
+ test_files = set()
+ # get the enroll and test audio file path
+ with open(self.veri_test_file, 'r') as f:
+ for line in f.readlines():
+ _, enrol_file, test_file = line.strip().split(' ')
+ enroll_files.add(os.path.join(self.wav_path, enrol_file))
+ test_files.add(os.path.join(self.wav_path, test_file))
+ enroll_files = sorted(enroll_files)
+ test_files = sorted(test_files)
+
+ # get the enroll and test speakers
+ test_spks = set()
+ for file in (enroll_files + test_files):
+ spk = file.split('/wav/')[1].split('/')[0]
+ test_spks.add(spk)
+
+ # get all the train and dev audios file path
+ audio_files = []
+ speakers = set()
+ print("Getting file list...")
+ for path in [self.wav_path, self.vox2_base_path]:
+            # if the vox2 directory is not set or does not exist,
+            # we will not process this directory
+ if not path or not os.path.exists(path):
+ print(f"{path} is an invalid path, please check again, "
+ "and we will ignore the vox2 base path")
+ continue
+ for file in glob.glob(
+ os.path.join(path, "**", "*.wav"), recursive=True):
+ spk = file.split('/wav/')[1].split('/')[0]
+ if spk in test_spks:
+ continue
+ speakers.add(spk)
+ audio_files.append(file)
+
+ print(
+ f"start to generate the {os.path.join(self.meta_path, 'spk_id2label.txt')}"
+ )
+ # encode the train and dev speakers label to spk_id2label.txt
+ with open(os.path.join(self.meta_path, 'spk_id2label.txt'), 'w') as f:
+ for label, spk_id in enumerate(
+ sorted(speakers)): # 1211 vox1, 5994 vox2, 7205 vox1+2
+ f.write(f'{spk_id} {label}\n')
+
+ audio_files = sorted(audio_files)
+ random.shuffle(audio_files)
+ split_idx = int(self.split_ratio * len(audio_files))
+ # split_ratio to train
+ train_files, dev_files = audio_files[:split_idx], audio_files[
+ split_idx:]
+
+ self.generate_csv(train_files, os.path.join(self.csv_path, 'train.csv'))
+ self.generate_csv(dev_files, os.path.join(self.csv_path, 'dev.csv'))
+
+ self.generate_csv(
+ enroll_files,
+ os.path.join(self.csv_path, 'enroll.csv'),
+ split_chunks=False)
+ self.generate_csv(
+ test_files,
+ os.path.join(self.csv_path, 'test.csv'),
+ split_chunks=False)
+
+ def __getitem__(self, idx):
+ return self._convert_to_record(idx)
+
+ def __len__(self):
+ return len(self._data)
diff --git a/paddleaudio/paddleaudio/metric/__init__.py b/paddleaudio/paddleaudio/metric/__init__.py
index a96530ff6e51308ae1587f2c2d6fc86cde1d64a8..8e5ca9f757029cf925c335990a4aaa1455f95175 100644
--- a/paddleaudio/paddleaudio/metric/__init__.py
+++ b/paddleaudio/paddleaudio/metric/__init__.py
@@ -12,4 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from .dtw import dtw_distance
+from .eer import compute_eer
+from .eer import compute_minDCF
from .mcd import mcd_distance
diff --git a/paddleaudio/paddleaudio/metric/dtw.py b/paddleaudio/paddleaudio/metric/dtw.py
index c4dc7a283d9b6a82b87e82066bfd36ba06aac0db..662e4506d03fcbdd229b547a2a4d12c09667bb5f 100644
--- a/paddleaudio/paddleaudio/metric/dtw.py
+++ b/paddleaudio/paddleaudio/metric/dtw.py
@@ -24,11 +24,15 @@ def dtw_distance(xs: np.ndarray, ys: np.ndarray) -> float:
This function keeps a compact matrix, not the full warping paths matrix.
Uses dynamic programming to compute:
- wps[i, j] = (s1[i]-s2[j])**2 + min(
- wps[i-1, j ] + penalty, // vertical / insertion / expansion
- wps[i , j-1] + penalty, // horizontal / deletion / compression
- wps[i-1, j-1]) // diagonal / match
- dtw = sqrt(wps[-1, -1])
+ Examples:
+ .. code-block:: python
+
+ wps[i, j] = (s1[i]-s2[j])**2 + min(
+ wps[i-1, j ] + penalty, // vertical / insertion / expansion
+ wps[i , j-1] + penalty, // horizontal / deletion / compression
+ wps[i-1, j-1]) // diagonal / match
+
+ dtw = sqrt(wps[-1, -1])
Args:
xs (np.ndarray): ref sequence, [T,D]
diff --git a/paddleaudio/paddleaudio/metric/eer.py b/paddleaudio/paddleaudio/metric/eer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1166d3f93a2135b692ad822aae8134ffd1f5295
--- /dev/null
+++ b/paddleaudio/paddleaudio/metric/eer.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+import numpy as np
+import paddle
+from sklearn.metrics import roc_curve
+
+
+def compute_eer(labels: np.ndarray, scores: np.ndarray) -> List[float]:
+ """Compute EER and return score threshold.
+
+ Args:
+        labels (np.ndarray): the trial labels, shape: [N], one-dimensional, where N is the number of samples
+        scores (np.ndarray): the trial scores, shape: [N], one-dimensional, where N is the number of samples
+
+ Returns:
+ List[float]: eer and the specific threshold
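+
+    Example (illustrative values only):
+        labels = np.array([1, 1, 0, 0])          # 1 = target trial, 0 = non-target trial
+        scores = np.array([0.9, 0.7, 0.4, 0.1])  # similarity score of each trial
+        eer, threshold = compute_eer(labels, scores)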
+ """
+ fpr, tpr, threshold = roc_curve(y_true=labels, y_score=scores)
+ fnr = 1 - tpr
+ eer_threshold = threshold[np.nanargmin(np.absolute((fnr - fpr)))]
+ eer = fpr[np.nanargmin(np.absolute((fnr - fpr)))]
+ return eer, eer_threshold
+
+
+def compute_minDCF(positive_scores,
+ negative_scores,
+ c_miss=1.0,
+ c_fa=1.0,
+ p_target=0.01):
+ """
+ This is modified from SpeechBrain
+ https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/utils/metric_stats.py#L509
+ Computes the minDCF metric normally used to evaluate speaker verification
+ systems. The min_DCF is the minimum of the following C_det function computed
+ within the defined threshold range:
+
+    C_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
+
+ where p_miss is the missing probability and p_fa is the probability of having
+ a false alarm.
+
+ Args:
+        positive_scores (paddle.Tensor): The scores from entries of the same class.
+        negative_scores (paddle.Tensor): The scores from entries of different classes.
+ c_miss (float, optional): Cost assigned to a missing error (default 1.0).
+ c_fa (float, optional): Cost assigned to a false alarm (default 1.0).
+ p_target (float, optional): Prior probability of having a target (default 0.01).
+
+ Returns:
+ List[float]: min dcf and the specific threshold
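+
+    Example (illustrative values only):
+        positive_scores = paddle.to_tensor([0.9, 0.8, 0.7])  # scores of target trials
+        negative_scores = paddle.to_tensor([0.3, 0.2, 0.1])  # scores of non-target trials
+        min_dcf, threshold = compute_minDCF(positive_scores, negative_scores, p_target=0.01)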
+ """
+ # Computing candidate thresholds
+ if len(positive_scores.shape) > 1:
+ positive_scores = positive_scores.squeeze()
+
+ if len(negative_scores.shape) > 1:
+ negative_scores = negative_scores.squeeze()
+
+ thresholds = paddle.sort(paddle.concat([positive_scores, negative_scores]))
+ thresholds = paddle.unique(thresholds)
+
+ # Adding intermediate thresholds
+ interm_thresholds = (thresholds[0:-1] + thresholds[1:]) / 2
+ thresholds = paddle.sort(paddle.concat([thresholds, interm_thresholds]))
+
+ # Computing False Rejection Rate (miss detection)
+ positive_scores = paddle.concat(
+ len(thresholds) * [positive_scores.unsqueeze(0)])
+ pos_scores_threshold = positive_scores.transpose(perm=[1, 0]) <= thresholds
+ p_miss = (pos_scores_threshold.sum(0)
+ ).astype("float32") / positive_scores.shape[1]
+ del positive_scores
+ del pos_scores_threshold
+
+ # Computing False Acceptance Rate (false alarm)
+ negative_scores = paddle.concat(
+ len(thresholds) * [negative_scores.unsqueeze(0)])
+ neg_scores_threshold = negative_scores.transpose(perm=[1, 0]) > thresholds
+ p_fa = (neg_scores_threshold.sum(0)
+ ).astype("float32") / negative_scores.shape[1]
+ del negative_scores
+ del neg_scores_threshold
+
+ c_det = c_miss * p_miss * p_target + c_fa * p_fa * (1 - p_target)
+ c_min = paddle.min(c_det, axis=0)
+ min_index = paddle.argmin(c_det, axis=0)
+ return float(c_min), float(thresholds[min_index])
diff --git a/paddleaudio/paddleaudio/metric/mcd.py b/paddleaudio/paddleaudio/metric/mcd.py
index 465cd5a45db48d3d1ac33e338436e3531764f61a..63a25fc230766c20114a622544a6e35e2248e014 100644
--- a/paddleaudio/paddleaudio/metric/mcd.py
+++ b/paddleaudio/paddleaudio/metric/mcd.py
@@ -11,6 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+from typing import Callable
+
import mcd.metrics_fast as mt
import numpy as np
from mcd import dtw
@@ -20,29 +22,42 @@ __all__ = [
]
-def mcd_distance(xs: np.ndarray, ys: np.ndarray, cost_fn=mt.logSpecDbDist):
+def mcd_distance(xs: np.ndarray,
+ ys: np.ndarray,
+ cost_fn: Callable=mt.logSpecDbDist) -> float:
"""Mel cepstral distortion (MCD), dtw distance.
Dynamic Time Warping.
Uses dynamic programming to compute:
- wps[i, j] = cost_fn(xs[i], ys[j]) + min(
- wps[i-1, j ], // vertical / insertion / expansion
- wps[i , j-1], // horizontal / deletion / compression
- wps[i-1, j-1]) // diagonal / match
- dtw = sqrt(wps[-1, -1])
+
+ Examples:
+ .. code-block:: python
+
+ wps[i, j] = cost_fn(xs[i], ys[j]) + min(
+ wps[i-1, j ], // vertical / insertion / expansion
+ wps[i , j-1], // horizontal / deletion / compression
+ wps[i-1, j-1]) // diagonal / match
+
+ dtw = sqrt(wps[-1, -1])
Cost Function:
- logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
- def logSpecDbDist(x, y):
- diff = x - y
- return logSpecDbConst * math.sqrt(np.inner(diff, diff))
+ Examples:
+ .. code-block:: python
+
+ logSpecDbConst = 10.0 / math.log(10.0) * math.sqrt(2.0)
+
+ def logSpecDbDist(x, y):
+ diff = x - y
+ return logSpecDbConst * math.sqrt(np.inner(diff, diff))
Args:
xs (np.ndarray): ref sequence, [T,D]
ys (np.ndarray): hyp sequence, [T,D]
+ cost_fn (Callable, optional): Cost function. Defaults to mt.logSpecDbDist.
Returns:
float: dtw distance
"""
+
min_cost, path = dtw.dtw(xs, ys, cost_fn)
return min_cost
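
A short usage sketch of `mcd_distance` on two mel-cepstral sequences of shape [T, D]; the random arrays below stand in for real reference and synthesized cepstra.

```python
import numpy as np
from paddleaudio.metric.mcd import mcd_distance

ref = np.random.randn(120, 25)   # reference cepstra, [T, D]
hyp = np.random.randn(130, 25)   # synthesized cepstra, [T, D]

# Uses the default cost function mt.logSpecDbDist; any Callable(x, y) -> float works.
distance = mcd_distance(ref, hyp)
print("MCD (dtw) distance:", distance)
```
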
diff --git a/paddleaudio/paddleaudio/utils/download.py b/paddleaudio/paddleaudio/utils/download.py
index 4658352f948f496a1420b76916ab5a5d15016adf..07d5eea845ff67dfd444794905d811dbf5fb8522 100644
--- a/paddleaudio/paddleaudio/utils/download.py
+++ b/paddleaudio/paddleaudio/utils/download.py
@@ -37,7 +37,9 @@ def decompress(file: str):
download._decompress(file)
-def download_and_decompress(archives: List[Dict[str, str]], path: str):
+def download_and_decompress(archives: List[Dict[str, str]],
+ path: str,
+ decompress: bool=True):
"""
Download archieves and decompress to specific path.
"""
@@ -47,8 +49,8 @@ def download_and_decompress(archives: List[Dict[str, str]], path: str):
for archive in archives:
assert 'url' in archive and 'md5' in archive, \
'Dictionary keys of "url" and "md5" are required in the archive, but got: {list(archieve.keys())}'
-
- download.get_path_from_url(archive['url'], path, archive['md5'])
+ download.get_path_from_url(
+ archive['url'], path, archive['md5'], decompress=decompress)
def load_state_dict_from_url(url: str, path: str, md5: str=None):
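
A sketch of the new `decompress` flag: an archive can now be fetched and md5-checked without being unpacked. The URL and md5 below are placeholders, not real resources.

```python
from paddleaudio.utils.download import download_and_decompress

archives = [{
    "url": "https://example.com/dummy_model.tar.gz",   # placeholder URL
    "md5": "0123456789abcdef0123456789abcdef",         # placeholder md5
}]

# Download and verify only; skip extraction by passing decompress=False.
download_and_decompress(archives, path="./pretrained", decompress=False)
```
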
diff --git a/paddleaudio/setup.py b/paddleaudio/setup.py
index 930f86e41e551073d96f4046587133c7ac88d6b2..e08b88a3bdc9901bcacf63d2bab4157911fd80bf 100644
--- a/paddleaudio/setup.py
+++ b/paddleaudio/setup.py
@@ -82,13 +82,9 @@ setuptools.setup(
],
python_requires='>=3.6',
install_requires=[
- 'numpy >= 1.15.0',
- 'scipy >= 1.0.0',
- 'resampy >= 0.2.2',
- 'soundfile >= 0.9.0',
- 'colorlog',
- 'dtaidistance >= 2.3.6',
- 'mcd >= 0.4',
+ 'numpy >= 1.15.0', 'scipy >= 1.0.0', 'resampy >= 0.2.2',
+ 'soundfile >= 0.9.0', 'colorlog', 'dtaidistance == 2.3.1', 'mcd >= 0.4',
+ 'pathos'
],
extras_require={
'test': [
diff --git a/paddlespeech/cli/README.md b/paddlespeech/cli/README.md
index 5ac7a3bcaf1709b94020715d4480c08cf98cc3f0..19c822040de6699123781f14b6eac5bcf3ca15a6 100644
--- a/paddlespeech/cli/README.md
+++ b/paddlespeech/cli/README.md
@@ -13,6 +13,12 @@
paddlespeech cls --input input.wav
```
+ ## Speaker Verification
+
+ ```bash
+ paddlespeech vector --task spk --input input_16k.wav
+ ```
+
## Automatic Speech Recognition
```
paddlespeech asr --lang zh --input input_16k.wav
diff --git a/paddlespeech/cli/README_cn.md b/paddlespeech/cli/README_cn.md
index 75ab9e41b10152446db762b1b4ed1c180cd49967..4b15d6c7bc68a39075aba7efb37a04e687b5ab35 100644
--- a/paddlespeech/cli/README_cn.md
+++ b/paddlespeech/cli/README_cn.md
@@ -12,6 +12,12 @@
## 声音分类
```bash
paddlespeech cls --input input.wav
+ ```
+
+ ## 声纹识别
+
+ ```bash
+ paddlespeech vector --task spk --input input_16k.wav
```
## 语音识别
diff --git a/paddlespeech/cli/__init__.py b/paddlespeech/cli/__init__.py
index b526a3849b0ed5deddd519e7a0573a592c743d2f..ddf0359bc5fcb7ff80b437a65112869d7faa12eb 100644
--- a/paddlespeech/cli/__init__.py
+++ b/paddlespeech/cli/__init__.py
@@ -21,5 +21,6 @@ from .st import STExecutor
from .stats import StatsExecutor
from .text import TextExecutor
from .tts import TTSExecutor
+from .vector import VectorExecutor
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
diff --git a/paddlespeech/cli/tts/infer.py b/paddlespeech/cli/tts/infer.py
index 78eae769bee752dd72d6ec377e0be64a2c0f7fc0..c7a1edc93325166f7f3eeb172d577a5353eeb234 100644
--- a/paddlespeech/cli/tts/infer.py
+++ b/paddlespeech/cli/tts/infer.py
@@ -237,6 +237,18 @@ pretrained_models = {
'speech_stats':
'feats_stats.npy',
},
+ "hifigan_ljspeech-en": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_ljspeech_ckpt_0.2.0.zip',
+ 'md5':
+ '70e9131695decbca06a65fe51ed38a72',
+ 'config':
+ 'default.yaml',
+ 'ckpt':
+ 'snapshot_iter_2500000.pdz',
+ 'speech_stats':
+ 'feats_stats.npy',
+ },
"hifigan_aishell3-zh": {
'url':
'https://paddlespeech.bj.bcebos.com/Parakeet/released_models/hifigan/hifigan_aishell3_ckpt_0.2.0.zip',
@@ -389,6 +401,7 @@ class TTSExecutor(BaseExecutor):
'mb_melgan_csmsc',
'style_melgan_csmsc',
'hifigan_csmsc',
+ 'hifigan_ljspeech',
'hifigan_aishell3',
'hifigan_vctk',
'wavernn_csmsc',
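
A hedged Python-API sketch for the newly registered `hifigan_ljspeech` vocoder; the `fastspeech2_ljspeech` acoustic model and the keyword names (`am`, `voc`, `lang`, `output`) are assumed from the existing TTS CLI and may need adjusting.

```python
from paddlespeech.cli import TTSExecutor

tts = TTSExecutor()
tts(
    text="Hello, this sentence is synthesized with the LJSpeech HiFi-GAN vocoder.",
    am="fastspeech2_ljspeech",   # assumed LJSpeech acoustic model
    voc="hifigan_ljspeech",      # vocoder registered in this change
    lang="en",
    output="output.wav")
```
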
diff --git a/paddlespeech/cli/vector/__init__.py b/paddlespeech/cli/vector/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..038596af02cc3e74d0446f7d279ef8016b429255
--- /dev/null
+++ b/paddlespeech/cli/vector/__init__.py
@@ -0,0 +1,14 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from .infer import VectorExecutor
diff --git a/paddlespeech/cli/vector/infer.py b/paddlespeech/cli/vector/infer.py
new file mode 100644
index 0000000000000000000000000000000000000000..175a9723e1bd811be97de5995996d79f0ef19307
--- /dev/null
+++ b/paddlespeech/cli/vector/infer.py
@@ -0,0 +1,448 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import sys
+from collections import OrderedDict
+from typing import List
+from typing import Optional
+from typing import Union
+
+import paddle
+import soundfile
+from yacs.config import CfgNode
+
+from ..executor import BaseExecutor
+from ..log import logger
+from ..utils import cli_register
+from ..utils import download_and_decompress
+from ..utils import MODEL_HOME
+from ..utils import stats_wrapper
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
+from paddlespeech.s2t.utils.dynamic_import import dynamic_import
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+
+pretrained_models = {
+ # The tags for pretrained_models should be "{model_name}[-{dataset}][-{sr}][-...]".
+ # e.g. "ecapatdnn_voxceleb12-16k".
+ # Command line and python api use "{model_name}[-{dataset}]" as --model, usage:
+ # "paddlespeech vector --task spk --model ecapatdnn_voxceleb12 --sample_rate 16000 --input ./input.wav"
+ "ecapatdnn_voxceleb12-16k": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/vector/voxceleb/sv0_ecapa_tdnn_voxceleb12_ckpt_0_1_1.tar.gz',
+ 'md5':
+ 'a1c0dba7d4de997187786ff517d5b4ec',
+ 'cfg_path':
+ 'conf/model.yaml', # the yaml config path
+ 'ckpt_path':
+ 'model/model', # the format is ${dir}/{model_name},
+ # so the first 'model' is dir, the second 'model' is the name
+ # this means we have a model stored as model/model.pdparams
+ },
+}
+
+model_alias = {
+ "ecapatdnn": "paddlespeech.vector.models.ecapa_tdnn:EcapaTdnn",
+}
+
+
+@cli_register(
+ name="paddlespeech.vector",
+ description="Speech to vector embedding infer command.")
+class VectorExecutor(BaseExecutor):
+ def __init__(self):
+ super(VectorExecutor, self).__init__()
+
+ self.parser = argparse.ArgumentParser(
+ prog="paddlespeech.vector", add_help=True)
+
+ self.parser.add_argument(
+ "--model",
+ type=str,
+ default="ecapatdnn_voxceleb12",
+ choices=["ecapatdnn_voxceleb12"],
+ help="Choose model type of vector task.")
+ self.parser.add_argument(
+ "--task",
+ type=str,
+ default="spk",
+ choices=["spk"],
+ help="task type in vector domain")
+ self.parser.add_argument(
+ "--input",
+ type=str,
+ default=None,
+ help="Audio file to extract embedding.")
+ self.parser.add_argument(
+ "--sample_rate",
+ type=int,
+ default=16000,
+ choices=[16000],
+ help="Choose the audio sample rate of the model. Only 16000 Hz is supported for now.")
+ self.parser.add_argument(
+ "--ckpt_path",
+ type=str,
+ default=None,
+ help="Checkpoint file of model.")
+ self.parser.add_argument(
+ '--config',
+ type=str,
+ default=None,
+ help='Config of vector task. Use default config when it is None.')
+ self.parser.add_argument(
+ "--device",
+ type=str,
+ default=paddle.get_device(),
+ help="Choose device to execute model inference.")
+ self.parser.add_argument(
+ '-d',
+ '--job_dump_result',
+ action='store_true',
+ help='Save job result into file.')
+
+ self.parser.add_argument(
+ '-v',
+ '--verbose',
+ action='store_true',
+ help='Increase logger verbosity of current task.')
+
+ def execute(self, argv: List[str]) -> bool:
+ """Command line entry for vector model
+
+ Args:
+ argv (List[str]): command line args list
+
+ Returns:
+ bool:
+ False: some audio file failed to process
+ True: all audio files processed successfully
+ """
+ # stage 0: parse the args and get the required args
+ parser_args = self.parser.parse_args(argv)
+ model = parser_args.model
+ sample_rate = parser_args.sample_rate
+ config = parser_args.config
+ ckpt_path = parser_args.ckpt_path
+ device = parser_args.device
+
+ # stage 1: configure the verbose flag
+ if not parser_args.verbose:
+ self.disable_task_loggers()
+
+ # stage 2: read the input data and store them as a list
+ task_source = self.get_task_source(parser_args.input)
+ logger.info(f"task source: {task_source}")
+
+ # stage 3: process the audio one by one
+ task_result = OrderedDict()
+ has_exceptions = False
+ for id_, input_ in task_source.items():
+ try:
+ res = self(input_, model, sample_rate, config, ckpt_path,
+ device)
+ task_result[id_] = res
+ except Exception as e:
+ has_exceptions = True
+ task_result[id_] = f'{e.__class__.__name__}: {e}'
+
+ logger.info("task result as follows: ")
+ logger.info(f"{task_result}")
+
+ # stage 4: process all the task results
+ self.process_task_results(parser_args.input, task_result,
+ parser_args.job_dump_result)
+
+ # stage 5: return the exception flag
+ # if False is returned, some audio failed to process
+ if has_exceptions:
+ return False
+ else:
+ return True
+
+ @stats_wrapper
+ def __call__(self,
+ audio_file: os.PathLike,
+ model: str='ecapatdnn_voxceleb12',
+ sample_rate: int=16000,
+ config: os.PathLike=None,
+ ckpt_path: os.PathLike=None,
+ device=paddle.get_device()):
+ """Extract the audio embedding
+
+ Args:
+ audio_file (os.PathLike): audio path,
+ whose format must be wav and whose sample rate must match the model
+ model (str, optional): model type, which is loaded from the pretrained model list.
+ Defaults to 'ecapatdnn_voxceleb12'.
+ sample_rate (int, optional): model sample rate. Defaults to 16000.
+ config (os.PathLike, optional): yaml config. Defaults to None.
+ ckpt_path (os.PathLike, optional): pretrained model path. Defaults to None.
+ device (optional): paddle running host device. Defaults to paddle.get_device().
+
+ Returns:
+ numpy.ndarray: the audio embedding extracted from the input file
+ """
+ # stage 0: check the audio format
+ audio_file = os.path.abspath(audio_file)
+ if not self._check(audio_file, sample_rate):
+ sys.exit(-1)
+
+ # stage 1: set the paddle runtime host device
+ logger.info(f"device type: {device}")
+ paddle.device.set_device(device)
+
+ # stage 2: read the specific pretrained model
+ self._init_from_path(model, sample_rate, config, ckpt_path)
+
+ # stage 3: preprocess the audio and get the audio feat
+ self.preprocess(model, audio_file)
+
+ # stage 4: infer the model and get the audio embedding
+ self.infer(model)
+
+ # stage 5: process the result and set them to output dict
+ res = self.postprocess()
+
+ return res
+
+ def _get_pretrained_path(self, tag: str) -> os.PathLike:
+ """get the neural network path from the pretrained model list
+ all the pretrained models are stored in the variable `pretrained_models`
+
+ Args:
+ tag (str): model tag in the pretrained model list
+
+ Returns:
+ os.PathLike: the downloaded pretrained model path in the disk
+ """
+ support_models = list(pretrained_models.keys())
+ assert tag in pretrained_models, \
+ 'The model "{}" you want to use is not supported, '\
+ 'please choose other models.\n' \
+ 'The supported models include\n\t\t{}'.format(tag, "\n\t\t".join(support_models))
+
+ res_path = os.path.join(MODEL_HOME, tag)
+ decompressed_path = download_and_decompress(pretrained_models[tag],
+ res_path)
+
+ decompressed_path = os.path.abspath(decompressed_path)
+ logger.info(
+ 'Use pretrained model stored in: {}'.format(decompressed_path))
+
+ return decompressed_path
+
+ def _init_from_path(self,
+ model_type: str='ecapatdnn_voxceleb12',
+ sample_rate: int=16000,
+ cfg_path: Optional[os.PathLike]=None,
+ ckpt_path: Optional[os.PathLike]=None):
+ """Init the neural network from the model path
+
+ Args:
+ model_type (str, optional): model tag in the pretrained model list.
+ Defaults to 'ecapatdnn_voxceleb12'.
+ sample_rate (int, optional): model sample rate.
+ Defaults to 16000.
+ cfg_path (Optional[os.PathLike], optional): yaml config file path.
+ Defaults to None.
+ ckpt_path (Optional[os.PathLike], optional): the pretrained model path, which is stored in the disk.
+ Defaults to None.
+ """
+ # stage 0: avoid initializing the model again
+ if hasattr(self, "model"):
+ logger.info("Model has been initialized")
+ return
+
+ # stage 1: get the model and config path
+ # if we want init the network from the model stored in the disk,
+ # we must pass the config path and the ckpt model path
+ if cfg_path is None or ckpt_path is None:
+ # get the model tag from the pretrained list
+ sample_rate_str = "16k" if sample_rate == 16000 else "8k"
+ tag = model_type + "-" + sample_rate_str
+ logger.info(f"load the pretrained model: {tag}")
+ # get the model from the pretrained list
+ # we download the pretrained model and store it in the res_path
+ res_path = self._get_pretrained_path(tag)
+ self.res_path = res_path
+
+ self.cfg_path = os.path.join(res_path,
+ pretrained_models[tag]['cfg_path'])
+ self.ckpt_path = os.path.join(
+ res_path, pretrained_models[tag]['ckpt_path'] + '.pdparams')
+ else:
+ # get the model from disk
+ self.cfg_path = os.path.abspath(cfg_path)
+ self.ckpt_path = os.path.abspath(ckpt_path + ".pdparams")
+ self.res_path = os.path.dirname(
+ os.path.dirname(os.path.abspath(self.cfg_path)))
+
+ logger.info(f"start to read the ckpt from {self.ckpt_path}")
+ logger.info(f"read the config from {self.cfg_path}")
+ logger.info(f"get the res path {self.res_path}")
+
+ # stage 2: read the config and init the model body
+ self.config = CfgNode(new_allowed=True)
+ self.config.merge_from_file(self.cfg_path)
+
+ # stage 3: get the model name to instance the model network with dynamic_import
+ logger.info("start to dynamically import the model class")
+ model_name = model_type[:model_type.rindex('_')]
+ logger.info(f"model name {model_name}")
+ model_class = dynamic_import(model_name, model_alias)
+ model_conf = self.config.model
+ backbone = model_class(**model_conf)
+ model = SpeakerIdetification(
+ backbone=backbone, num_class=self.config.num_speakers)
+ self.model = model
+ self.model.eval()
+
+ # stage 4: load the model parameters
+ logger.info("start to load the parameters into the model")
+ model_dict = paddle.load(self.ckpt_path)
+ self.model.set_state_dict(model_dict)
+
+ logger.info("the model instance is created successfully")
+
+ @paddle.no_grad()
+ def infer(self, model_type: str):
+ """Infer the model to get the embedding
+
+ Args:
+ model_type (str): speaker verification model type
+ """
+ # stage 0: get the feat and length from _inputs
+ feats = self._inputs["feats"]
+ lengths = self._inputs["lengths"]
+ logger.info("start the backbone network forward pass")
+ logger.info(
+ f"feats shape:{feats.shape}, lengths shape: {lengths.shape}")
+
+ # stage 1: get the audio embedding
+ # embedding from (1, emb_size, 1) -> (emb_size)
+ embedding = self.model.backbone(feats, lengths).squeeze().numpy()
+ logger.info(f"embedding size: {embedding.shape}")
+
+ # stage 2: put the embedding and dim info to _outputs property
+ # the embedding type is numpy.ndarray
+ self._outputs["embedding"] = embedding
+
+ def postprocess(self) -> Union[str, os.PathLike]:
+ """Return the audio embedding info
+
+ Returns:
+ numpy.ndarray: the extracted audio embedding
+ """
+ embedding = self._outputs["embedding"]
+ return embedding
+
+ def preprocess(self, model_type: str, input_file: Union[str, os.PathLike]):
+ """Extract the audio feat
+
+ Args:
+ model_type (str): speaker verification model type
+ input_file (Union[str, os.PathLike]): audio file path
+ """
+ audio_file = input_file
+ if isinstance(audio_file, (str, os.PathLike)):
+ logger.info(f"Preprocess audio file: {audio_file}")
+
+ # stage 1: load the audio sample points
+ # Note: this process must match the training process
+ waveform, sr = load_audio(audio_file)
+ logger.info(f"load the audio sample points, shape is: {waveform.shape}")
+
+ # stage 2: get the audio feat
+ # Note: Now we only support fbank feature
+ try:
+ feat = melspectrogram(
+ x=waveform,
+ sr=self.config.sr,
+ n_mels=self.config.n_mels,
+ window_size=self.config.window_size,
+ hop_length=self.config.hop_size)
+ logger.info(f"extract the audio feat, shape is: {feat.shape}")
+ except Exception as e:
+ logger.info(f"feature extraction failed: {e}")
+ sys.exit(-1)
+
+ feat = paddle.to_tensor(feat).unsqueeze(0)
+ # at inference time, the lengths are all ones since there is no padding
+ lengths = paddle.ones([1])
+
+ # stage 3: feature normalization
+ # currently we always apply mean normalization to the feature
+ feat = feature_normalize(feat, mean_norm=True, std_norm=False)
+
+ # stage 4: store the feat and length in the _inputs,
+ # which will be used in other function
+ logger.info(f"feats shape: {feat.shape}")
+ self._inputs["feats"] = feat
+ self._inputs["lengths"] = lengths
+
+ logger.info("the audio feature is extracted successfully")
+
+ def _check(self, audio_file: str, sample_rate: int):
+ """Check whether the audio sample rate matches the model sample rate
+
+ Args:
+ audio_file (str): path of the audio file from which the embedding will be extracted
+ sample_rate (int): the desired model sample rate
+
+ Returns:
+ bool: True if the audio file is valid and its sample rate matches the model
+ """
+ self.sample_rate = sample_rate
+ if self.sample_rate != 16000 and self.sample_rate != 8000:
+ logger.error(
+ "invalid sample rate, please input --sample_rate 8000 or --sample_rate 16000")
+ return False
+
+ if isinstance(audio_file, (str, os.PathLike)):
+ if not os.path.isfile(audio_file):
+ logger.error("Please input the right audio file path")
+ return False
+
+ logger.info("checking the audio file format ...")
+ try:
+ audio, audio_sample_rate = soundfile.read(
+ audio_file, dtype="float32", always_2d=True)
+ except Exception as e:
+ logger.exception(e)
+ logger.error(
+ "can not open the audio file, please check that the audio file is in 'wav' format. \n \
+ you can try to use sox to change the file format.\n \
+ For example: \n \
+ sample rate: 16k \n \
+ sox input_audio.xx --rate 16k --bits 16 --channels 1 output_audio.wav \n \
+ sample rate: 8k \n \
+ sox input_audio.xx --rate 8k --bits 16 --channels 1 output_audio.wav \n \
+ ")
+ return False
+
+ logger.info(f"The sample rate is {audio_sample_rate}")
+
+ if audio_sample_rate != self.sample_rate:
+ logger.error("The sample rate of the input file is not {}.\n \
+ Please input a {} Hz, 16 bit, 1 channel wav file,\n \
+ for example one converted with sox as described above. \
+ ".format(self.sample_rate, self.sample_rate))
+ sys.exit(-1)
+ else:
+ logger.info("The audio file format is right")
+
+ return True
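
A Python-API sketch for the new speaker-embedding executor, mirroring the CLI call `paddlespeech vector --task spk --input input_16k.wav`; the wav path is a placeholder and the pretrained model is downloaded on first use.

```python
import paddle
from paddlespeech.cli import VectorExecutor

vector_executor = VectorExecutor()
embedding = vector_executor(
    audio_file="./input_16k.wav",   # 16 kHz, 16 bit, mono wav (placeholder path)
    model="ecapatdnn_voxceleb12",
    sample_rate=16000,
    device=paddle.get_device())
print("embedding shape:", embedding.shape)  # numpy.ndarray, e.g. (192,)
```
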
diff --git a/paddlespeech/s2t/decoders/recog.py b/paddlespeech/s2t/decoders/recog.py
index 88955eacb16f0e8a042143d06193484f1440f5ca..2d2aa2109191d80bc1c85f0153d0f5ec80e421aa 100644
--- a/paddlespeech/s2t/decoders/recog.py
+++ b/paddlespeech/s2t/decoders/recog.py
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+# Modified from espnet(https://github.com/espnet/espnet)
"""V2 backend for `asr_recog.py` using py:class:`decoders.beam_search.BeamSearch`."""
import jsonlines
import paddle
diff --git a/paddlespeech/s2t/decoders/recog_bin.py b/paddlespeech/s2t/decoders/recog_bin.py
index cd7a360ae253cdcdf59e641f9adefc4ca87dc299..37b49f3a059fa29d177d3db0f9be973450660430 100644
--- a/paddlespeech/s2t/decoders/recog_bin.py
+++ b/paddlespeech/s2t/decoders/recog_bin.py
@@ -12,15 +12,16 @@
# See the License for the specific language governing permissions and
# limitations under the License.
# Reference espnet Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
+# Modified from espnet(https://github.com/espnet/espnet)
"""End-to-end speech recognition model decoding script."""
import logging
import os
import random
import sys
-from distutils.util import strtobool
import configargparse
import numpy as np
+from distutils.util import strtobool
def get_parser():
diff --git a/paddlespeech/s2t/exps/u2/model.py b/paddlespeech/s2t/exps/u2/model.py
index d7bee6d7fe753554916d6b32e38756004507a49f..efcc9629fdbf63981cfdc4cc5b91693e5f3a85ee 100644
--- a/paddlespeech/s2t/exps/u2/model.py
+++ b/paddlespeech/s2t/exps/u2/model.py
@@ -239,7 +239,7 @@ class U2Trainer(Trainer):
n_iter_processes=config.num_workers,
subsampling_factor=1,
num_encs=1,
- dist_sampler=False,
+ dist_sampler=config.get('dist_sampler', False),
shortest_first=False)
self.valid_loader = BatchDataLoader(
@@ -260,7 +260,7 @@ class U2Trainer(Trainer):
n_iter_processes=config.num_workers,
subsampling_factor=1,
num_encs=1,
- dist_sampler=False,
+ dist_sampler=config.get('dist_sampler', False),
shortest_first=False)
logger.info("Setup train/valid Dataloader!")
else:
diff --git a/paddlespeech/s2t/frontend/audio.py b/paddlespeech/s2t/frontend/audio.py
index d0368cc8d229e2e298a6fabffa8af09af2f7cbb1..7f71e5dd947621621d8c02f72984e8269aa1940f 100644
--- a/paddlespeech/s2t/frontend/audio.py
+++ b/paddlespeech/s2t/frontend/audio.py
@@ -208,6 +208,18 @@ class AudioSegment():
io.BytesIO(bytes), dtype='float32')
return cls(samples, sample_rate)
+ @classmethod
+ def from_pcm(cls, samples, sample_rate):
+ """Create audio segment from a byte string containing audio samples.
+ :param samples: Audio samples [num_samples x num_channels].
+ :type samples: numpy.ndarray
+ :param sample_rate: Audio sample rate.
+ :type sample_rate: int
+ :return: Audio segment instance.
+ :rtype: AudioSegment
+ """
+ return cls(samples, sample_rate)
+
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of audio segments together.
diff --git a/paddlespeech/s2t/frontend/speech.py b/paddlespeech/s2t/frontend/speech.py
index 8fd661c9246cbe973102d348d074a2ba3c1b20cc..96997104741ec4e36390674f8e45086b2db588a2 100644
--- a/paddlespeech/s2t/frontend/speech.py
+++ b/paddlespeech/s2t/frontend/speech.py
@@ -107,6 +107,27 @@ class SpeechSegment(AudioSegment):
return cls(audio.samples, audio.sample_rate, transcript, tokens,
token_ids)
+ @classmethod
+ def from_pcm(cls,
+ samples,
+ sample_rate,
+ transcript,
+ tokens=None,
+ token_ids=None):
+ """Create speech segment from pcm on online mode
+ Args:
+ samples (numpy.ndarray): Audio samples [num_samples x num_channels].
+ sample_rate (int): Audio sample rate.
+ transcript (str): Transcript text for the speech.
+ tokens (List[str], optional): text tokens. Defaults to None.
+ token_ids (List[int], optional): text token ids. Defaults to None.
+ Returns:
+ SpeechSegment: Speech segment instance.
+ """
+ audio = AudioSegment.from_pcm(samples, sample_rate)
+ return cls(audio.samples, audio.sample_rate, transcript, tokens,
+ token_ids)
+
@classmethod
def concatenate(cls, *segments):
"""Concatenate an arbitrary number of speech segments together, both
diff --git a/paddlespeech/s2t/models/u2/u2.py b/paddlespeech/s2t/models/u2/u2.py
index 910798127ee5c8c7c00893b603ec3ef95dc5be26..6a98607b69721b63b02c833932b74ab77913d078 100644
--- a/paddlespeech/s2t/models/u2/u2.py
+++ b/paddlespeech/s2t/models/u2/u2.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""U2 ASR Model
Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition
(https://arxiv.org/pdf/2012.05481.pdf)
@@ -36,6 +37,7 @@ from paddlespeech.s2t.modules.ctc import CTCDecoderBase
from paddlespeech.s2t.modules.decoder import TransformerDecoder
from paddlespeech.s2t.modules.encoder import ConformerEncoder
from paddlespeech.s2t.modules.encoder import TransformerEncoder
+from paddlespeech.s2t.modules.initializer import DefaultInitializerContext
from paddlespeech.s2t.modules.loss import LabelSmoothingLoss
from paddlespeech.s2t.modules.mask import make_pad_mask
from paddlespeech.s2t.modules.mask import mask_finished_preds
@@ -72,6 +74,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
assert 0.0 <= ctc_weight <= 1.0, ctc_weight
nn.Layer.__init__(self)
+
# note that eos is the same as sos (equivalent ID)
self.sos = vocab_size - 1
self.eos = vocab_size - 1
@@ -780,9 +783,12 @@ class U2DecodeModel(U2BaseModel):
class U2Model(U2DecodeModel):
def __init__(self, configs: dict):
- vocab_size, encoder, decoder, ctc = U2Model._init_from_config(configs)
-
model_conf = configs.get('model_conf', dict())
+ init_type = model_conf.get("init_type", None)
+ with DefaultInitializerContext(init_type):
+ vocab_size, encoder, decoder, ctc = U2Model._init_from_config(
+ configs)
+
super().__init__(
vocab_size=vocab_size,
encoder=encoder,
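
A small sketch of how the new `init_type` hook is consumed: when `model_conf` carries `init_type`, U2Model builds its submodules inside `DefaultInitializerContext`, so the aligned layers fall back to Kaiming-uniform initialization. The config fragment below is illustrative, not a complete U2 configuration.

```python
from paddlespeech.s2t.modules.initializer import DefaultInitializerContext

configs = {"model_conf": {"init_type": "kaiming_uniform"}}
init_type = configs["model_conf"].get("init_type", None)

with DefaultInitializerContext(init_type):
    # modules constructed here (e.g. via U2Model._init_from_config) see
    # paddlespeech.s2t.modules.align.global_init_type == "kaiming_uniform"
    pass
```
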
diff --git a/paddlespeech/s2t/models/u2/updater.py b/paddlespeech/s2t/models/u2/updater.py
index bb18fe4160d24fb39196f0286397b879d8069d31..c59090a84ee4d416353eff3d6049ff3451cf0dae 100644
--- a/paddlespeech/s2t/models/u2/updater.py
+++ b/paddlespeech/s2t/models/u2/updater.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# Modified from wenet(https://github.com/wenet-e2e/wenet)
from contextlib import nullcontext
import paddle
diff --git a/paddlespeech/s2t/models/u2_st/u2_st.py b/paddlespeech/s2t/models/u2_st/u2_st.py
index 999723e5100309976c1b89cbf256ac106d8829e6..6447753c50f0f27bbfc3ed87495ec8cd42d79c59 100644
--- a/paddlespeech/s2t/models/u2_st/u2_st.py
+++ b/paddlespeech/s2t/models/u2_st/u2_st.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# Modified from wenet(https://github.com/wenet-e2e/wenet)
"""U2 ASR Model
Unified Streaming and Non-streaming Two-pass End-to-end Model for Speech Recognition
(https://arxiv.org/pdf/2012.05481.pdf)
diff --git a/paddlespeech/s2t/modules/activation.py b/paddlespeech/s2t/modules/activation.py
index 4081f7f81a5ca9a0b8594ff01cff23ef6d3eac94..2f387b0d99b68ed5d37cb05a13a030ad49aaa381 100644
--- a/paddlespeech/s2t/modules/activation.py
+++ b/paddlespeech/s2t/modules/activation.py
@@ -17,6 +17,8 @@ import paddle
from paddle import nn
from paddle.nn import functional as F
+from paddlespeech.s2t.modules.align import Conv2D
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -51,7 +53,7 @@ class LinearGLUBlock(nn.Layer):
idim (int): input and output dimension
"""
super().__init__()
- self.fc = nn.Linear(idim, idim * 2)
+ self.fc = Linear(idim, idim * 2)
def forward(self, xs):
return glu(self.fc(xs), dim=-1)
@@ -75,7 +77,7 @@ class ConvGLUBlock(nn.Layer):
self.conv_residual = None
if in_ch != out_ch:
self.conv_residual = nn.utils.weight_norm(
- nn.Conv2D(
+ Conv2D(
in_channels=in_ch, out_channels=out_ch, kernel_size=(1, 1)),
name='weight',
dim=0)
@@ -86,7 +88,7 @@ class ConvGLUBlock(nn.Layer):
layers = OrderedDict()
if bottlececk_dim == 0:
layers['conv'] = nn.utils.weight_norm(
- nn.Conv2D(
+ Conv2D(
in_channels=in_ch,
out_channels=out_ch * 2,
kernel_size=(kernel_size, 1)),
@@ -106,7 +108,7 @@ class ConvGLUBlock(nn.Layer):
dim=0)
layers['dropout_in'] = nn.Dropout(p=dropout)
layers['conv_bottleneck'] = nn.utils.weight_norm(
- nn.Conv2D(
+ Conv2D(
in_channels=bottlececk_dim,
out_channels=bottlececk_dim,
kernel_size=(kernel_size, 1)),
@@ -115,7 +117,7 @@ class ConvGLUBlock(nn.Layer):
layers['dropout'] = nn.Dropout(p=dropout)
layers['glu'] = GLU()
layers['conv_out'] = nn.utils.weight_norm(
- nn.Conv2D(
+ Conv2D(
in_channels=bottlececk_dim,
out_channels=out_ch * 2,
kernel_size=(1, 1)),
diff --git a/paddlespeech/s2t/modules/align.py b/paddlespeech/s2t/modules/align.py
new file mode 100644
index 0000000000000000000000000000000000000000..f889167936115ccc7267037d9046765f83b403bd
--- /dev/null
+++ b/paddlespeech/s2t/modules/align.py
@@ -0,0 +1,139 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+from paddle import nn
+
+from paddlespeech.s2t.modules.initializer import KaimingUniform
+"""
+ To align the initializer between paddle and torch,
+ the APIs below set a default initializer with higher priority than the global initializer.
+"""
+global_init_type = None
+
+
+class LayerNorm(nn.LayerNorm):
+ def __init__(self,
+ normalized_shape,
+ epsilon=1e-05,
+ weight_attr=None,
+ bias_attr=None,
+ name=None):
+ if weight_attr is None:
+ weight_attr = paddle.ParamAttr(
+ initializer=nn.initializer.Constant(1.0))
+ if bias_attr is None:
+ bias_attr = paddle.ParamAttr(
+ initializer=nn.initializer.Constant(0.0))
+ super(LayerNorm, self).__init__(normalized_shape, epsilon, weight_attr,
+ bias_attr, name)
+
+
+class BatchNorm1D(nn.BatchNorm1D):
+ def __init__(self,
+ num_features,
+ momentum=0.9,
+ epsilon=1e-05,
+ weight_attr=None,
+ bias_attr=None,
+ data_format='NCL',
+ name=None):
+ if weight_attr is None:
+ weight_attr = paddle.ParamAttr(
+ initializer=nn.initializer.Constant(1.0))
+ if bias_attr is None:
+ bias_attr = paddle.ParamAttr(
+ initializer=nn.initializer.Constant(0.0))
+ super(BatchNorm1D,
+ self).__init__(num_features, momentum, epsilon, weight_attr,
+ bias_attr, data_format, name)
+
+
+class Embedding(nn.Embedding):
+ def __init__(self,
+ num_embeddings,
+ embedding_dim,
+ padding_idx=None,
+ sparse=False,
+ weight_attr=None,
+ name=None):
+ if weight_attr is None:
+ weight_attr = paddle.ParamAttr(initializer=nn.initializer.Normal())
+ super(Embedding, self).__init__(num_embeddings, embedding_dim,
+ padding_idx, sparse, weight_attr, name)
+
+
+class Linear(nn.Linear):
+ def __init__(self,
+ in_features,
+ out_features,
+ weight_attr=None,
+ bias_attr=None,
+ name=None):
+ if weight_attr is None:
+ if global_init_type == "kaiming_uniform":
+ weight_attr = paddle.ParamAttr(initializer=KaimingUniform())
+ if bias_attr is None:
+ if global_init_type == "kaiming_uniform":
+ bias_attr = paddle.ParamAttr(initializer=KaimingUniform())
+ super(Linear, self).__init__(in_features, out_features, weight_attr,
+ bias_attr, name)
+
+
+class Conv1D(nn.Conv1D):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ padding_mode='zeros',
+ weight_attr=None,
+ bias_attr=None,
+ data_format='NCL'):
+ if weight_attr is None:
+ if global_init_type == "kaiming_uniform":
+ print("set kaiming_uniform")
+ weight_attr = paddle.ParamAttr(initializer=KaimingUniform())
+ if bias_attr is None:
+ if global_init_type == "kaiming_uniform":
+ bias_attr = paddle.ParamAttr(initializer=KaimingUniform())
+ super(Conv1D, self).__init__(
+ in_channels, out_channels, kernel_size, stride, padding, dilation,
+ groups, padding_mode, weight_attr, bias_attr, data_format)
+
+
+class Conv2D(nn.Conv2D):
+ def __init__(self,
+ in_channels,
+ out_channels,
+ kernel_size,
+ stride=1,
+ padding=0,
+ dilation=1,
+ groups=1,
+ padding_mode='zeros',
+ weight_attr=None,
+ bias_attr=None,
+ data_format='NCHW'):
+ if weight_attr is None:
+ if global_init_type == "kaiming_uniform":
+ weight_attr = paddle.ParamAttr(initializer=KaimingUniform())
+ if bias_attr is None:
+ if global_init_type == "kaiming_uniform":
+ bias_attr = paddle.ParamAttr(initializer=KaimingUniform())
+ super(Conv2D, self).__init__(
+ in_channels, out_channels, kernel_size, stride, padding, dilation,
+ groups, padding_mode, weight_attr, bias_attr, data_format)
diff --git a/paddlespeech/s2t/modules/attention.py b/paddlespeech/s2t/modules/attention.py
index 3d5f8cd1d3aaff3841a8b519bb7b3af178c700ef..438efd2a14151904cb75ff6c72f7be01663bff09 100644
--- a/paddlespeech/s2t/modules/attention.py
+++ b/paddlespeech/s2t/modules/attention.py
@@ -22,6 +22,7 @@ import paddle
from paddle import nn
from paddle.nn import initializer as I
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -48,10 +49,10 @@ class MultiHeadedAttention(nn.Layer):
# We assume d_v always equals d_k
self.d_k = n_feat // n_head
self.h = n_head
- self.linear_q = nn.Linear(n_feat, n_feat)
- self.linear_k = nn.Linear(n_feat, n_feat)
- self.linear_v = nn.Linear(n_feat, n_feat)
- self.linear_out = nn.Linear(n_feat, n_feat)
+ self.linear_q = Linear(n_feat, n_feat)
+ self.linear_k = Linear(n_feat, n_feat)
+ self.linear_v = Linear(n_feat, n_feat)
+ self.linear_out = Linear(n_feat, n_feat)
self.dropout = nn.Dropout(p=dropout_rate)
def forward_qkv(self,
@@ -95,7 +96,7 @@ class MultiHeadedAttention(nn.Layer):
mask (paddle.Tensor): Mask, size (#batch, 1, time2) or
(#batch, time1, time2).
Returns:
- paddle.Tensor: Transformed value weighted
+ paddle.Tensor: Transformed value weighted
by the attention score, (#batch, time1, d_model).
"""
n_batch = value.shape[0]
@@ -150,7 +151,7 @@ class RelPositionMultiHeadedAttention(MultiHeadedAttention):
"""
super().__init__(n_head, n_feat, dropout_rate)
# linear transformation for positional encoding
- self.linear_pos = nn.Linear(n_feat, n_feat, bias_attr=False)
+ self.linear_pos = Linear(n_feat, n_feat, bias_attr=False)
# these two learnable bias are used in matrix c and matrix d
# as described in https://arxiv.org/abs/1901.02860 Section 3.3
#self.pos_bias_u = nn.Parameter(torch.Tensor(self.h, self.d_k))
diff --git a/paddlespeech/s2t/modules/conformer_convolution.py b/paddlespeech/s2t/modules/conformer_convolution.py
index 7ec92554eec73b8889335b3a16fd1a34692bb021..89e6526885a2679b8ab09a4e4e4423a15e51ac08 100644
--- a/paddlespeech/s2t/modules/conformer_convolution.py
+++ b/paddlespeech/s2t/modules/conformer_convolution.py
@@ -21,6 +21,9 @@ import paddle
from paddle import nn
from typeguard import check_argument_types
+from paddlespeech.s2t.modules.align import BatchNorm1D
+from paddlespeech.s2t.modules.align import Conv1D
+from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -49,7 +52,7 @@ class ConvolutionModule(nn.Layer):
"""
assert check_argument_types()
super().__init__()
- self.pointwise_conv1 = nn.Conv1D(
+ self.pointwise_conv1 = Conv1D(
channels,
2 * channels,
kernel_size=1,
@@ -60,8 +63,8 @@ class ConvolutionModule(nn.Layer):
)
# self.lorder is used to distinguish if it's a causal convolution,
- # if self.lorder > 0:
- # it's a causal convolution, the input will be padded with
+ # if self.lorder > 0:
+ # it's a causal convolution, the input will be padded with
# `self.lorder` frames on the left in forward (causal conv impl).
# else: it's a symmetrical convolution
if causal:
@@ -73,7 +76,7 @@ class ConvolutionModule(nn.Layer):
padding = (kernel_size - 1) // 2
self.lorder = 0
- self.depthwise_conv = nn.Conv1D(
+ self.depthwise_conv = Conv1D(
channels,
channels,
kernel_size,
@@ -87,12 +90,12 @@ class ConvolutionModule(nn.Layer):
assert norm in ['batch_norm', 'layer_norm']
if norm == "batch_norm":
self.use_layer_norm = False
- self.norm = nn.BatchNorm1D(channels)
+ self.norm = BatchNorm1D(channels)
else:
self.use_layer_norm = True
- self.norm = nn.LayerNorm(channels)
+ self.norm = LayerNorm(channels)
- self.pointwise_conv2 = nn.Conv1D(
+ self.pointwise_conv2 = Conv1D(
channels,
channels,
kernel_size=1,
diff --git a/paddlespeech/s2t/modules/ctc.py b/paddlespeech/s2t/modules/ctc.py
index 2094182af1a6d31068288d865654bace577b5975..33ad472defba0a86bc945582f386acb406e4c35e 100644
--- a/paddlespeech/s2t/modules/ctc.py
+++ b/paddlespeech/s2t/modules/ctc.py
@@ -18,6 +18,7 @@ from paddle import nn
from paddle.nn import functional as F
from typeguard import check_argument_types
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.loss import CTCLoss
from paddlespeech.s2t.utils import ctc_utils
from paddlespeech.s2t.utils.log import Log
@@ -69,7 +70,7 @@ class CTCDecoderBase(nn.Layer):
self.blank_id = blank_id
self.odim = odim
self.dropout = nn.Dropout(dropout_rate)
- self.ctc_lo = nn.Linear(enc_n_units, self.odim)
+ self.ctc_lo = Linear(enc_n_units, self.odim)
reduction_type = "sum" if reduction else "none"
self.criterion = CTCLoss(
blank=self.blank_id,
diff --git a/paddlespeech/s2t/modules/decoder.py b/paddlespeech/s2t/modules/decoder.py
index 6b4d959123b19cc23cd42bdcf68491ac6e5f61de..3a851ec62c35f633ce07fd0b4380d92b31d67b3b 100644
--- a/paddlespeech/s2t/modules/decoder.py
+++ b/paddlespeech/s2t/modules/decoder.py
@@ -24,6 +24,9 @@ from paddle import nn
from typeguard import check_argument_types
from paddlespeech.s2t.decoders.scorers.scorer_interface import BatchScorerInterface
+from paddlespeech.s2t.modules.align import Embedding
+from paddlespeech.s2t.modules.align import LayerNorm
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.attention import MultiHeadedAttention
from paddlespeech.s2t.modules.decoder_layer import DecoderLayer
from paddlespeech.s2t.modules.embedding import PositionalEncoding
@@ -76,21 +79,22 @@ class TransformerDecoder(BatchScorerInterface, nn.Layer):
concat_after: bool=False, ):
assert check_argument_types()
+
nn.Layer.__init__(self)
self.selfattention_layer_type = 'selfattn'
attention_dim = encoder_output_size
if input_layer == "embed":
self.embed = nn.Sequential(
- nn.Embedding(vocab_size, attention_dim),
+ Embedding(vocab_size, attention_dim),
PositionalEncoding(attention_dim, positional_dropout_rate), )
else:
raise ValueError(f"only 'embed' is supported: {input_layer}")
self.normalize_before = normalize_before
- self.after_norm = nn.LayerNorm(attention_dim, epsilon=1e-12)
+ self.after_norm = LayerNorm(attention_dim, epsilon=1e-12)
self.use_output_layer = use_output_layer
- self.output_layer = nn.Linear(attention_dim, vocab_size)
+ self.output_layer = Linear(attention_dim, vocab_size)
self.decoders = nn.LayerList([
DecoderLayer(
diff --git a/paddlespeech/s2t/modules/decoder_layer.py b/paddlespeech/s2t/modules/decoder_layer.py
index 520b18dea17928b6fe95bbda804bd89ef28aa904..b7f8694c12623ce82eb6849bcd9438483f513502 100644
--- a/paddlespeech/s2t/modules/decoder_layer.py
+++ b/paddlespeech/s2t/modules/decoder_layer.py
@@ -20,6 +20,8 @@ from typing import Tuple
import paddle
from paddle import nn
+from paddlespeech.s2t.modules.align import LayerNorm
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -62,14 +64,14 @@ class DecoderLayer(nn.Layer):
self.self_attn = self_attn
self.src_attn = src_attn
self.feed_forward = feed_forward
- self.norm1 = nn.LayerNorm(size, epsilon=1e-12)
- self.norm2 = nn.LayerNorm(size, epsilon=1e-12)
- self.norm3 = nn.LayerNorm(size, epsilon=1e-12)
+ self.norm1 = LayerNorm(size, epsilon=1e-12)
+ self.norm2 = LayerNorm(size, epsilon=1e-12)
+ self.norm3 = LayerNorm(size, epsilon=1e-12)
self.dropout = nn.Dropout(dropout_rate)
self.normalize_before = normalize_before
self.concat_after = concat_after
- self.concat_linear1 = nn.Linear(size + size, size)
- self.concat_linear2 = nn.Linear(size + size, size)
+ self.concat_linear1 = Linear(size + size, size)
+ self.concat_linear2 = Linear(size + size, size)
def forward(
self,
diff --git a/paddlespeech/s2t/modules/encoder.py b/paddlespeech/s2t/modules/encoder.py
index 5c8ba0810d00db66a3c96238cf5d243802eb9d7b..c843c0e207054b20a5d3850334198ef6bcb6888c 100644
--- a/paddlespeech/s2t/modules/encoder.py
+++ b/paddlespeech/s2t/modules/encoder.py
@@ -23,6 +23,7 @@ from paddle import nn
from typeguard import check_argument_types
from paddlespeech.s2t.modules.activation import get_activation
+from paddlespeech.s2t.modules.align import LayerNorm
from paddlespeech.s2t.modules.attention import MultiHeadedAttention
from paddlespeech.s2t.modules.attention import RelPositionMultiHeadedAttention
from paddlespeech.s2t.modules.conformer_convolution import ConvolutionModule
@@ -129,7 +130,7 @@ class BaseEncoder(nn.Layer):
d_model=output_size, dropout_rate=positional_dropout_rate), )
self.normalize_before = normalize_before
- self.after_norm = nn.LayerNorm(output_size, epsilon=1e-12)
+ self.after_norm = LayerNorm(output_size, epsilon=1e-12)
self.static_chunk_size = static_chunk_size
self.use_dynamic_chunk = use_dynamic_chunk
self.use_dynamic_left_chunk = use_dynamic_left_chunk
@@ -457,6 +458,7 @@ class ConformerEncoder(BaseEncoder):
cnn_module_norm (str): cnn conv norm type, Optional['batch_norm','layer_norm']
"""
assert check_argument_types()
+
super().__init__(input_size, output_size, attention_heads, linear_units,
num_blocks, dropout_rate, positional_dropout_rate,
attention_dropout_rate, input_layer,
diff --git a/paddlespeech/s2t/modules/encoder_layer.py b/paddlespeech/s2t/modules/encoder_layer.py
index d39c0695a044cd9cdc5969b547be911565015672..e80a298d621ac87db8ad9f76e48041f05ec18f64 100644
--- a/paddlespeech/s2t/modules/encoder_layer.py
+++ b/paddlespeech/s2t/modules/encoder_layer.py
@@ -20,6 +20,8 @@ from typing import Tuple
import paddle
from paddle import nn
+from paddlespeech.s2t.modules.align import LayerNorm
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -39,7 +41,7 @@ class TransformerEncoderLayer(nn.Layer):
normalize_before: bool=True,
concat_after: bool=False, ):
"""Construct an EncoderLayer object.
-
+
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
@@ -59,15 +61,15 @@ class TransformerEncoderLayer(nn.Layer):
super().__init__()
self.self_attn = self_attn
self.feed_forward = feed_forward
- self.norm1 = nn.LayerNorm(size, epsilon=1e-12)
- self.norm2 = nn.LayerNorm(size, epsilon=1e-12)
+ self.norm1 = LayerNorm(size, epsilon=1e-12)
+ self.norm2 = LayerNorm(size, epsilon=1e-12)
self.dropout = nn.Dropout(dropout_rate)
self.size = size
self.normalize_before = normalize_before
self.concat_after = concat_after
# concat_linear may be not used in forward fuction,
# but will be saved in the *.pt
- self.concat_linear = nn.Linear(size + size, size)
+ self.concat_linear = Linear(size + size, size)
def forward(
self,
@@ -147,7 +149,7 @@ class ConformerEncoderLayer(nn.Layer):
normalize_before: bool=True,
concat_after: bool=False, ):
"""Construct an EncoderLayer object.
-
+
Args:
size (int): Input dimension.
self_attn (nn.Layer): Self-attention module instance.
@@ -174,23 +176,23 @@ class ConformerEncoderLayer(nn.Layer):
self.feed_forward = feed_forward
self.feed_forward_macaron = feed_forward_macaron
self.conv_module = conv_module
- self.norm_ff = nn.LayerNorm(size, epsilon=1e-12) # for the FNN module
- self.norm_mha = nn.LayerNorm(size, epsilon=1e-12) # for the MHA module
+ self.norm_ff = LayerNorm(size, epsilon=1e-12) # for the FNN module
+ self.norm_mha = LayerNorm(size, epsilon=1e-12) # for the MHA module
if feed_forward_macaron is not None:
- self.norm_ff_macaron = nn.LayerNorm(size, epsilon=1e-12)
+ self.norm_ff_macaron = LayerNorm(size, epsilon=1e-12)
self.ff_scale = 0.5
else:
self.ff_scale = 1.0
if self.conv_module is not None:
- self.norm_conv = nn.LayerNorm(
+ self.norm_conv = LayerNorm(
size, epsilon=1e-12) # for the CNN module
- self.norm_final = nn.LayerNorm(
+ self.norm_final = LayerNorm(
size, epsilon=1e-12) # for the final output of the block
self.dropout = nn.Dropout(dropout_rate)
self.size = size
self.normalize_before = normalize_before
self.concat_after = concat_after
- self.concat_linear = nn.Linear(size + size, size)
+ self.concat_linear = Linear(size + size, size)
def forward(
self,
diff --git a/paddlespeech/s2t/modules/initializer.py b/paddlespeech/s2t/modules/initializer.py
new file mode 100644
index 0000000000000000000000000000000000000000..30a04e44fb2965d03be8c6346ef16448ed257bbc
--- /dev/null
+++ b/paddlespeech/s2t/modules/initializer.py
@@ -0,0 +1,172 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+from paddle.fluid import framework
+from paddle.fluid import unique_name
+from paddle.fluid.core import VarDesc
+from paddle.fluid.initializer import MSRAInitializer
+
+__all__ = ['KaimingUniform']
+
+
+class KaimingUniform(MSRAInitializer):
+ r"""Implements the Kaiming Uniform initializer
+
+ This class implements the weight initialization from the paper
+ `Delving Deep into Rectifiers: Surpassing Human-Level Performance on
+ ImageNet Classification <https://arxiv.org/abs/1502.01852>`_
+ by Kaiming He, Xiangyu Zhang, Shaoqing Ren and Jian Sun. This is a
+ robust initialization method that particularly considers the rectifier
+ nonlinearities.
+
+ In case of Uniform distribution, the range is [-x, x], where
+
+ .. math::
+
+ x = \sqrt{\frac{1.0}{fan\_in}}
+
+ In case of Normal distribution, the mean is 0 and the standard deviation
+ is
+
+ .. math::
+
+ \sqrt{\frac{2.0}{fan\_in}}
+
+ Args:
+ fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\
+ inferred from the variable. default is None.
+
+ Note:
+ It is recommended to set fan_in to None for most cases.
+
+ Examples:
+ .. code-block:: python
+
+ import paddle
+ import paddle.nn as nn
+
+ linear = nn.Linear(2,
+ 4,
+ weight_attr=nn.initializer.KaimingUniform())
+ data = paddle.rand([30, 10, 2], dtype='float32')
+ res = linear(data)
+
+ """
+
+ def __init__(self, fan_in=None):
+ super(KaimingUniform, self).__init__(
+ uniform=True, fan_in=fan_in, seed=0)
+
+ def __call__(self, var, block=None):
+ """Initialize the input tensor with MSRA initialization.
+
+ Args:
+ var(Tensor): Tensor that needs to be initialized.
+ block(Block, optional): The block in which initialization ops
+ should be added. Used in static graph only, default None.
+
+ Returns:
+ The initialization op
+ """
+ block = self._check_block(block)
+
+ assert isinstance(var, framework.Variable)
+ assert isinstance(block, framework.Block)
+ f_in, f_out = self._compute_fans(var)
+
+ # If fan_in is passed, use it
+ fan_in = f_in if self._fan_in is None else self._fan_in
+
+ if self._seed == 0:
+ self._seed = block.program.random_seed
+
+ # to be compatible with fp16 initializers
+ if var.dtype == VarDesc.VarType.FP16 or (
+ var.dtype == VarDesc.VarType.BF16 and not self._uniform):
+ out_dtype = VarDesc.VarType.FP32
+ out_var = block.create_var(
+ name=unique_name.generate(
+ ".".join(['masra_init', var.name, 'tmp'])),
+ shape=var.shape,
+ dtype=out_dtype,
+ type=VarDesc.VarType.LOD_TENSOR,
+ persistable=False)
+ else:
+ out_dtype = var.dtype
+ out_var = var
+
+ if self._uniform:
+ limit = np.sqrt(1.0 / float(fan_in))
+ op = block.append_op(
+ type="uniform_random",
+ inputs={},
+ outputs={"Out": out_var},
+ attrs={
+ "shape": out_var.shape,
+ "dtype": int(out_dtype),
+ "min": -limit,
+ "max": limit,
+ "seed": self._seed
+ },
+ stop_gradient=True)
+
+ else:
+ std = np.sqrt(2.0 / float(fan_in))
+ op = block.append_op(
+ type="gaussian_random",
+ outputs={"Out": out_var},
+ attrs={
+ "shape": out_var.shape,
+ "dtype": int(out_dtype),
+ "mean": 0.0,
+ "std": std,
+ "seed": self._seed
+ },
+ stop_gradient=True)
+
+ if var.dtype == VarDesc.VarType.FP16 or (
+ var.dtype == VarDesc.VarType.BF16 and not self._uniform):
+ block.append_op(
+ type="cast",
+ inputs={"X": out_var},
+ outputs={"Out": var},
+ attrs={"in_dtype": out_var.dtype,
+ "out_dtype": var.dtype})
+
+ if not framework.in_dygraph_mode():
+ var.op = op
+ return op
+
+
+class DefaultInitializerContext(object):
+ """
+ Example:
+ with DefaultInitializerContext("kaiming_uniform"):
+ code for setup_model
+ """
+
+ def __init__(self, init_type=None):
+ self.init_type = init_type
+
+ def __enter__(self):
+ if self.init_type is None:
+ return
+ else:
+ from paddlespeech.s2t.modules import align
+ align.global_init_type = self.init_type
+ return
+
+ def __exit__(self, exc_type, exc_val, exc_tb):
+ from paddlespeech.s2t.modules import align
+ align.global_init_type = None
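
A minimal sketch of the context manager in isolation: inside the `with` block, the wrapper layers in `paddlespeech.s2t.modules.align` pick up `KaimingUniform` defaults (to match the torch behaviour); outside it, Paddle's global initializer applies.

```python
from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.initializer import DefaultInitializerContext

with DefaultInitializerContext("kaiming_uniform"):
    proj = Linear(256, 256)   # weights and bias initialized with KaimingUniform
plain = Linear(256, 256)      # built outside the context: framework default init
```
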
diff --git a/paddlespeech/s2t/modules/positionwise_feed_forward.py b/paddlespeech/s2t/modules/positionwise_feed_forward.py
index e2619cd49dc15ef7d9ddb1fbbb991f3fe3eb1c35..c2725dc5cc4aac28d04e44333e185082d7300d44 100644
--- a/paddlespeech/s2t/modules/positionwise_feed_forward.py
+++ b/paddlespeech/s2t/modules/positionwise_feed_forward.py
@@ -17,6 +17,7 @@
import paddle
from paddle import nn
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
@@ -44,10 +45,10 @@ class PositionwiseFeedForward(nn.Layer):
activation (paddle.nn.Layer): Activation function
"""
super().__init__()
- self.w_1 = nn.Linear(idim, hidden_units)
+ self.w_1 = Linear(idim, hidden_units)
self.activation = activation
self.dropout = nn.Dropout(dropout_rate)
- self.w_2 = nn.Linear(hidden_units, idim)
+ self.w_2 = Linear(hidden_units, idim)
def forward(self, xs: paddle.Tensor) -> paddle.Tensor:
"""Forward function.
diff --git a/paddlespeech/s2t/modules/subsampling.py b/paddlespeech/s2t/modules/subsampling.py
index 99a8300f246149e924fe741f53934259d404e4e8..88451ddd77f6f89f8597238ddb1236acaa1945d7 100644
--- a/paddlespeech/s2t/modules/subsampling.py
+++ b/paddlespeech/s2t/modules/subsampling.py
@@ -19,6 +19,9 @@ from typing import Tuple
import paddle
from paddle import nn
+from paddlespeech.s2t.modules.align import Conv2D
+from paddlespeech.s2t.modules.align import LayerNorm
+from paddlespeech.s2t.modules.align import Linear
from paddlespeech.s2t.modules.embedding import PositionalEncoding
from paddlespeech.s2t.utils.log import Log
@@ -60,8 +63,8 @@ class LinearNoSubsampling(BaseSubsampling):
"""
super().__init__(pos_enc_class)
self.out = nn.Sequential(
- nn.Linear(idim, odim),
- nn.LayerNorm(odim, epsilon=1e-12),
+ Linear(idim, odim),
+ LayerNorm(odim, epsilon=1e-12),
nn.Dropout(dropout_rate),
nn.ReLU(), )
self.right_context = 0
@@ -108,12 +111,12 @@ class Conv2dSubsampling4(Conv2dSubsampling):
"""
super().__init__(pos_enc_class)
self.conv = nn.Sequential(
- nn.Conv2D(1, odim, 3, 2),
+ Conv2D(1, odim, 3, 2),
nn.ReLU(),
- nn.Conv2D(odim, odim, 3, 2),
+ Conv2D(odim, odim, 3, 2),
nn.ReLU(), )
self.out = nn.Sequential(
- nn.Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
+ Linear(odim * (((idim - 1) // 2 - 1) // 2), odim))
self.subsampling_rate = 4
# The right context for every conv layer is computed by:
# (kernel_size - 1) * frame_rate_of_this_layer
@@ -160,13 +163,13 @@ class Conv2dSubsampling6(Conv2dSubsampling):
"""
super().__init__(pos_enc_class)
self.conv = nn.Sequential(
- nn.Conv2D(1, odim, 3, 2),
+ Conv2D(1, odim, 3, 2),
nn.ReLU(),
- nn.Conv2D(odim, odim, 5, 3),
+ Conv2D(odim, odim, 5, 3),
nn.ReLU(), )
# O = (I - F + Pstart + Pend) // S + 1
# when Padding == 0, O = (I - F - S) // S
- self.linear = nn.Linear(odim * (((idim - 1) // 2 - 2) // 3), odim)
+ self.linear = Linear(odim * (((idim - 1) // 2 - 2) // 3), odim)
# The right context for every conv layer is computed by:
# (kernel_size - 1) * frame_rate_of_this_layer
# 10 = (3 - 1) * 1 + (5 - 1) * 2
@@ -212,14 +215,14 @@ class Conv2dSubsampling8(Conv2dSubsampling):
"""
super().__init__(pos_enc_class)
self.conv = nn.Sequential(
- nn.Conv2D(1, odim, 3, 2),
+ Conv2D(1, odim, 3, 2),
nn.ReLU(),
- nn.Conv2D(odim, odim, 3, 2),
+ Conv2D(odim, odim, 3, 2),
nn.ReLU(),
- nn.Conv2D(odim, odim, 3, 2),
+ Conv2D(odim, odim, 3, 2),
nn.ReLU(), )
- self.linear = nn.Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
- odim)
+ self.linear = Linear(odim * ((((idim - 1) // 2 - 1) // 2 - 1) // 2),
+ odim)
self.subsampling_rate = 8
# The right context for every conv layer is computed by:
# (kernel_size - 1) * frame_rate_of_this_layer
diff --git a/paddlespeech/s2t/transform/spectrogram.py b/paddlespeech/s2t/transform/spectrogram.py
index 889cd349d8fe1b5a67c7a5e65d1435ea48eaa35e..4a65548fe141bb7e23b1b04fa990d998891d922d 100644
--- a/paddlespeech/s2t/transform/spectrogram.py
+++ b/paddlespeech/s2t/transform/spectrogram.py
@@ -14,8 +14,11 @@
# Modified from espnet(https://github.com/espnet/espnet)
import librosa
import numpy as np
+import paddle
from python_speech_features import logfbank
+import paddleaudio.compliance.kaldi as kaldi
+
def stft(x,
n_fft,
@@ -309,6 +312,77 @@ class IStft():
class LogMelSpectrogramKaldi():
+ def __init__(
+ self,
+ fs=16000,
+ n_mels=80,
+ n_shift=160, # unit:sample, 10ms
+ win_length=400, # unit:sample, 25ms
+ energy_floor=0.0,
+ dither=0.1):
+ """
+ The Kaldi implementation of LogMelSpectrogram
+ Args:
+ fs (int): sample rate of the audio
+ n_mels (int): number of mel filter banks
+ n_shift (int): number of points in a frame shift
+ win_length (int): number of points in a frame windows
+ energy_floor (float): Floor on energy in Spectrogram computation (absolute)
+ dither (float): Dithering constant
+
+ Returns:
+ LogMelSpectrogramKaldi
+ """
+
+ self.fs = fs
+ self.n_mels = n_mels
+ num_point_ms = fs / 1000
+ self.n_frame_length = win_length / num_point_ms
+ self.n_frame_shift = n_shift / num_point_ms
+ self.energy_floor = energy_floor
+ self.dither = dither
+
+ def __repr__(self):
+ return (
+ "{name}(fs={fs}, n_mels={n_mels}, "
+ "n_frame_shift={n_frame_shift}, n_frame_length={n_frame_length}, "
+ "dither={dither})".format(
+ name=self.__class__.__name__,
+ fs=self.fs,
+ n_mels=self.n_mels,
+ n_frame_shift=self.n_frame_shift,
+ n_frame_length=self.n_frame_length,
+ dither=self.dither, ))
+
+ def __call__(self, x, train):
+ """
+ Args:
+ x (np.ndarray): shape (Ti,)
+ train (bool): True, train mode.
+
+ Raises:
+ ValueError: not support (Ti, C)
+
+ Returns:
+ np.ndarray: (T, D)
+ """
+ dither = self.dither if train else 0.0
+ if x.ndim != 1:
+ raise ValueError("Input of shape [Time, Channel] is not supported")
+ waveform = paddle.to_tensor(np.expand_dims(x, 0), dtype=paddle.float32)
+ mat = kaldi.fbank(
+ waveform,
+ n_mels=self.n_mels,
+ frame_length=self.n_frame_length,
+ frame_shift=self.n_frame_shift,
+ dither=dither,
+ energy_floor=self.energy_floor,
+ sr=self.fs)
+ mat = np.squeeze(mat.numpy())
+ return mat
+
+
+class LogMelSpectrogramKaldi_decay():
def __init__(
self,
fs=16000,
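
A quick sketch of the rewritten extractor, which now delegates to `paddleaudio.compliance.kaldi.fbank`: input is a 1-D waveform, output a (T, n_mels) numpy array, and dithering is applied only when `train=True`. The random waveform is a placeholder.

```python
import numpy as np
from paddlespeech.s2t.transform.spectrogram import LogMelSpectrogramKaldi

extractor = LogMelSpectrogramKaldi(fs=16000, n_mels=80, n_shift=160, win_length=400)
waveform = np.random.uniform(-1.0, 1.0, 16000).astype("float32")  # 1 s placeholder audio
feat = extractor(waveform, train=False)   # no dithering at inference time
print(feat.shape)                         # (num_frames, 80)
```
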
diff --git a/paddlespeech/s2t/transform/transformation.py b/paddlespeech/s2t/transform/transformation.py
index 381b0cdc9d92c9d583bf357935dcf8ac9759c9aa..3b433cb0bc50c7c3e3cbf847f2906d0f6b554d99 100644
--- a/paddlespeech/s2t/transform/transformation.py
+++ b/paddlespeech/s2t/transform/transformation.py
@@ -31,6 +31,7 @@ import_alias = dict(
freq_mask="paddlespeech.s2t.transform.spec_augment:FreqMask",
spec_augment="paddlespeech.s2t.transform.spec_augment:SpecAugment",
speed_perturbation="paddlespeech.s2t.transform.perturb:SpeedPerturbation",
+ speed_perturbation_sox="paddlespeech.s2t.transform.perturb:SpeedPerturbationSox",
volume_perturbation="paddlespeech.s2t.transform.perturb:VolumePerturbation",
noise_injection="paddlespeech.s2t.transform.perturb:NoiseInjection",
bandpass_perturbation="paddlespeech.s2t.transform.perturb:BandpassPerturbation",
diff --git a/paddlespeech/s2t/utils/bleu_score.py b/paddlespeech/s2t/utils/bleu_score.py
index a50c000ae914027611af06e1b6d11e0401b13149..d7eb9c7c6878d33a1a52404d2aeb9407e09859b1 100644
--- a/paddlespeech/s2t/utils/bleu_score.py
+++ b/paddlespeech/s2t/utils/bleu_score.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
"""This module provides functions to calculate bleu score in different level.
e.g. wer for word-level, cer for char-level.
"""
diff --git a/paddlespeech/s2t/utils/cli_utils.py b/paddlespeech/s2t/utils/cli_utils.py
index 4aee3f4398095cd4aafa33447093db5362b36cf5..ccb0d3c97dde3a9bb3b17bdf5ee0b29adee652ba 100644
--- a/paddlespeech/s2t/utils/cli_utils.py
+++ b/paddlespeech/s2t/utils/cli_utils.py
@@ -14,9 +14,9 @@
# Modified from espnet(https://github.com/espnet/espnet)
import sys
from collections.abc import Sequence
-from distutils.util import strtobool as dist_strtobool
import numpy
+from distutils.util import strtobool as dist_strtobool
def strtobool(x):
diff --git a/paddlespeech/s2t/utils/text_grid.py b/paddlespeech/s2t/utils/text_grid.py
index 3af58c9ba60ff873d0e8134c996d1ffc99b4279a..cbd9856e40d72897cd08d3618178e60f7a34ea0f 100644
--- a/paddlespeech/s2t/utils/text_grid.py
+++ b/paddlespeech/s2t/utils/text_grid.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# Modified from wenet(https://github.com/wenet-e2e/wenet)
from typing import Dict
from typing import List
from typing import Text
diff --git a/paddlespeech/s2t/utils/utility.py b/paddlespeech/s2t/utils/utility.py
index dc1be815923d234d13dfabc3ada0e7e3296202cd..fdd8c029232b582f46adfe8d06cae817d89d283d 100644
--- a/paddlespeech/s2t/utils/utility.py
+++ b/paddlespeech/s2t/utils/utility.py
@@ -12,7 +12,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Contains common utility functions."""
-import distutils.util
import math
import os
import random
@@ -21,6 +20,7 @@ from contextlib import contextmanager
from pprint import pformat
from typing import List
+import distutils.util
import numpy as np
import paddle
import soundfile
diff --git a/paddlespeech/server/README.md b/paddlespeech/server/README.md
index 4ce9605d62a0c411840f9f861a5f251b146110ab..819fe440d220c1f4b06b2557978c9205ede804e0 100644
--- a/paddlespeech/server/README.md
+++ b/paddlespeech/server/README.md
@@ -10,7 +10,7 @@
paddlespeech_server help
```
### Start the server
- First set the service-related configuration parameters, similar to `./conf/application.yaml`,
+ First set the service-related configuration parameters, similar to `./conf/application.yaml`. Set `engine_list`, which specifies the speech tasks included in the service to be started.
Then start the service:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
@@ -23,7 +23,7 @@
```
### Access speech recognition services
```
- paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input ./tests/16_audio.wav
+ paddlespeech_client asr --server_ip 127.0.0.1 --port 8090 --input input_16k.wav
```
### Access text to speech services
@@ -31,3 +31,7 @@
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
+ ### Access audio classification services
+ ```bash
+ paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav
+ ```
diff --git a/paddlespeech/server/README_cn.md b/paddlespeech/server/README_cn.md
index 2dfd9474ba6490dedbb8d984c5ba9810506fa415..c0a4a7336700c642efc2172dfa14416dff0ef5ec 100644
--- a/paddlespeech/server/README_cn.md
+++ b/paddlespeech/server/README_cn.md
@@ -10,7 +10,7 @@
paddlespeech_server help
```
### 启动服务
- 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,同时设置服务配置中的语音任务模型相关配置,类似于 `./conf/tts/tts.yaml`。
+ 首先设置服务相关配置文件,类似于 `./conf/application.yaml`,设置 `engine_list`,该值表示即将启动的服务中包含的语音任务。
然后启动服务:
```bash
paddlespeech_server start --config_file ./conf/application.yaml
@@ -30,3 +30,8 @@
```bash
paddlespeech_client tts --server_ip 127.0.0.1 --port 8090 --input "你好,欢迎使用百度飞桨深度学习框架!" --output output.wav
```
+
+ ### 访问音频分类服务
+ ```bash
+ paddlespeech_client cls --server_ip 127.0.0.1 --port 8090 --input input.wav
+ ```
diff --git a/paddlespeech/server/bin/main.py b/paddlespeech/server/bin/main.py
index de52829930262f5b89a5c85a270bb0f33ec8dea2..81824c85c46687ff12d2ffd366743eaf237dbd9a 100644
--- a/paddlespeech/server/bin/main.py
+++ b/paddlespeech/server/bin/main.py
@@ -17,8 +17,9 @@ import uvicorn
from fastapi import FastAPI
from paddlespeech.server.engine.engine_pool import init_engine_pool
-from paddlespeech.server.restful.api import setup_router
+from paddlespeech.server.restful.api import setup_router as setup_http_router
from paddlespeech.server.utils.config import get_config
+from paddlespeech.server.ws.api import setup_router as setup_ws_router
app = FastAPI(
title="PaddleSpeech Serving API", description="Api", version="0.0.1")
@@ -35,7 +36,12 @@ def init(config):
"""
# init api
api_list = list(engine.split("_")[0] for engine in config.engine_list)
- api_router = setup_router(api_list)
+ if config.protocol == "websocket":
+ api_router = setup_ws_router(api_list)
+ elif config.protocol == "http":
+ api_router = setup_http_router(api_list)
+ else:
+ raise Exception("unsupported protocol")
app.include_router(api_router)
if not init_engine_pool(config):
diff --git a/paddlespeech/server/bin/paddlespeech_client.py b/paddlespeech/server/bin/paddlespeech_client.py
index 40f17c63c8e7177205a2dfa87bce89ddb7ac6201..413f00872327b1ef364146d12b8cd8540eec421f 100644
--- a/paddlespeech/server/bin/paddlespeech_client.py
+++ b/paddlespeech/server/bin/paddlespeech_client.py
@@ -150,7 +150,7 @@ class TTSClientExecutor(BaseExecutor):
res = requests.post(url, json.dumps(request))
response_dict = res.json()
- if not output:
+ if output is not None:
self.postprocess(response_dict["result"]["audio"], output)
return res
diff --git a/paddlespeech/server/conf/application.yaml b/paddlespeech/server/conf/application.yaml
index 2b1a05998083e08377d63ee02bc77323a7c4dce5..849349c2df371a58f754d1fa881ba524ac7df5d7 100644
--- a/paddlespeech/server/conf/application.yaml
+++ b/paddlespeech/server/conf/application.yaml
@@ -8,7 +8,9 @@ port: 8090
# The task format in the engine_list is: <speech task>_<engine type>
# task choices = ['asr_python', 'asr_inference', 'tts_python', 'tts_inference']
-
+# protocol = ['websocket', 'http'] (only one can be selected).
+# http only supports offline engine types.
+protocol: 'http'
engine_list: ['asr_python', 'tts_python', 'cls_python']
@@ -48,6 +50,24 @@ asr_inference:
summary: True # False -> do not show predictor config
+################### speech task: asr; engine_type: online #######################
+asr_online:
+ model_type: 'deepspeech2online_aishell'
+ am_model: # the pdmodel file of am static model [optional]
+ am_params: # the pdiparams file of am static model [optional]
+ lang: 'zh'
+ sample_rate: 16000
+ cfg_path:
+ decode_method:
+ force_yes: True
+
+ am_predictor_conf:
+ device: # set 'gpu:id' or 'cpu'
+ switch_ir_optim: True
+ glog_info: False # True -> print glog
+ summary: True # False -> do not show predictor config
+
+
################################### TTS #########################################
################### speech task: tts; engine_type: python #######################
tts_python:
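
For reference, each `engine_list` entry follows the `<speech task>_<engine type>` pattern, and `main.py` above derives the API list from the part before the underscore. A tiny sketch of that parsing:

```python
# Sketch of how main.py maps engine_list entries to API names.
engine_list = ['asr_python', 'tts_python', 'cls_python']
api_list = [engine.split("_")[0] for engine in engine_list]
print(api_list)  # ['asr', 'tts', 'cls']
```
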
diff --git a/paddlespeech/server/conf/ws_application.yaml b/paddlespeech/server/conf/ws_application.yaml
new file mode 100644
index 0000000000000000000000000000000000000000..ef23593ed896ebfc11d2bf3645f4e515b1e90df4
--- /dev/null
+++ b/paddlespeech/server/conf/ws_application.yaml
@@ -0,0 +1,51 @@
+# This is the parameter configuration file for PaddleSpeech Serving.
+
+#################################################################################
+# SERVER SETTING #
+#################################################################################
+host: 0.0.0.0
+port: 8091
+
+# The task format in the engine_list is: <speech task>_<engine type>
+# task choices = ['asr_online', 'tts_online']
+# protocol = ['websocket', 'http'] (only one can be selected).
+# websocket only supports online engine types.
+protocol: 'websocket'
+engine_list: ['asr_online']
+
+
+#################################################################################
+# ENGINE CONFIG #
+#################################################################################
+
+################################### ASR #########################################
+################### speech task: asr; engine_type: online #######################
+asr_online:
+ model_type: 'deepspeech2online_aishell'
+ am_model: # the pdmodel file of am static model [optional]
+ am_params: # the pdiparams file of am static model [optional]
+ lang: 'zh'
+ sample_rate: 16000
+ cfg_path:
+ decode_method:
+ force_yes: True
+
+ am_predictor_conf:
+ device: # set 'gpu:id' or 'cpu'
+ switch_ir_optim: True
+ glog_info: False # True -> print glog
+ summary: True # False -> do not show predictor config
+
+ chunk_buffer_conf:
+ frame_duration_ms: 80
+ shift_ms: 40
+ sample_rate: 16000
+ sample_width: 2
+
+ vad_conf:
+ aggressiveness: 2
+ sample_rate: 16000
+ frame_duration_ms: 20
+ sample_width: 2
+ padding_ms: 200
+ padding_ratio: 0.9
diff --git a/paddlespeech/server/engine/asr/online/__init__.py b/paddlespeech/server/engine/asr/online/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47
--- /dev/null
+++ b/paddlespeech/server/engine/asr/online/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/server/engine/asr/online/asr_engine.py b/paddlespeech/server/engine/asr/online/asr_engine.py
new file mode 100644
index 0000000000000000000000000000000000000000..389175a0a0c257903f8b4c296842923bf1b73cf7
--- /dev/null
+++ b/paddlespeech/server/engine/asr/online/asr_engine.py
@@ -0,0 +1,352 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from typing import Optional
+
+import numpy as np
+import paddle
+from numpy import float32
+from yacs.config import CfgNode
+
+from paddlespeech.cli.asr.infer import ASRExecutor
+from paddlespeech.cli.log import logger
+from paddlespeech.cli.utils import MODEL_HOME
+from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
+from paddlespeech.s2t.frontend.speech import SpeechSegment
+from paddlespeech.s2t.modules.ctc import CTCDecoder
+from paddlespeech.s2t.utils.utility import UpdateConfig
+from paddlespeech.server.engine.base_engine import BaseEngine
+from paddlespeech.server.utils.paddle_predictor import init_predictor
+
+__all__ = ['ASREngine']
+
+pretrained_models = {
+ "deepspeech2online_aishell-zh-16k": {
+ 'url':
+ 'https://paddlespeech.bj.bcebos.com/s2t/aishell/asr0/asr0_deepspeech2_online_aishell_ckpt_0.1.1.model.tar.gz',
+ 'md5':
+ 'd5e076217cf60486519f72c217d21b9b',
+ 'cfg_path':
+ 'model.yaml',
+ 'ckpt_path':
+ 'exp/deepspeech2_online/checkpoints/avg_1',
+ 'model':
+ 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdmodel',
+ 'params':
+ 'exp/deepspeech2_online/checkpoints/avg_1.jit.pdiparams',
+ 'lm_url':
+ 'https://deepspeech.bj.bcebos.com/zh_lm/zh_giga.no_cna_cmn.prune01244.klm',
+ 'lm_md5':
+ '29e02312deb2e59b3c8686c7966d4fe3'
+ },
+}
+
+
+class ASRServerExecutor(ASRExecutor):
+ def __init__(self):
+ super().__init__()
+ pass
+
+ def _init_from_path(self,
+ model_type: str='wenetspeech',
+ am_model: Optional[os.PathLike]=None,
+ am_params: Optional[os.PathLike]=None,
+ lang: str='zh',
+ sample_rate: int=16000,
+ cfg_path: Optional[os.PathLike]=None,
+ decode_method: str='attention_rescoring',
+ am_predictor_conf: dict=None):
+ """
+ Init model and other resources from a specific path.
+ """
+
+ if cfg_path is None or am_model is None or am_params is None:
+ sample_rate_str = '16k' if sample_rate == 16000 else '8k'
+ tag = model_type + '-' + lang + '-' + sample_rate_str
+ res_path = self._get_pretrained_path(tag) # wenetspeech_zh
+ self.res_path = res_path
+ self.cfg_path = os.path.join(res_path,
+ pretrained_models[tag]['cfg_path'])
+
+ self.am_model = os.path.join(res_path,
+ pretrained_models[tag]['model'])
+ self.am_params = os.path.join(res_path,
+ pretrained_models[tag]['params'])
+ logger.info(res_path)
+ logger.info(self.cfg_path)
+ logger.info(self.am_model)
+ logger.info(self.am_params)
+ else:
+ self.cfg_path = os.path.abspath(cfg_path)
+ self.am_model = os.path.abspath(am_model)
+ self.am_params = os.path.abspath(am_params)
+ self.res_path = os.path.dirname(
+ os.path.dirname(os.path.abspath(self.cfg_path)))
+
+ #Init body.
+ self.config = CfgNode(new_allowed=True)
+ self.config.merge_from_file(self.cfg_path)
+
+ with UpdateConfig(self.config):
+ if "deepspeech2online" in model_type or "deepspeech2offline" in model_type:
+ from paddlespeech.s2t.io.collator import SpeechCollator
+ self.vocab = self.config.vocab_filepath
+ self.config.decode.lang_model_path = os.path.join(
+ MODEL_HOME, 'language_model',
+ self.config.decode.lang_model_path)
+ self.collate_fn_test = SpeechCollator.from_config(self.config)
+ self.text_feature = TextFeaturizer(
+ unit_type=self.config.unit_type, vocab=self.vocab)
+
+ lm_url = pretrained_models[tag]['lm_url']
+ lm_md5 = pretrained_models[tag]['lm_md5']
+ self.download_lm(
+ lm_url,
+ os.path.dirname(self.config.decode.lang_model_path), lm_md5)
+ elif "conformer" in model_type or "transformer" in model_type or "wenetspeech" in model_type:
+ raise Exception("wrong type")
+ else:
+ raise Exception("wrong type")
+
+ # AM predictor
+ self.am_predictor_conf = am_predictor_conf
+ self.am_predictor = init_predictor(
+ model_file=self.am_model,
+ params_file=self.am_params,
+ predictor_conf=self.am_predictor_conf)
+
+ # decoder
+ self.decoder = CTCDecoder(
+ odim=self.config.output_dim, # is in vocab
+ enc_n_units=self.config.rnn_layer_size * 2,
+ blank_id=self.config.blank_id,
+ dropout_rate=0.0,
+ reduction=True, # sum
+ batch_average=True, # sum / batch_size
+ grad_norm_type=self.config.get('ctc_grad_norm_type', None))
+
+ # init decoder
+ cfg = self.config.decode
+ decode_batch_size = 1 # for online
+ self.decoder.init_decoder(
+ decode_batch_size, self.text_feature.vocab_list,
+ cfg.decoding_method, cfg.lang_model_path, cfg.alpha, cfg.beta,
+ cfg.beam_size, cfg.cutoff_prob, cfg.cutoff_top_n,
+ cfg.num_proc_bsearch)
+
+ # init state box
+ self.chunk_state_h_box = np.zeros(
+ (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
+ dtype=float32)
+ self.chunk_state_c_box = np.zeros(
+ (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
+ dtype=float32)
+
+ def reset_decoder_and_chunk(self):
+ """reset decoder and chunk state for an new audio
+ """
+ self.decoder.reset_decoder(batch_size=1)
+ # init state box, for new audio request
+ self.chunk_state_h_box = np.zeros(
+ (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
+ dtype=float32)
+ self.chunk_state_c_box = np.zeros(
+ (self.config.num_rnn_layers, 1, self.config.rnn_layer_size),
+ dtype=float32)
+
+ def decode_one_chunk(self, x_chunk, x_chunk_lens, model_type: str):
+ """decode one chunk
+
+ Args:
+ x_chunk (numpy.array): shape[B, T, D]
+ x_chunk_lens (numpy.array): shape[B]
+ model_type (str): online model type
+
+ Returns:
+ str: the best transcription result for this chunk
+ """
+ if "deepspeech2online" in model_type:
+ input_names = self.am_predictor.get_input_names()
+ audio_handle = self.am_predictor.get_input_handle(input_names[0])
+ audio_len_handle = self.am_predictor.get_input_handle(
+ input_names[1])
+ h_box_handle = self.am_predictor.get_input_handle(input_names[2])
+ c_box_handle = self.am_predictor.get_input_handle(input_names[3])
+
+ audio_handle.reshape(x_chunk.shape)
+ audio_handle.copy_from_cpu(x_chunk)
+
+ audio_len_handle.reshape(x_chunk_lens.shape)
+ audio_len_handle.copy_from_cpu(x_chunk_lens)
+
+ h_box_handle.reshape(self.chunk_state_h_box.shape)
+ h_box_handle.copy_from_cpu(self.chunk_state_h_box)
+
+ c_box_handle.reshape(self.chunk_state_c_box.shape)
+ c_box_handle.copy_from_cpu(self.chunk_state_c_box)
+
+ output_names = self.am_predictor.get_output_names()
+ output_handle = self.am_predictor.get_output_handle(output_names[0])
+ output_lens_handle = self.am_predictor.get_output_handle(
+ output_names[1])
+ output_state_h_handle = self.am_predictor.get_output_handle(
+ output_names[2])
+ output_state_c_handle = self.am_predictor.get_output_handle(
+ output_names[3])
+
+ self.am_predictor.run()
+
+ output_chunk_probs = output_handle.copy_to_cpu()
+ output_chunk_lens = output_lens_handle.copy_to_cpu()
+ self.chunk_state_h_box = output_state_h_handle.copy_to_cpu()
+ self.chunk_state_c_box = output_state_c_handle.copy_to_cpu()
+
+ self.decoder.next(output_chunk_probs, output_chunk_lens)
+ trans_best, trans_beam = self.decoder.decode()
+
+ return trans_best[0]
+
+ elif "conformer" in model_type or "transformer" in model_type:
+ raise Exception("invalid model name")
+ else:
+ raise Exception("invalid model name")
+
+ def _pcm16to32(self, audio):
+ """pcm int16 to float32
+
+ Args:
+ audio(numpy.array): numpy.int16
+
+ Returns:
+ audio(numpy.array): numpy.float32
+ """
+ if audio.dtype == np.int16:
+ audio = audio.astype("float32")
+ bits = np.iinfo(np.int16).bits
+ audio = audio / (2**(bits - 1))
+ return audio
+
+ def extract_feat(self, samples, sample_rate):
+ """extract feat
+
+ Args:
+ samples (numpy.array): numpy.float32
+ sample_rate (int): sample rate
+
+ Returns:
+ x_chunk (numpy.array): shape[B, T, D]
+ x_chunk_lens (numpy.array): shape[B]
+ """
+ # pcm16 -> pcm 32
+ samples = self._pcm16to32(samples)
+
+ # read audio
+ speech_segment = SpeechSegment.from_pcm(
+ samples, sample_rate, transcript=" ")
+ # audio augment
+ self.collate_fn_test.augmentation.transform_audio(speech_segment)
+
+ # extract speech feature
+ spectrum, transcript_part = self.collate_fn_test._speech_featurizer.featurize(
+ speech_segment, self.collate_fn_test.keep_transcription_text)
+ # CMVN spectrum
+ if self.collate_fn_test._normalizer:
+ spectrum = self.collate_fn_test._normalizer.apply(spectrum)
+
+ # spectrum augment
+ audio = self.collate_fn_test.augmentation.transform_feature(spectrum)
+
+ audio_len = audio.shape[0]
+ audio = paddle.to_tensor(audio, dtype='float32')
+ # audio_len = paddle.to_tensor(audio_len)
+ audio = paddle.unsqueeze(audio, axis=0)
+
+ x_chunk = audio.numpy()
+ x_chunk_lens = np.array([audio_len])
+
+ return x_chunk, x_chunk_lens
+
+
+class ASREngine(BaseEngine):
+ """ASR server engine
+
+ Args:
+ metaclass: Defaults to Singleton.
+ """
+
+ def __init__(self):
+ super(ASREngine, self).__init__()
+
+ def init(self, config: dict) -> bool:
+ """init engine resource
+
+ Args:
+ config (dict): engine configuration
+
+ Returns:
+ bool: init failed or success
+ """
+ self.input = None
+ self.output = ""
+ self.executor = ASRServerExecutor()
+ self.config = config
+
+ self.executor._init_from_path(
+ model_type=self.config.model_type,
+ am_model=self.config.am_model,
+ am_params=self.config.am_params,
+ lang=self.config.lang,
+ sample_rate=self.config.sample_rate,
+ cfg_path=self.config.cfg_path,
+ decode_method=self.config.decode_method,
+ am_predictor_conf=self.config.am_predictor_conf)
+
+ logger.info("Initialize ASR server engine successfully.")
+ return True
+
+ def preprocess(self, samples, sample_rate):
+ """preprocess
+
+ Args:
+ samples (numpy.array): numpy.float32
+ sample_rate (int): sample rate
+
+ Returns:
+ x_chunk (numpy.array): shape[B, T, D]
+ x_chunk_lens (numpy.array): shape[B]
+ """
+ x_chunk, x_chunk_lens = self.executor.extract_feat(samples, sample_rate)
+ return x_chunk, x_chunk_lens
+
+ def run(self, x_chunk, x_chunk_lens, decoder_chunk_size=1):
+ """run online engine
+
+ Args:
+ x_chunk (numpy.array): shape[B, T, D]
+ x_chunk_lens (numpy.array): shape[B]
+ decoder_chunk_size (int): decoding chunk size. Defaults to 1.
+ """
+ self.output = self.executor.decode_one_chunk(x_chunk, x_chunk_lens,
+ self.config.model_type)
+
+ def postprocess(self):
+ """postprocess
+ """
+ return self.output
+
+ def reset(self):
+ """reset engine decoder and inference state
+ """
+ self.executor.reset_decoder_and_chunk()
+ self.output = ""
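
The `_pcm16to32` helper above rescales int16 PCM into [-1, 1) float32 before feature extraction. A standalone numpy sketch of the same conversion (illustrative only; the sample values are made up):

```python
import numpy as np

# stand-in int16 PCM samples, e.g. as returned by soundfile.read(..., dtype='int16')
audio_int16 = np.array([0, 16384, -32768, 32767], dtype=np.int16)

bits = np.iinfo(np.int16).bits                               # 16
audio_f32 = audio_int16.astype("float32") / (2 ** (bits - 1))
print(audio_f32)  # [ 0.         0.5       -1.         0.99996948]
```
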
diff --git a/paddlespeech/server/engine/engine_factory.py b/paddlespeech/server/engine/engine_factory.py
index c39c44cae5fbf4a8cad0e8111bd13b42aa73cf1a..2a39fb79b9c4ece5f16ef6761f03af90ddaed79e 100644
--- a/paddlespeech/server/engine/engine_factory.py
+++ b/paddlespeech/server/engine/engine_factory.py
@@ -25,6 +25,9 @@ class EngineFactory(object):
elif engine_name == 'asr' and engine_type == 'python':
from paddlespeech.server.engine.asr.python.asr_engine import ASREngine
return ASREngine()
+ elif engine_name == 'asr' and engine_type == 'online':
+ from paddlespeech.server.engine.asr.online.asr_engine import ASREngine
+ return ASREngine()
elif engine_name == 'tts' and engine_type == 'inference':
from paddlespeech.server.engine.tts.paddleinference.tts_engine import TTSEngine
return TTSEngine()
diff --git a/paddlespeech/server/tests/asr/online/microphone_client.py b/paddlespeech/server/tests/asr/online/microphone_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..2ceaf6d03a07ab922477505c016e3870351d2574
--- /dev/null
+++ b/paddlespeech/server/tests/asr/online/microphone_client.py
@@ -0,0 +1,161 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+record wave from the mic
+"""
+import asyncio
+import json
+import logging
+import threading
+import wave
+from signal import SIGINT
+from signal import SIGTERM
+
+import pyaudio
+import websockets
+
+
+class ASRAudioHandler(threading.Thread):
+ def __init__(self, url="127.0.0.1", port=8091):
+ threading.Thread.__init__(self)
+ self.url = url
+ self.port = port
+ self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
+ self.fileName = "./output.wav"
+ self.chunk = 5120
+ self.format = pyaudio.paInt16
+ self.channels = 1
+ self.rate = 16000
+ self._running = True
+ self._frames = []
+ self.data_backup = []
+
+ def startrecord(self):
+ """
+ start a new thread to record wave
+ """
+ threading._start_new_thread(self.recording, ())
+
+ def recording(self):
+ """
+ recording wave
+ """
+ self._running = True
+ self._frames = []
+ p = pyaudio.PyAudio()
+ stream = p.open(
+ format=self.format,
+ channels=self.channels,
+ rate=self.rate,
+ input=True,
+ frames_per_buffer=self.chunk)
+ while (self._running):
+ data = stream.read(self.chunk)
+ self._frames.append(data)
+ self.data_backup.append(data)
+
+ stream.stop_stream()
+ stream.close()
+ p.terminate()
+
+ def save(self):
+ """
+ save wave data
+ """
+ p = pyaudio.PyAudio()
+ wf = wave.open(self.fileName, 'wb')
+ wf.setnchannels(self.channels)
+ wf.setsampwidth(p.get_sample_size(self.format))
+ wf.setframerate(self.rate)
+ wf.writeframes(b''.join(self.data_backup))
+ wf.close()
+ p.terminate()
+
+ def stoprecord(self):
+ """
+ stop recording
+ """
+ self._running = False
+
+ async def run(self):
+ aa = input("Start recording? (y/n)")
+ if aa.strip() == "y":
+ self.startrecord()
+ logging.info("*" * 10 + "开始录音,请输入语音")
+
+ async with websockets.connect(self.url) as ws:
+ # send the start signal
+ audio_info = json.dumps(
+ {
+ "name": "test.wav",
+ "signal": "start",
+ "nbest": 5
+ },
+ sort_keys=True,
+ indent=4,
+ separators=(',', ': '))
+ await ws.send(audio_info)
+ msg = await ws.recv()
+ logging.info("receive msg={}".format(msg))
+
+ # send bytes data
+ logging.info("结束录音请: Ctrl + c。继续请按回车。")
+ try:
+ while True:
+ while len(self._frames) > 0:
+ await ws.send(self._frames.pop(0))
+ msg = await ws.recv()
+ logging.info("receive msg={}".format(msg))
+ except asyncio.CancelledError:
+ # quit
+ # send finished
+ audio_info = json.dumps(
+ {
+ "name": "test.wav",
+ "signal": "end",
+ "nbest": 5
+ },
+ sort_keys=True,
+ indent=4,
+ separators=(',', ': '))
+ await ws.send(audio_info)
+ msg = await ws.recv()
+ logging.info("receive msg={}".format(msg))
+
+ self.stoprecord()
+ logging.info("*" * 10 + "录音结束")
+ self.save()
+ elif aa.strip() == "n":
+ exit()
+ else:
+ print("无效输入!")
+ exit()
+
+
+if __name__ == "__main__":
+
+ logging.basicConfig(level=logging.INFO)
+ logging.info("asr websocket client start")
+
+ handler = ASRAudioHandler("127.0.0.1", 8091)
+ loop = asyncio.get_event_loop()
+ main_task = asyncio.ensure_future(handler.run())
+ for signal in [SIGINT, SIGTERM]:
+ loop.add_signal_handler(signal, main_task.cancel)
+ try:
+ loop.run_until_complete(main_task)
+ finally:
+ loop.close()
+
+ logging.info("asr websocket client finished")
diff --git a/paddlespeech/server/tests/asr/online/websocket_client.py b/paddlespeech/server/tests/asr/online/websocket_client.py
new file mode 100644
index 0000000000000000000000000000000000000000..58b1a452c19a2e330b32be8826e4d4f693dae440
--- /dev/null
+++ b/paddlespeech/server/tests/asr/online/websocket_client.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#!/usr/bin/python
+# -*- coding: UTF-8 -*-
+import argparse
+import asyncio
+import json
+import logging
+
+import numpy as np
+import soundfile
+import websockets
+
+
+class ASRAudioHandler:
+ def __init__(self, url="127.0.0.1", port=8090):
+ self.url = url
+ self.port = port
+ self.url = "ws://" + self.url + ":" + str(self.port) + "/ws/asr"
+
+ def read_wave(self, wavfile_path: str):
+ samples, sample_rate = soundfile.read(wavfile_path, dtype='int16')
+ x_len = len(samples)
+ chunk_stride = 40 * 16 #40ms, sample_rate = 16kHz
+ chunk_size = 80 * 16 #80ms, sample_rate = 16kHz
+
+ if (x_len - chunk_size) % chunk_stride != 0:
+ padding_len_x = chunk_stride - (x_len - chunk_size) % chunk_stride
+ else:
+ padding_len_x = 0
+
+ padding = np.zeros((padding_len_x), dtype=samples.dtype)
+ padded_x = np.concatenate([samples, padding], axis=0)
+
+ num_chunk = (x_len + padding_len_x - chunk_size) / chunk_stride + 1
+ num_chunk = int(num_chunk)
+
+ for i in range(0, num_chunk):
+ start = i * chunk_stride
+ end = start + chunk_size
+ x_chunk = padded_x[start:end]
+ yield x_chunk
+
+ async def run(self, wavfile_path: str):
+ logging.info("send a message to the server")
+ # read the audio
+ # self.read_wave()
+ # send the websocket handshake request
+ async with websockets.connect(self.url) as ws:
+ # the server has received the handshake
+ # send the start signal
+ audio_info = json.dumps(
+ {
+ "name": "test.wav",
+ "signal": "start",
+ "nbest": 5
+ },
+ sort_keys=True,
+ indent=4,
+ separators=(',', ': '))
+ await ws.send(audio_info)
+ msg = await ws.recv()
+ logging.info("receive msg={}".format(msg))
+
+ # send chunk audio data to engine
+ for chunk_data in self.read_wave(wavfile_path):
+ await ws.send(chunk_data.tobytes())
+ msg = await ws.recv()
+ logging.info("receive msg={}".format(msg))
+
+ # finished
+ audio_info = json.dumps(
+ {
+ "name": "test.wav",
+ "signal": "end",
+ "nbest": 5
+ },
+ sort_keys=True,
+ indent=4,
+ separators=(',', ': '))
+ await ws.send(audio_info)
+ msg = await ws.recv()
+ logging.info("receive msg={}".format(msg))
+
+
+def main(args):
+ logging.basicConfig(level=logging.INFO)
+ logging.info("asr websocket client start")
+ handler = ASRAudioHandler("127.0.0.1", 8091)
+ loop = asyncio.get_event_loop()
+ loop.run_until_complete(handler.run(args.wavfile))
+ logging.info("asr websocket client finished")
+
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument(
+ "--wavfile",
+ action="store",
+ help="wav file path ",
+ default="./16_audio.wav")
+ args = parser.parse_args()
+
+ main(args)
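
The `read_wave` generator above streams 80 ms chunks with a 40 ms stride and pads the tail so the final chunk is full length. A small numpy sketch of that arithmetic, assuming 16 kHz audio and a made-up waveform length:

```python
import numpy as np

chunk_stride = 40 * 16   # 40 ms at 16 kHz -> 640 samples
chunk_size = 80 * 16     # 80 ms at 16 kHz -> 1280 samples

samples = np.zeros(50000, dtype=np.int16)  # stand-in for a real waveform
x_len = len(samples)

if (x_len - chunk_size) % chunk_stride != 0:
    padding_len = chunk_stride - (x_len - chunk_size) % chunk_stride
else:
    padding_len = 0
padded = np.concatenate([samples, np.zeros(padding_len, dtype=samples.dtype)])

num_chunk = (x_len + padding_len - chunk_size) // chunk_stride + 1
print(padding_len, num_chunk)  # 560 78
```
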
diff --git a/paddlespeech/server/utils/buffer.py b/paddlespeech/server/utils/buffer.py
new file mode 100644
index 0000000000000000000000000000000000000000..682357b34f542fe62d9819d225b3e5bdde3a30be
--- /dev/null
+++ b/paddlespeech/server/utils/buffer.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class Frame(object):
+ """Represents a "frame" of audio data."""
+
+ def __init__(self, bytes, timestamp, duration):
+ self.bytes = bytes
+ self.timestamp = timestamp
+ self.duration = duration
+
+
+class ChunkBuffer(object):
+ def __init__(self,
+ frame_duration_ms=80,
+ shift_ms=40,
+ sample_rate=16000,
+ sample_width=2):
+ self.sample_rate = sample_rate
+ self.frame_duration_ms = frame_duration_ms
+ self.shift_ms = shift_ms
+ self.remained_audio = b''
+ self.sample_width = sample_width # int16 = 2; float32 = 4
+
+ def frame_generator(self, audio):
+ """Generates audio frames from PCM audio data.
+ Takes the desired frame duration in milliseconds, the PCM data, and
+ the sample rate.
+ Yields Frames of the requested duration.
+ """
+ audio = self.remained_audio + audio
+ self.remained_audio = b''
+
+ n = int(self.sample_rate * (self.frame_duration_ms / 1000.0) *
+ self.sample_width)
+ shift_n = int(self.sample_rate * (self.shift_ms / 1000.0) *
+ self.sample_width)
+ offset = 0
+ timestamp = 0.0
+ duration = (float(n) / self.sample_rate) / self.sample_width
+ shift_duration = (float(shift_n) / self.sample_rate) / self.sample_width
+ while offset + n <= len(audio):
+ yield Frame(audio[offset:offset + n], timestamp, duration)
+ timestamp += shift_duration
+ offset += shift_n
+
+ self.remained_audio += audio[offset:]
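
A minimal usage sketch for the new `ChunkBuffer`, assuming the paddlespeech package from this PR is importable; it feeds one second of silent int16 PCM and counts the 80 ms frames produced with a 40 ms shift:

```python
import numpy as np

from paddlespeech.server.utils.buffer import ChunkBuffer

chunk_buffer = ChunkBuffer(
    frame_duration_ms=80, shift_ms=40, sample_rate=16000, sample_width=2)

pcm = np.zeros(16000, dtype=np.int16).tobytes()    # 1 s of silence, 32000 bytes
frames = list(chunk_buffer.frame_generator(pcm))
print(len(frames), len(frames[0].bytes))           # 24 frames of 2560 bytes each
```
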
diff --git a/paddlespeech/server/utils/vad.py b/paddlespeech/server/utils/vad.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2dcf68b80f2985a41ffb44d9501b973730b4ba2
--- /dev/null
+++ b/paddlespeech/server/utils/vad.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+
+import webrtcvad
+
+
+class VADAudio():
+ def __init__(self,
+ aggressiveness=2,
+ rate=16000,
+ frame_duration_ms=20,
+ sample_width=2,
+ padding_ms=200,
+ padding_ratio=0.9):
+ """Initializes VAD with given aggressivenes and sets up internal queues"""
+ self.vad = webrtcvad.Vad(aggressiveness)
+ self.rate = rate
+ self.sample_width = sample_width
+ self.frame_duration_ms = frame_duration_ms
+ self._frame_length = int(rate * (frame_duration_ms / 1000.0) *
+ self.sample_width)
+ self._buffer_queue = collections.deque()
+ self.ring_buffer = collections.deque(maxlen=padding_ms //
+ frame_duration_ms)
+ self._ratio = padding_ratio
+ self.triggered = False
+
+ def add_audio(self, audio):
+ """Adds new audio to internal queue"""
+ for x in audio:
+ self._buffer_queue.append(x)
+
+ def frame_generator(self):
+ """Generator that yields audio frames of frame_duration_ms"""
+ while len(self._buffer_queue) > self._frame_length:
+ frame = bytearray()
+ for _ in range(self._frame_length):
+ frame.append(self._buffer_queue.popleft())
+ yield bytes(frame)
+
+ def vad_collector(self):
+ """Generator that yields series of consecutive audio frames comprising each utterence, separated by yielding a single None.
+ Determines voice activity by ratio of frames in padding_ms. Uses a buffer to include padding_ms prior to being triggered.
+ Example: (frame, ..., frame, None, frame, ..., frame, None, ...)
+ |---utterence---| |---utterence---|
+ """
+ for frame in self.frame_generator():
+ is_speech = self.vad.is_speech(frame, self.rate)
+ if not self.triggered:
+ self.ring_buffer.append((frame, is_speech))
+ num_voiced = len(
+ [f for f, speech in self.ring_buffer if speech])
+ if num_voiced > self._ratio * self.ring_buffer.maxlen:
+ self.triggered = True
+ for f, s in self.ring_buffer:
+ yield f
+ self.ring_buffer.clear()
+ else:
+ yield frame
+ self.ring_buffer.append((frame, is_speech))
+ num_unvoiced = len(
+ [f for f, speech in self.ring_buffer if not speech])
+ if num_unvoiced > self._ratio * self.ring_buffer.maxlen:
+ self.triggered = False
+ yield None
+ self.ring_buffer.clear()
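
The `vad_collector` above toggles its `triggered` flag when more than `padding_ratio` of the ring buffer is voiced (or unvoiced). A dependency-free sketch of just that ring-buffer logic, with made-up speech flags standing in for webrtcvad decisions:

```python
import collections

padding_ms, frame_duration_ms, ratio = 200, 20, 0.9
ring_buffer = collections.deque(maxlen=padding_ms // frame_duration_ms)  # 10 frames
triggered = False

# stand-in VAD decisions for a frame stream: silence, speech, silence
flags = [False] * 12 + [True] * 20 + [False] * 15
for i, is_speech in enumerate(flags):
    ring_buffer.append(is_speech)
    if not triggered:
        if sum(ring_buffer) > ratio * ring_buffer.maxlen:
            triggered = True
            print(f"utterance starts near frame {i}")
            ring_buffer.clear()
    else:
        if sum(1 for s in ring_buffer if not s) > ratio * ring_buffer.maxlen:
            triggered = False
            print(f"utterance ends near frame {i}")
            ring_buffer.clear()
```
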
diff --git a/paddlespeech/server/ws/__init__.py b/paddlespeech/server/ws/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47
--- /dev/null
+++ b/paddlespeech/server/ws/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/server/ws/api.py b/paddlespeech/server/ws/api.py
new file mode 100644
index 0000000000000000000000000000000000000000..10664d11470ba4c98816b1c3a1fa30d40fe67a02
--- /dev/null
+++ b/paddlespeech/server/ws/api.py
@@ -0,0 +1,38 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import List
+
+from fastapi import APIRouter
+
+from paddlespeech.server.ws.asr_socket import router as asr_router
+
+_router = APIRouter()
+
+
+def setup_router(api_list: List):
+ """setup router for fastapi
+ Args:
+ api_list (List): [asr, tts]
+ Returns:
+ APIRouter
+ """
+ for api_name in api_list:
+ if api_name == 'asr':
+ _router.include_router(asr_router)
+ elif api_name == 'tts':
+ pass
+ else:
+ pass
+
+ return _router
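
A minimal sketch of wiring this websocket router into an app, mirroring what `main.py` does when `protocol: 'websocket'` is configured (assumes fastapi and this PR's paddlespeech are installed; a real deployment also needs `init_engine_pool` to have run):

```python
from fastapi import FastAPI

from paddlespeech.server.ws.api import setup_router

app = FastAPI(title="PaddleSpeech Serving API (ws sketch)")
app.include_router(setup_router(['asr']))  # registers the /ws/asr endpoint
# serve with, e.g.: uvicorn this_module:app --host 0.0.0.0 --port 8091
```
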
diff --git a/paddlespeech/server/ws/asr_socket.py b/paddlespeech/server/ws/asr_socket.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea19816b69ff4719220784099205a6d8a5bae4ed
--- /dev/null
+++ b/paddlespeech/server/ws/asr_socket.py
@@ -0,0 +1,100 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import json
+
+import numpy as np
+from fastapi import APIRouter
+from fastapi import WebSocket
+from fastapi import WebSocketDisconnect
+from starlette.websockets import WebSocketState as WebSocketState
+
+from paddlespeech.server.engine.engine_pool import get_engine_pool
+from paddlespeech.server.utils.buffer import ChunkBuffer
+from paddlespeech.server.utils.vad import VADAudio
+
+router = APIRouter()
+
+
+@router.websocket('/ws/asr')
+async def websocket_endpoint(websocket: WebSocket):
+
+ await websocket.accept()
+
+ engine_pool = get_engine_pool()
+ asr_engine = engine_pool['asr']
+ # init buffer
+ chunk_buffer_conf = asr_engine.config.chunk_buffer_conf
+ chunk_buffer = ChunkBuffer(
+ sample_rate=chunk_buffer_conf['sample_rate'],
+ sample_width=chunk_buffer_conf['sample_width'])
+ # init vad
+ vad_conf = asr_engine.config.vad_conf
+ vad = VADAudio(
+ aggressiveness=vad_conf['aggressiveness'],
+ rate=vad_conf['sample_rate'],
+ frame_duration_ms=vad_conf['frame_duration_ms'])
+
+ try:
+ while True:
+ # careful here, changed the source code from starlette.websockets
+ assert websocket.application_state == WebSocketState.CONNECTED
+ message = await websocket.receive()
+ websocket._raise_on_disconnect(message)
+ if "text" in message:
+ message = json.loads(message["text"])
+ if 'signal' not in message:
+ resp = {"status": "ok", "message": "no valid json data"}
+ await websocket.send_json(resp)
+
+ if message['signal'] == 'start':
+ resp = {"status": "ok", "signal": "server_ready"}
+ # do something at the beginning here
+ await websocket.send_json(resp)
+ elif message['signal'] == 'end':
+ engine_pool = get_engine_pool()
+ asr_engine = engine_pool['asr']
+ # reset the engine for a new connection
+ asr_engine.reset()
+ resp = {"status": "ok", "signal": "finished"}
+ await websocket.send_json(resp)
+ break
+ else:
+ resp = {"status": "ok", "message": "no valid json data"}
+ await websocket.send_json(resp)
+ elif "bytes" in message:
+ message = message["bytes"]
+
+ # vad for input bytes audio
+ vad.add_audio(message)
+ message = b''.join(f for f in vad.vad_collector()
+ if f is not None)
+
+ engine_pool = get_engine_pool()
+ asr_engine = engine_pool['asr']
+ asr_results = ""
+ frames = chunk_buffer.frame_generator(message)
+ for frame in frames:
+ samples = np.frombuffer(frame.bytes, dtype=np.int16)
+ sample_rate = asr_engine.config.sample_rate
+ x_chunk, x_chunk_lens = asr_engine.preprocess(samples,
+ sample_rate)
+ asr_engine.run(x_chunk, x_chunk_lens)
+ asr_results = asr_engine.postprocess()
+
+ asr_results = asr_engine.postprocess()
+ resp = {'asr_results': asr_results}
+
+ await websocket.send_json(resp)
+ except WebSocketDisconnect:
+ pass
diff --git a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
index 3fded29b12a180d3153d42ecb4dc810ce25fa900..4c92ad1cc46ee22e165f560eb4095d4044a559fe 100644
--- a/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
+++ b/paddlespeech/t2s/exps/fastspeech2/gen_gta_mel.py
@@ -13,7 +13,6 @@
# limitations under the License.
# generate mels using durations.txt
# for mb melgan finetune
-# 长度和原本的 mel 不一致怎么办?
import argparse
import os
from pathlib import Path
diff --git a/paddlespeech/t2s/exps/synthesize_streaming.py b/paddlespeech/t2s/exps/synthesize_streaming.py
new file mode 100644
index 0000000000000000000000000000000000000000..f38b2d3522f0c047ef6b3351f2db71130d2cebc4
--- /dev/null
+++ b/paddlespeech/t2s/exps/synthesize_streaming.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import math
+from pathlib import Path
+
+import numpy as np
+import paddle
+import soundfile as sf
+import yaml
+from timer import timer
+from yacs.config import CfgNode
+
+from paddlespeech.s2t.utils.dynamic_import import dynamic_import
+from paddlespeech.t2s.exps.syn_utils import get_frontend
+from paddlespeech.t2s.exps.syn_utils import get_sentences
+from paddlespeech.t2s.exps.syn_utils import get_voc_inference
+from paddlespeech.t2s.exps.syn_utils import model_alias
+from paddlespeech.t2s.utils import str2bool
+
+
+def denorm(data, mean, std):
+ return data * std + mean
+
+
+def get_chunks(data, chunk_size, pad_size):
+ data_len = data.shape[1]
+ chunks = []
+ n = math.ceil(data_len / chunk_size)
+ for i in range(n):
+ start = max(0, i * chunk_size - pad_size)
+ end = min((i + 1) * chunk_size + pad_size, data_len)
+ chunks.append(data[:, start:end, :])
+ return chunks
+
+
+def evaluate(args):
+
+ # Init body.
+ with open(args.am_config) as f:
+ am_config = CfgNode(yaml.safe_load(f))
+ with open(args.voc_config) as f:
+ voc_config = CfgNode(yaml.safe_load(f))
+
+ print("========Args========")
+ print(yaml.safe_dump(vars(args)))
+ print("========Config========")
+ print(am_config)
+ print(voc_config)
+
+ sentences = get_sentences(args)
+
+ # frontend
+ frontend = get_frontend(args)
+
+ with open(args.phones_dict, "r") as f:
+ phn_id = [line.strip().split() for line in f.readlines()]
+ vocab_size = len(phn_id)
+ print("vocab_size:", vocab_size)
+
+ # acoustic model, only support fastspeech2 here now!
+ # am_inference, am_name, am_dataset = get_am_inference(args, am_config)
+ # model: {model_name}_{dataset}
+ am_name = args.am[:args.am.rindex('_')]
+ am_dataset = args.am[args.am.rindex('_') + 1:]
+ odim = am_config.n_mels
+
+ am_class = dynamic_import(am_name, model_alias)
+ am = am_class(idim=vocab_size, odim=odim, **am_config["model"])
+ am.set_state_dict(paddle.load(args.am_ckpt)["main_params"])
+ am.eval()
+ am_mu, am_std = np.load(args.am_stat)
+ am_mu = paddle.to_tensor(am_mu)
+ am_std = paddle.to_tensor(am_std)
+
+ # vocoder
+ voc_inference = get_voc_inference(args, voc_config)
+
+ output_dir = Path(args.output_dir)
+ output_dir.mkdir(parents=True, exist_ok=True)
+ merge_sentences = True
+
+ N = 0
+ T = 0
+ chunk_size = args.chunk_size
+ pad_size = args.pad_size
+
+ for utt_id, sentence in sentences:
+ with timer() as t:
+ get_tone_ids = False
+
+ if args.lang == 'zh':
+ input_ids = frontend.get_input_ids(
+ sentence,
+ merge_sentences=merge_sentences,
+ get_tone_ids=get_tone_ids)
+
+ phone_ids = input_ids["phone_ids"]
+ else:
+ print("lang should in be 'zh' here!")
+ # merge_sentences=True here, so we only use the first item of phone_ids
+ phone_ids = phone_ids[0]
+ with paddle.no_grad():
+ # acoustic model
+ orig_hs, h_masks = am.encoder_infer(phone_ids)
+
+ if args.am_streaming:
+ hss = get_chunks(orig_hs, chunk_size, pad_size)
+ chunk_num = len(hss)
+ mel_list = []
+ for i, hs in enumerate(hss):
+ before_outs, _ = am.decoder(hs)
+ after_outs = before_outs + am.postnet(
+ before_outs.transpose((0, 2, 1))).transpose(
+ (0, 2, 1))
+ normalized_mel = after_outs[0]
+ sub_mel = denorm(normalized_mel, am_mu, am_std)
+ # clip output part of pad
+ if i == 0:
+ sub_mel = sub_mel[:-pad_size]
+ elif i == chunk_num - 1:
+ # the last chunk is never fully padded on the right
+ sub_mel = sub_mel[pad_size:]
+ else:
+ # the chunks near the end may also lack full padding on the right
+ sub_mel = sub_mel[pad_size:(chunk_size + pad_size) -
+ sub_mel.shape[0]]
+ mel_list.append(sub_mel)
+ mel = paddle.concat(mel_list, axis=0)
+
+ else:
+ before_outs, _ = am.decoder(orig_hs)
+ after_outs = before_outs + am.postnet(
+ before_outs.transpose((0, 2, 1))).transpose((0, 2, 1))
+ normalized_mel = after_outs[0]
+ mel = denorm(normalized_mel, am_mu, am_std)
+
+ # vocoder
+ wav = voc_inference(mel)
+
+ wav = wav.numpy()
+ N += wav.size
+ T += t.elapse
+ speed = wav.size / t.elapse
+ rtf = am_config.fs / speed
+ print(
+ f"{utt_id}, mel: {mel.shape}, wave: {wav.shape}, time: {t.elapse}s, Hz: {speed}, RTF: {rtf}."
+ )
+ sf.write(
+ str(output_dir / (utt_id + ".wav")), wav, samplerate=am_config.fs)
+ print(f"{utt_id} done!")
+ print(f"generation speed: {N / T}Hz, RTF: {am_config.fs / (N / T) }")
+
+
+def parse_args():
+ # parse args and config and redirect to train_sp
+ parser = argparse.ArgumentParser(
+ description="Synthesize with acoustic model & vocoder")
+ # acoustic model
+ parser.add_argument(
+ '--am',
+ type=str,
+ default='fastspeech2_csmsc',
+ choices=['fastspeech2_csmsc'],
+ help='Choose acoustic model type of tts task.')
+ parser.add_argument(
+ '--am_config',
+ type=str,
+ default=None,
+ help='Config of acoustic model. Use default config when it is None.')
+ parser.add_argument(
+ '--am_ckpt',
+ type=str,
+ default=None,
+ help='Checkpoint file of acoustic model.')
+ parser.add_argument(
+ "--am_stat",
+ type=str,
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training acoustic model."
+ )
+ parser.add_argument(
+ "--phones_dict", type=str, default=None, help="phone vocabulary file.")
+ parser.add_argument(
+ "--tones_dict", type=str, default=None, help="tone vocabulary file.")
+
+ # vocoder
+ parser.add_argument(
+ '--voc',
+ type=str,
+ default='pwgan_csmsc',
+ choices=[
+ 'pwgan_csmsc',
+ 'pwgan_ljspeech',
+ 'pwgan_aishell3',
+ 'pwgan_vctk',
+ 'mb_melgan_csmsc',
+ 'style_melgan_csmsc',
+ 'hifigan_csmsc',
+ 'hifigan_ljspeech',
+ 'hifigan_aishell3',
+ 'hifigan_vctk',
+ 'wavernn_csmsc',
+ ],
+ help='Choose vocoder type of tts task.')
+ parser.add_argument(
+ '--voc_config',
+ type=str,
+ default=None,
+ help='Config of voc. Use default config when it is None.')
+ parser.add_argument(
+ '--voc_ckpt', type=str, default=None, help='Checkpoint file of voc.')
+ parser.add_argument(
+ "--voc_stat",
+ type=str,
+ default=None,
+ help="mean and standard deviation used to normalize spectrogram when training voc."
+ )
+ # other
+ parser.add_argument(
+ '--lang',
+ type=str,
+ default='zh',
+ help='Choose model language. zh or en')
+
+ parser.add_argument(
+ "--ngpu", type=int, default=1, help="if ngpu == 0, use cpu.")
+ parser.add_argument(
+ "--text",
+ type=str,
+ help="text to synthesize, a 'utt_id sentence' pair per line.")
+
+ parser.add_argument(
+ "--am_streaming",
+ type=str2bool,
+ default=False,
+ help="whether use streaming acoustic model")
+ parser.add_argument(
+ "--chunk_size", type=int, default=42, help="chunk size of am streaming")
+ parser.add_argument(
+ "--pad_size", type=int, default=12, help="pad size of am streaming")
+
+ parser.add_argument("--output_dir", type=str, help="output dir.")
+
+ args = parser.parse_args()
+ return args
+
+
+def main():
+ args = parse_args()
+
+ if args.ngpu == 0:
+ paddle.set_device("cpu")
+ elif args.ngpu > 0:
+ paddle.set_device("gpu")
+ else:
+ print("ngpu should >= 0 !")
+
+ evaluate(args)
+
+
+if __name__ == "__main__":
+ main()
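
The streaming path above splits the encoder output into fixed-size chunks with symmetric padding (`--chunk_size 42`, `--pad_size 12` by default) and trims the padded regions after decoding. A numpy sketch of the chunking step alone, reusing the `get_chunks` logic from the hunk above with a made-up encoder output:

```python
import math

import numpy as np


def get_chunks(data, chunk_size, pad_size):
    # same slicing as synthesize_streaming.get_chunks for (B, T, C) input
    data_len = data.shape[1]
    chunks = []
    n = math.ceil(data_len / chunk_size)
    for i in range(n):
        start = max(0, i * chunk_size - pad_size)
        end = min((i + 1) * chunk_size + pad_size, data_len)
        chunks.append(data[:, start:end, :])
    return chunks


hs = np.zeros((1, 100, 256), dtype=np.float32)  # stand-in encoder output (B, T, C)
chunks = get_chunks(hs, chunk_size=42, pad_size=12)
print([c.shape[1] for c in chunks])  # [54, 66, 28]
```
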
diff --git a/paddlespeech/t2s/exps/transformer_tts/train.py b/paddlespeech/t2s/exps/transformer_tts/train.py
index d521ce89607a969ab4145aebfa24c64afc049a50..45ecb269bac033fed4287e5083ced6ce92b89f35 100644
--- a/paddlespeech/t2s/exps/transformer_tts/train.py
+++ b/paddlespeech/t2s/exps/transformer_tts/train.py
@@ -42,10 +42,12 @@ from paddlespeech.t2s.training.trainer import Trainer
def train_sp(args, config):
# decides device type and whether to run in parallel
# setup running environment correctly
- if (not paddle.is_compiled_with_cuda()) or args.ngpu == 0:
- paddle.set_device("cpu")
- else:
+ if paddle.is_compiled_with_cuda() and args.ngpu > 0:
paddle.set_device("gpu")
+ elif paddle.is_compiled_with_npu() and args.ngpu > 0:
+ paddle.set_device("npu")
+ else:
+ paddle.set_device("cpu")
world_size = paddle.distributed.get_world_size()
if world_size > 1:
paddle.distributed.init_parallel_env()
diff --git a/paddlespeech/t2s/frontend/zh_normalization/chronology.py b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
index ea51891353ad8c6fe942edcdf7efb22ec60526ce..ea4558e2a7abba4ff454656b82e67b3f5c483bf2 100644
--- a/paddlespeech/t2s/frontend/zh_normalization/chronology.py
+++ b/paddlespeech/t2s/frontend/zh_normalization/chronology.py
@@ -64,7 +64,7 @@ def replace_time(match) -> str:
result = f"{num2str(hour)}点"
if minute.lstrip('0'):
if int(minute) == 30:
- result += f"半"
+ result += "半"
else:
result += f"{_time_num2str(minute)}分"
if second and second.lstrip('0'):
@@ -75,7 +75,7 @@ def replace_time(match) -> str:
result += f"{num2str(hour_2)}点"
if minute_2.lstrip('0'):
if int(minute) == 30:
- result += f"半"
+ result += "半"
else:
result += f"{_time_num2str(minute_2)}分"
if second_2 and second_2.lstrip('0'):
diff --git a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
index 73f5498e7e3d633da5f6f0be8c2f97e35e1a0072..c2f1e218f15ba7178bb20751984db8c2b130fe12 100644
--- a/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
+++ b/paddlespeech/t2s/models/fastspeech2/fastspeech2.py
@@ -14,6 +14,7 @@
# Modified from espnet(https://github.com/espnet/espnet)
"""Fastspeech2 related modules for paddle"""
from typing import Dict
+from typing import List
from typing import Sequence
from typing import Tuple
from typing import Union
@@ -32,6 +33,8 @@ from paddlespeech.t2s.modules.predictor.duration_predictor import DurationPredic
from paddlespeech.t2s.modules.predictor.length_regulator import LengthRegulator
from paddlespeech.t2s.modules.predictor.variance_predictor import VariancePredictor
from paddlespeech.t2s.modules.tacotron2.decoder import Postnet
+from paddlespeech.t2s.modules.transformer.encoder import CNNDecoder
+from paddlespeech.t2s.modules.transformer.encoder import CNNPostnet
from paddlespeech.t2s.modules.transformer.encoder import ConformerEncoder
from paddlespeech.t2s.modules.transformer.encoder import TransformerEncoder
@@ -97,6 +100,12 @@ class FastSpeech2(nn.Layer):
zero_triu: bool=False,
conformer_enc_kernel_size: int=7,
conformer_dec_kernel_size: int=31,
+ # for CNN Decoder
+ cnn_dec_dropout_rate: float=0.2,
+ cnn_postnet_dropout_rate: float=0.2,
+ cnn_postnet_resblock_kernel_sizes: List[int]=[256, 256],
+ cnn_postnet_kernel_size: int=5,
+ cnn_decoder_embedding_dim: int=256,
# duration predictor
duration_predictor_layers: int=2,
duration_predictor_chans: int=384,
@@ -392,6 +401,13 @@ class FastSpeech2(nn.Layer):
activation_type=conformer_activation_type,
use_cnn_module=use_cnn_in_conformer,
cnn_module_kernel=conformer_dec_kernel_size, )
+ elif decoder_type == 'cnndecoder':
+ self.decoder = CNNDecoder(
+ emb_dim=adim,
+ odim=odim,
+ kernel_size=cnn_postnet_kernel_size,
+ dropout_rate=cnn_dec_dropout_rate,
+ resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes)
else:
raise ValueError(f"{decoder_type} is not supported.")
@@ -399,14 +415,21 @@ class FastSpeech2(nn.Layer):
self.feat_out = nn.Linear(adim, odim * reduction_factor)
# define postnet
- self.postnet = (None if postnet_layers == 0 else Postnet(
- idim=idim,
- odim=odim,
- n_layers=postnet_layers,
- n_chans=postnet_chans,
- n_filts=postnet_filts,
- use_batch_norm=use_batch_norm,
- dropout_rate=postnet_dropout_rate, ))
+ if decoder_type == 'cnndecoder':
+ self.postnet = CNNPostnet(
+ odim=odim,
+ kernel_size=cnn_postnet_kernel_size,
+ dropout_rate=cnn_postnet_dropout_rate,
+ resblock_kernel_sizes=cnn_postnet_resblock_kernel_sizes)
+ else:
+ self.postnet = (None if postnet_layers == 0 else Postnet(
+ idim=idim,
+ odim=odim,
+ n_layers=postnet_layers,
+ n_chans=postnet_chans,
+ n_filts=postnet_filts,
+ use_batch_norm=use_batch_norm,
+ dropout_rate=postnet_dropout_rate, ))
nn.initializer.set_global_initializer(None)
@@ -486,6 +509,7 @@ class FastSpeech2(nn.Layer):
ps: paddle.Tensor=None,
es: paddle.Tensor=None,
is_inference: bool=False,
+ return_after_enc=False,
alpha: float=1.0,
spk_emb=None,
spk_id=None,
@@ -562,15 +586,21 @@ class FastSpeech2(nn.Layer):
[olen // self.reduction_factor for olen in olens.numpy()])
else:
olens_in = olens
+ # (B, 1, T)
h_masks = self._source_mask(olens_in)
else:
h_masks = None
- # (B, Lmax, adim)
+ if return_after_enc:
+ return hs, h_masks
+ # (B, Lmax, adim)
zs, _ = self.decoder(hs, h_masks)
# (B, Lmax, odim)
- before_outs = self.feat_out(zs).reshape(
- (paddle.shape(zs)[0], -1, self.odim))
+ if self.decoder_type == 'cnndecoder':
+ before_outs = zs
+ else:
+ before_outs = self.feat_out(zs).reshape(
+ (paddle.shape(zs)[0], -1, self.odim))
# postnet -> (B, Lmax//r * r, odim)
if self.postnet is None:
@@ -581,10 +611,42 @@ class FastSpeech2(nn.Layer):
return before_outs, after_outs, d_outs, p_outs, e_outs
+ def encoder_infer(
+ self,
+ text: paddle.Tensor,
+ alpha: float=1.0,
+ spk_emb=None,
+ spk_id=None,
+ tone_id=None,
+ ) -> Tuple[paddle.Tensor, paddle.Tensor, paddle.Tensor]:
+ # input of embedding must be int64
+ x = paddle.cast(text, 'int64')
+ # setup batch axis
+ ilens = paddle.shape(x)[0]
+
+ xs = x.unsqueeze(0)
+
+ if spk_emb is not None:
+ spk_emb = spk_emb.unsqueeze(0)
+
+ if tone_id is not None:
+ tone_id = tone_id.unsqueeze(0)
+
+ # (1, L, odim)
+ hs, h_masks = self._forward(
+ xs,
+ ilens,
+ is_inference=True,
+ return_after_enc=True,
+ alpha=alpha,
+ spk_emb=spk_emb,
+ spk_id=spk_id,
+ tone_id=tone_id)
+ return hs, h_masks
+
def inference(
self,
text: paddle.Tensor,
- speech: paddle.Tensor=None,
durations: paddle.Tensor=None,
pitch: paddle.Tensor=None,
energy: paddle.Tensor=None,
@@ -598,7 +660,6 @@ class FastSpeech2(nn.Layer):
Args:
text(Tensor(int64)): Input sequence of characters (T,).
- speech(Tensor, optional): Feature sequence to extract style (N, idim).
durations(Tensor, optional (int64)): Groundtruth of duration (T,).
pitch(Tensor, optional): Groundtruth of token-averaged pitch (T, 1).
energy(Tensor, optional): Groundtruth of token-averaged energy (T, 1).
@@ -615,15 +676,11 @@ class FastSpeech2(nn.Layer):
"""
# input of embedding must be int64
x = paddle.cast(text, 'int64')
- y = speech
d, p, e = durations, pitch, energy
# setup batch axis
ilens = paddle.shape(x)[0]
- xs, ys = x.unsqueeze(0), None
-
- if y is not None:
- ys = y.unsqueeze(0)
+ xs = x.unsqueeze(0)
if spk_emb is not None:
spk_emb = spk_emb.unsqueeze(0)
@@ -641,7 +698,6 @@ class FastSpeech2(nn.Layer):
_, outs, d_outs, p_outs, e_outs = self._forward(
xs,
ilens,
- ys,
ds=ds,
ps=ps,
es=es,
@@ -654,7 +710,6 @@ class FastSpeech2(nn.Layer):
_, outs, d_outs, p_outs, e_outs = self._forward(
xs,
ilens,
- ys,
is_inference=True,
alpha=alpha,
spk_emb=spk_emb,
@@ -802,7 +857,6 @@ class StyleFastSpeech2Inference(FastSpeech2Inference):
Args:
text(Tensor(int64)): Input sequence of characters (T,).
- speech(Tensor, optional): Feature sequence to extract style (N, idim).
durations(paddle.Tensor/np.ndarray, optional (int64)): Groundtruth of duration (T,), this will overwrite the set of durations_scale and durations_bias
durations_scale(int/float, optional):
durations_bias(int/float, optional):
diff --git a/paddlespeech/t2s/models/hifigan/hifigan.py b/paddlespeech/t2s/models/hifigan/hifigan.py
index 116376eceb4ef5c7847325e0a20b088641cb153c..ac5ff204fae661dbc159f53970389ce3287a7b9f 100644
--- a/paddlespeech/t2s/models/hifigan/hifigan.py
+++ b/paddlespeech/t2s/models/hifigan/hifigan.py
@@ -1,7 +1,17 @@
-# -*- coding: utf-8 -*-
-"""HiFi-GAN Modules.
-This code is based on https://github.com/jik876/hifi-gan.
-"""
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This code is based on https://github.com/jik876/hifi-gan.
import copy
from typing import Any
from typing import Dict
diff --git a/paddlespeech/t2s/models/tacotron2/tacotron2.py b/paddlespeech/t2s/models/tacotron2/tacotron2.py
index abb691b4dcd884550d0ba16a3eae783f4fd753fc..7b306e4820de10db9ae8551fffe62ab50d055905 100644
--- a/paddlespeech/t2s/models/tacotron2/tacotron2.py
+++ b/paddlespeech/t2s/models/tacotron2/tacotron2.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# Modified from espnet(https://github.com/espnet/espnet)
"""Tacotron 2 related modules for paddle"""
import logging
from typing import Dict
diff --git a/paddlespeech/t2s/models/wavernn/wavernn.py b/paddlespeech/t2s/models/wavernn/wavernn.py
index 9590704328b757b27a6626e8e3a5c675da41c8e0..b4b8b48091ff2ad4f2774af2662afd5e0a48b79c 100644
--- a/paddlespeech/t2s/models/wavernn/wavernn.py
+++ b/paddlespeech/t2s/models/wavernn/wavernn.py
@@ -11,6 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
+# Modified from https://github.com/fatchord/WaveRNN
import sys
import time
from typing import List
diff --git a/paddlespeech/t2s/modules/losses.py b/paddlespeech/t2s/modules/losses.py
index 93644e24aac93553bb938672ab22065b8740051e..db31bcfbb4361281df49d3afeb00dfb97c59d7f9 100644
--- a/paddlespeech/t2s/modules/losses.py
+++ b/paddlespeech/t2s/modules/losses.py
@@ -489,7 +489,7 @@ def stft(x,
"""
# calculate window
window = signal.get_window(window, win_length, fftbins=True)
- window = paddle.to_tensor(window)
+ window = paddle.to_tensor(window, dtype=x.dtype)
x_stft = paddle.signal.stft(
x,
fft_size,
@@ -896,7 +896,7 @@ class MelSpectrogram(nn.Layer):
# calculate window
window = signal.get_window(
self.window, self.win_length, fftbins=True)
- window = paddle.to_tensor(window)
+ window = paddle.to_tensor(window, dtype=x.dtype)
else:
window = None
diff --git a/paddlespeech/t2s/modules/predictor/length_regulator.py b/paddlespeech/t2s/modules/predictor/length_regulator.py
index 2472c413beace969408bd0726d1483a6abb1e811..be788e6ed214af92aec0da31582193f56bcfb419 100644
--- a/paddlespeech/t2s/modules/predictor/length_regulator.py
+++ b/paddlespeech/t2s/modules/predictor/length_regulator.py
@@ -73,15 +73,21 @@ class LengthRegulator(nn.Layer):
batch_size, t_enc = paddle.shape(durations)
slens = paddle.sum(durations, -1)
t_dec = paddle.max(slens)
- M = paddle.zeros([batch_size, t_dec, t_enc])
- for i in range(batch_size):
- k = 0
- for j in range(t_enc):
- d = durations[i, j]
- # If the d == 0, slice action is meaningless and not supported in paddle
- if d >= 1:
- M[i, k:k + d, j] = 1
- k += d
+ t_dec_1 = t_dec + 1
+ flatten_duration = paddle.cumsum(
+ paddle.reshape(durations, [batch_size * t_enc])) + 1
+ init = paddle.zeros(t_dec_1)
+ m_batch = batch_size * t_enc
+ M = paddle.zeros([t_dec_1, m_batch])
+ for i in range(m_batch):
+ d = flatten_duration[i]
+ m = paddle.concat(
+ [paddle.ones(d), paddle.zeros(t_dec_1 - d)], axis=0)
+ M[:, i] = m - init
+ init = m
+ M = paddle.reshape(M, shape=[t_dec_1, batch_size, t_enc])
+ M = M[1:, :, :]
+ M = paddle.transpose(M, (1, 0, 2))
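+ # A worked example of the vectorized expansion above (single utterance):
+ # durations = [2, 1, 3] gives t_dec = 6 and flatten_duration = cumsum + 1 = [3, 4, 7].
+ # The column-wise step vectors m and their differences m - init build, after
+ # dropping the first (offset) row, the alignment matrix
+ #   M = [[1, 0, 0],
+ #        [1, 0, 0],
+ #        [0, 1, 0],
+ #        [0, 0, 1],
+ #        [0, 0, 1],
+ #        [0, 0, 1]]
+ # so M @ encodings repeats encoding j exactly durations[j] times.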
encodings = paddle.matmul(M, encodings)
return encodings
diff --git a/paddlespeech/t2s/modules/transformer/encoder.py b/paddlespeech/t2s/modules/transformer/encoder.py
index 2b3ee788e845b060fc4d9645eea170e6d21b1550..f64202824c9a7ceb63395641c22326f06d768809 100644
--- a/paddlespeech/t2s/modules/transformer/encoder.py
+++ b/paddlespeech/t2s/modules/transformer/encoder.py
@@ -515,3 +515,132 @@ class ConformerEncoder(BaseEncoder):
if self.intermediate_layers is not None:
return xs, masks, intermediate_outputs
return xs, masks
+
+
+class Conv1dResidualBlock(nn.Layer):
+ """
+ Special module for the simplified version of the Encoder class: a 1-D convolution block with a residual connection.
+ """
+
+ def __init__(self,
+ idim: int=256,
+ odim: int=256,
+ kernel_size: int=5,
+ dropout_rate: float=0.2):
+ super().__init__()
+ self.main_block = nn.Sequential(
+ nn.Conv1D(
+ idim, odim, kernel_size=kernel_size, padding=kernel_size // 2),
+ nn.ReLU(),
+ nn.BatchNorm1D(odim),
+ nn.Dropout(p=dropout_rate))
+ self.conv1d_residual = nn.Conv1D(idim, odim, kernel_size=1)
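+ # the 1x1 convolution above projects the input to odim, so the residual
+ # addition in forward() works even when idim != odim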
+
+ def forward(self, xs):
+ """Encode input sequence.
+ Args:
+ xs (Tensor): Input tensor (#batch, idim, T).
+ Returns:
+ Tensor: Output tensor (#batch, odim, T).
+ """
+ outputs = self.main_block(xs)
+ outputs = self.conv1d_residual(xs) + outputs
+ return outputs
+
+
+class CNNDecoder(nn.Layer):
+ """
+ A much simplified decoder compared to the original one with the Prenet.
+ """
+
+ def __init__(
+ self,
+ emb_dim: int=256,
+ odim: int=80,
+ kernel_size: int=5,
+ dropout_rate: float=0.2,
+ resblock_kernel_sizes: List[int]=[256, 256], ):
+
+ super().__init__()
+
+ input_shape = emb_dim
+ out_sizes = resblock_kernel_sizes
+ out_sizes.append(out_sizes[-1])
+
+ in_sizes = [input_shape] + out_sizes[:-1]
+ self.residual_blocks = nn.LayerList([
+ Conv1dResidualBlock(
+ idim=in_channels,
+ odim=out_channels,
+ kernel_size=kernel_size,
+ dropout_rate=dropout_rate, )
+ for in_channels, out_channels in zip(in_sizes, out_sizes)
+ ])
+ self.conv1d = nn.Conv1D(
+ in_channels=out_sizes[-1], out_channels=odim, kernel_size=1)
+
+ def forward(self, xs, masks=None):
+ """Encode input sequence.
+ Args:
+ xs (Tensor): Input tensor (#batch, time, idim).
+ masks (Tensor): Mask tensor (#batch, 1, time).
+ Returns:
+ Tensor: Output tensor (#batch, time, odim).
+ """
+ # exchange the temporal dimension and the feature dimension
+ xs = xs.transpose([0, 2, 1])
+ if masks is not None:
+ xs = xs * masks
+
+ for layer in self.residual_blocks:
+ outputs = layer(xs)
+ if masks is not None:
+ # input_mask B * 1 * T
+ outputs = outputs * masks
+ xs = outputs
+ outputs = self.conv1d(outputs)
+ if masks is not None:
+ outputs = outputs * masks
+ outputs = outputs.transpose([0, 2, 1])
+ return outputs, masks
+
+
+class CNNPostnet(nn.Layer):
+ def __init__(
+ self,
+ odim: int=80,
+ kernel_size: int=5,
+ dropout_rate: float=0.2,
+ resblock_kernel_sizes: List[int]=[256, 256], ):
+ super().__init__()
+ out_sizes = resblock_kernel_sizes
+ in_sizes = [odim] + out_sizes[:-1]
+ self.residual_blocks = nn.LayerList([
+ Conv1dResidualBlock(
+ idim=in_channels,
+ odim=out_channels,
+ kernel_size=kernel_size,
+ dropout_rate=dropout_rate)
+ for in_channels, out_channels in zip(in_sizes, out_sizes)
+ ])
+ self.conv1d = nn.Conv1D(
+ in_channels=out_sizes[-1], out_channels=odim, kernel_size=1)
+
+ def forward(self, xs, masks=None):
+ """Encode input sequence.
+ Args:
+ xs (Tensor): Input tensor (#batch, odim, time).
+ masks (Tensor): Mask tensor (#batch, 1, time).
+ Returns:
+ Tensor: Output tensor (#batch, odim, time).
+ """
+ for layer in self.residual_blocks:
+ outputs = layer(xs)
+ if masks is not None:
+ # input_mask B * 1 * T
+ outputs = outputs * masks
+ xs = outputs
+ outputs = self.conv1d(outputs)
+ if masks is not None:
+ outputs = outputs * masks
+ return outputs
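+
+# A minimal shape sketch for the two classes above (illustrative only, not part
+# of this PR; note that `resblock_kernel_sizes` actually holds the
+# residual-block channel sizes):
+#   decoder = CNNDecoder(emb_dim=256, odim=80)
+#   xs = paddle.randn([2, 100, 256])              # (#batch, time, emb_dim)
+#   mels, _ = decoder(xs)                         # -> (2, 100, 80)
+#   postnet = CNNPostnet(odim=80)
+#   refined = postnet(mels.transpose([0, 2, 1]))  # -> (2, 80, 100)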
diff --git a/paddlespeech/vector/cluster/__init__.py b/paddlespeech/vector/cluster/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47
--- /dev/null
+++ b/paddlespeech/vector/cluster/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/vector/cluster/diarization.py b/paddlespeech/vector/cluster/diarization.py
index 6432acb816976ee671a701abbd0ad5c18d3191c4..597aa48070eb6bf8845ccb323e7353a87799029b 100644
--- a/paddlespeech/vector/cluster/diarization.py
+++ b/paddlespeech/vector/cluster/diarization.py
@@ -16,22 +16,20 @@ This script contains basic functions used for speaker diarization.
This script has an optional dependency on open source sklearn library.
A few sklearn functions are modified in this script as per requirement.
"""
-
import argparse
import warnings
-import scipy
+
import numpy as np
+import scipy
+import sklearn
from distutils.util import strtobool
-
from scipy import sparse
-from scipy.sparse.linalg import eigsh
from scipy.sparse.csgraph import connected_components
from scipy.sparse.csgraph import laplacian as csgraph_laplacian
-
-import sklearn
-from sklearn.neighbors import kneighbors_graph
+from scipy.sparse.linalg import eigsh
from sklearn.cluster import SpectralClustering
from sklearn.cluster._kmeans import k_means
+from sklearn.neighbors import kneighbors_graph
def _graph_connected_component(graph, node_id):
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
new file mode 100644
index 0000000000000000000000000000000000000000..686de9363e82f121e59348441c09bb150984d218
--- /dev/null
+++ b/paddlespeech/vector/exps/ecapa_tdnn/extract_emb.py
@@ -0,0 +1,119 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import time
+
+import paddle
+from yacs.config import CfgNode
+
+from paddleaudio.backends import load as load_audio
+from paddleaudio.compliance.librosa import melspectrogram
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.seeding import seed_everything
+
+logger = Log(__name__).getlog()
+
+
+def extract_audio_embedding(args, config):
+ # stage 0: set the training device, cpu or gpu
+ paddle.set_device(args.device)
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ # stage 1: build the dnn backbone model network
+ ecapa_tdnn = EcapaTdnn(**config.model)
+
+ # stage 2: build the speaker verification model with the backbone
+ model = SpeakerIdetification(
+ backbone=ecapa_tdnn, num_class=config.num_speakers)
+ # stage 3: load the pre-trained model
+ args.load_checkpoint = os.path.abspath(
+ os.path.expanduser(args.load_checkpoint))
+
+ # load model checkpoint to sid model
+ state_dict = paddle.load(
+ os.path.join(args.load_checkpoint, 'model.pdparams'))
+ model.set_state_dict(state_dict)
+ logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
+
+ # stage 4: we must set the model to eval mode
+ model.eval()
+
+ # stage 5: read the audio data and extract the embedding
+ # the waveform is a one-dimensional numpy array
+ waveform, sr = load_audio(args.audio_path)
+
+ # the feat is a numpy array whose shape is [dim, time]
+ # we need to convert it to the one-batch shape [batch, dim, time], where batch is one
+ # so the final shape is [1, dim, time]
+ start_time = time.time()
+ feat = melspectrogram(
+ x=waveform,
+ sr=config.sr,
+ n_mels=config.n_mels,
+ window_size=config.window_size,
+ hop_length=config.hop_size)
+ feat = paddle.to_tensor(feat).unsqueeze(0)
+
+ # at inference time, the lengths are all ones since there is no padding
+ lengths = paddle.ones([1])
+ feat = feature_normalize(feat, mean_norm=True, std_norm=False)
+
+ # model backbone network forward the feats and get the embedding
+ embedding = model.backbone(
+ feat, lengths).squeeze().numpy() # (1, emb_size, 1) -> (emb_size)
+ elapsed_time = time.time() - start_time
+ audio_length = waveform.shape[0] / sr
+
+ # stage 6: compute the real-time factor (RTF); no global norm with external mean and std is applied here
+ rtf = elapsed_time / audio_length
+ logger.info(f"{args.device} rft={rtf}")
+
+ return embedding
+
+
+if __name__ == "__main__":
+ # yapf: disable
+ parser = argparse.ArgumentParser(__doc__)
+ parser.add_argument('--device',
+ choices=['cpu', 'gpu'],
+ default="cpu",
+ help="Select which device to train model, defaults to gpu.")
+ parser.add_argument("--config",
+ default=None,
+ type=str,
+ help="configuration file")
+ parser.add_argument("--load-checkpoint",
+ type=str,
+ default='',
+ help="Directory to load model checkpoint to contiune trainning.")
+ parser.add_argument("--audio-path",
+ default="./data/demo.wav",
+ type=str,
+ help="Single audio file path")
+ args = parser.parse_args()
+ # yapf: enable
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+
+ config.freeze()
+ print(config)
+
+ extract_audio_embedding(args, config)
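+
+    # Illustrative invocation (config name and checkpoint path are placeholders,
+    # not files shipped in this PR):
+    #   python extract_emb.py --device gpu \
+    #       --config conf/ecapa_tdnn.yaml \
+    #       --load-checkpoint ./checkpoint/epoch_10 \
+    #       --audio-path ./data/demo.wav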
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/test.py b/paddlespeech/vector/exps/ecapa_tdnn/test.py
new file mode 100644
index 0000000000000000000000000000000000000000..d0de6dc51a8829c8bc11cb2ae943a3f164f1fd1d
--- /dev/null
+++ b/paddlespeech/vector/exps/ecapa_tdnn/test.py
@@ -0,0 +1,203 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+
+import numpy as np
+import paddle
+from paddle.io import BatchSampler
+from paddle.io import DataLoader
+from tqdm import tqdm
+from yacs.config import CfgNode
+
+from paddleaudio.datasets import VoxCeleb
+from paddleaudio.metric import compute_eer
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.batch import batch_feature_normalize
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.seeding import seed_everything
+
+logger = Log(__name__).getlog()
+
+
+def main(args, config):
+ # stage0: set the training device, cpu or gpu
+ paddle.set_device(args.device)
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ # stage1: build the dnn backbone model network
+ ecapa_tdnn = EcapaTdnn(**config.model)
+
+ # stage2: build the speaker verification eval instance with backbone model
+ model = SpeakerIdetification(
+ backbone=ecapa_tdnn, num_class=config.num_speakers)
+
+ # stage3: load the pre-trained model
+ # we get the last model from the epoch and save_interval
+ args.load_checkpoint = os.path.abspath(
+ os.path.expanduser(args.load_checkpoint))
+
+ # load model checkpoint to sid model
+ state_dict = paddle.load(
+ os.path.join(args.load_checkpoint, 'model.pdparams'))
+ model.set_state_dict(state_dict)
+ logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
+
+ # stage4: construct the enroll and test dataloader
+
+ enroll_dataset = VoxCeleb(
+ subset='enroll',
+ target_dir=args.data_dir,
+ feat_type='melspectrogram',
+ random_chunk=False,
+ n_mels=config.n_mels,
+ window_size=config.window_size,
+ hop_length=config.hop_size)
+ enroll_sampler = BatchSampler(
+ enroll_dataset, batch_size=config.batch_size,
+ shuffle=True) # Shuffle to make embedding normalization more robust.
+ enrol_loader = DataLoader(enroll_dataset,
+ batch_sampler=enroll_sampler,
+ collate_fn=lambda x: batch_feature_normalize(
+ x, mean_norm=True, std_norm=False),
+ num_workers=config.num_workers,
+ return_list=True,)
+ test_dataset = VoxCeleb(
+ subset='test',
+ target_dir=args.data_dir,
+ feat_type='melspectrogram',
+ random_chunk=False,
+ n_mels=config.n_mels,
+ window_size=config.window_size,
+ hop_length=config.hop_size)
+
+ test_sampler = BatchSampler(
+ test_dataset, batch_size=config.batch_size, shuffle=True)
+ test_loader = DataLoader(test_dataset,
+ batch_sampler=test_sampler,
+ collate_fn=lambda x: batch_feature_normalize(
+ x, mean_norm=True, std_norm=False),
+ num_workers=config.num_workers,
+ return_list=True,)
+ # stage5: we must set the model to eval mode
+ model.eval()
+
+ # stage6: global embedding norm to improve the performance
+ logger.info(f"global embedding norm: {config.global_embedding_norm}")
+ if config.global_embedding_norm:
+ global_embedding_mean = None
+ global_embedding_std = None
+ mean_norm_flag = config.embedding_mean_norm
+ std_norm_flag = config.embedding_std_norm
+ batch_count = 0
+
+ # stage7: Compute embeddings of audios in the enroll and test datasets from the model.
+ id2embedding = {}
+ # Run multiple times to make embedding normalization more stable.
+ for i in range(2):
+ for dl in [enrol_loader, test_loader]:
+ logger.info(
+ f'Loop {i + 1}: Computing embeddings on {dl.dataset.subset} dataset'
+ )
+ with paddle.no_grad():
+ for batch_idx, batch in enumerate(tqdm(dl)):
+
+ # stage 7-1: extract the audio embedding
+ ids, feats, lengths = batch['ids'], batch['feats'], batch[
+ 'lengths']
+ embeddings = model.backbone(feats, lengths).squeeze(
+ -1).numpy() # (N, emb_size, 1) -> (N, emb_size)
+
+ # Global embedding normalization.
+ # using the global embedding norm can
+ # reduce the EER by about 10% relative
+ if config.global_embedding_norm:
+ batch_count += 1
+ current_mean = embeddings.mean(
+ axis=0) if mean_norm_flag else 0
+ current_std = embeddings.std(
+ axis=0) if std_norm_flag else 1
+ # Update global mean and std.
+ if global_embedding_mean is None and global_embedding_std is None:
+ global_embedding_mean, global_embedding_std = current_mean, current_std
+ else:
+ weight = 1 / batch_count # Weight decay by batches.
+ global_embedding_mean = (
+ 1 - weight
+ ) * global_embedding_mean + weight * current_mean
+ global_embedding_std = (
+ 1 - weight
+ ) * global_embedding_std + weight * current_std
+ # Apply global embedding normalization.
+ embeddings = (embeddings - global_embedding_mean
+ ) / global_embedding_std
+
+ # Update embedding dict.
+ id2embedding.update(dict(zip(ids, embeddings)))
+
+ # stage 8: Compute cosine scores.
+ labels = []
+ enroll_ids = []
+ test_ids = []
+ logger.info(f"read the trial from {VoxCeleb.veri_test_file}")
+ with open(VoxCeleb.veri_test_file, 'r') as f:
+ for line in f.readlines():
+ label, enroll_id, test_id = line.strip().split(' ')
+ labels.append(int(label))
+ enroll_ids.append(enroll_id.split('.')[0].replace('/', '-'))
+ test_ids.append(test_id.split('.')[0].replace('/', '-'))
+
+ cos_sim_func = paddle.nn.CosineSimilarity(axis=1)
+ enrol_embeddings, test_embeddings = map(lambda ids: paddle.to_tensor(
+ np.asarray([id2embedding[uttid] for uttid in ids], dtype='float32')),
+ [enroll_ids, test_ids
+ ]) # (N, emb_size)
+ scores = cos_sim_func(enrol_embeddings, test_embeddings)
+ EER, threshold = compute_eer(np.asarray(labels), scores.numpy())
+ logger.info(
+ f'EER of verification test: {EER*100:.4f}%, score threshold: {threshold:.5f}'
+ )
+
+
+if __name__ == "__main__":
+ # yapf: disable
+ parser = argparse.ArgumentParser(__doc__)
+ parser.add_argument('--device',
+ choices=['cpu', 'gpu'],
+ default="gpu",
+ help="Select which device to train model, defaults to gpu.")
+ parser.add_argument("--config",
+ default=None,
+ type=str,
+ help="configuration file")
+ parser.add_argument("--data-dir",
+ default="./data/",
+ type=str,
+ help="data directory")
+ parser.add_argument("--load-checkpoint",
+ type=str,
+ default='',
+ help="Directory to load model checkpoint to contiune trainning.")
+ args = parser.parse_args()
+ # yapf: enable
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+
+ config.freeze()
+ print(config)
+ main(args, config)
diff --git a/paddlespeech/vector/exps/ecapa_tdnn/train.py b/paddlespeech/vector/exps/ecapa_tdnn/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..257b97abed7f8ca2edebf2d85e75f26370851116
--- /dev/null
+++ b/paddlespeech/vector/exps/ecapa_tdnn/train.py
@@ -0,0 +1,351 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import os
+import time
+
+import numpy as np
+import paddle
+from paddle.io import BatchSampler
+from paddle.io import DataLoader
+from paddle.io import DistributedBatchSampler
+from yacs.config import CfgNode
+
+from paddleaudio.compliance.librosa import melspectrogram
+from paddleaudio.datasets.voxceleb import VoxCeleb
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.augment import build_augment_pipeline
+from paddlespeech.vector.io.augment import waveform_augment
+from paddlespeech.vector.io.batch import batch_pad_right
+from paddlespeech.vector.io.batch import feature_normalize
+from paddlespeech.vector.io.batch import waveform_collate_fn
+from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+from paddlespeech.vector.modules.loss import AdditiveAngularMargin
+from paddlespeech.vector.modules.loss import LogSoftmaxWrapper
+from paddlespeech.vector.modules.sid_model import SpeakerIdetification
+from paddlespeech.vector.training.scheduler import CyclicLRScheduler
+from paddlespeech.vector.training.seeding import seed_everything
+from paddlespeech.vector.utils.time import Timer
+
+logger = Log(__name__).getlog()
+
+
+def main(args, config):
+ # stage0: set the training device, cpu or gpu
+ paddle.set_device(args.device)
+
+ # stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
+ paddle.distributed.init_parallel_env()
+ nranks = paddle.distributed.get_world_size()
+ local_rank = paddle.distributed.get_rank()
+ # set the random seed, it is a must for multiprocess training
+ seed_everything(config.seed)
+
+ # stage2: prepare the data, such as the vox1 and vox2 data, and the augmentation noise data and pipeline
+ # note: some commands must be run on rank==0, so we will refactor the data preparation code
+ train_dataset = VoxCeleb('train', target_dir=args.data_dir)
+ dev_dataset = VoxCeleb('dev', target_dir=args.data_dir)
+
+ if config.augment:
+ augment_pipeline = build_augment_pipeline(target_dir=args.data_dir)
+ else:
+ augment_pipeline = []
+
+ # stage3: build the dnn backbone model network
+ ecapa_tdnn = EcapaTdnn(**config.model)
+
+ # stage4: build the speaker verification train instance with backbone model
+ model = SpeakerIdetification(
+ backbone=ecapa_tdnn, num_class=VoxCeleb.num_speakers)
+
+ # stage5: build the optimizer, we now only construct the AdamW optimizer
+ # 140000 is single gpu steps
+ # so, in multi-gpu mode, we reduce the step_size to 140000//nranks to enable CyclicLRScheduler
+ lr_schedule = CyclicLRScheduler(
+ base_lr=config.learning_rate, max_lr=1e-3, step_size=140000 // nranks)
+ optimizer = paddle.optimizer.AdamW(
+ learning_rate=lr_schedule, parameters=model.parameters())
+
+ # stage6: build the loss function, we now only support LogSoftmaxWrapper
+ criterion = LogSoftmaxWrapper(
+ loss_fn=AdditiveAngularMargin(margin=0.2, scale=30))
+
+ # stage7: confirm training start epoch
+ # if a pre-trained model exists, the start epoch is determined by the pre-trained model
+ start_epoch = 0
+ if args.load_checkpoint:
+ logger.info("load the check point")
+ args.load_checkpoint = os.path.abspath(
+ os.path.expanduser(args.load_checkpoint))
+ try:
+ # load model checkpoint
+ state_dict = paddle.load(
+ os.path.join(args.load_checkpoint, 'model.pdparams'))
+ model.set_state_dict(state_dict)
+
+ # load optimizer checkpoint
+ state_dict = paddle.load(
+ os.path.join(args.load_checkpoint, 'model.pdopt'))
+ optimizer.set_state_dict(state_dict)
+ if local_rank == 0:
+ logger.info(f'Checkpoint loaded from {args.load_checkpoint}')
+ except FileNotFoundError:
+ if local_rank == 0:
+ logger.info('Train from scratch.')
+
+ try:
+ start_epoch = int(args.load_checkpoint[-1])
+ logger.info(f'Restore training from epoch {start_epoch}.')
+ except ValueError:
+ pass
+
+ # stage8: we build the batch sampler for paddle.DataLoader
+ train_sampler = DistributedBatchSampler(
+ train_dataset,
+ batch_size=config.batch_size,
+ shuffle=True,
+ drop_last=False)
+ train_loader = DataLoader(
+ train_dataset,
+ batch_sampler=train_sampler,
+ num_workers=config.num_workers,
+ collate_fn=waveform_collate_fn,
+ return_list=True,
+ use_buffer_reader=True, )
+
+ # stage9: start to train
+ # the training process is commented step by step below
+ steps_per_epoch = len(train_sampler)
+ timer = Timer(steps_per_epoch * config.epochs)
+ last_saved_epoch = ""
+ timer.start()
+
+ for epoch in range(start_epoch + 1, config.epochs + 1):
+ # at the beginning, the model must be set to train mode
+ model.train()
+
+ avg_loss = 0
+ num_corrects = 0
+ num_samples = 0
+ train_reader_cost = 0.0
+ train_feat_cost = 0.0
+ train_run_cost = 0.0
+
+ reader_start = time.time()
+ for batch_idx, batch in enumerate(train_loader):
+ train_reader_cost += time.time() - reader_start
+
+ # stage 9-1: the batch data consists of audio sample points and speaker id labels
+ feat_start = time.time()
+ waveforms, labels = batch['waveforms'], batch['labels']
+ waveforms, lengths = batch_pad_right(waveforms.numpy())
+ waveforms = paddle.to_tensor(waveforms)
+
+ # stage 9-2: audio augmentation, which is done on the raw audio samples
+ # the original waveform and the augmented waveforms are concatenated in a batch
+ # e.g. with five augmentation methods in the pipeline
+ # the final number of samples is batch_size * (five + one)
+ # -> five augmented waveform batches plus the one original batch
+ if len(augment_pipeline) != 0:
+ waveforms = waveform_augment(waveforms, augment_pipeline)
+ labels = paddle.concat(
+ [labels for i in range(len(augment_pipeline) + 1)])
+
+ # stage 9-3: extract the audio feats, such as fbank, mfcc, spectrogram
+ feats = []
+ for waveform in waveforms.numpy():
+ feat = melspectrogram(
+ x=waveform,
+ sr=config.sr,
+ n_mels=config.n_mels,
+ window_size=config.window_size,
+ hop_length=config.hop_size)
+ feats.append(feat)
+ feats = paddle.to_tensor(np.asarray(feats))
+
+ # stage 9-4: feature normalization, which helps convergence and improves the performance
+ feats = feature_normalize(
+ feats, mean_norm=True, std_norm=False) # Features normalization
+ train_feat_cost += time.time() - feat_start
+
+ # stage 9-5: model forward, such as ecapa-tdnn, x-vector
+ train_start = time.time()
+ logits = model(feats)
+
+ # stage 9-6: loss function criterion, such as AngularMargin, AdditiveAngularMargin
+ loss = criterion(logits, labels)
+
+ # stage 9-7: update the gradient and clear the gradient cache
+ loss.backward()
+ optimizer.step()
+ if isinstance(optimizer._learning_rate,
+ paddle.optimizer.lr.LRScheduler):
+ optimizer._learning_rate.step()
+ optimizer.clear_grad()
+ train_run_cost += time.time() - train_start
+
+ # stage 9-8: Calculate average loss per batch
+ avg_loss += loss.numpy()[0]
+
+ # stage 9-9: Calculate metrics, which is one-best accuracy
+ preds = paddle.argmax(logits, axis=1)
+ num_corrects += (preds == labels).numpy().sum()
+ num_samples += feats.shape[0]
+ timer.count() # step plus one in timer
+
+ # stage 9-10: print the log information only on rank 0, every log-freq batches
+ if (batch_idx + 1) % config.log_interval == 0 and local_rank == 0:
+ lr = optimizer.get_lr()
+ avg_loss /= config.log_interval
+ avg_acc = num_corrects / num_samples
+
+ print_msg = 'Train Epoch={}/{}, Step={}/{}'.format(
+ epoch, config.epochs, batch_idx + 1, steps_per_epoch)
+ print_msg += ' loss={:.4f}'.format(avg_loss)
+ print_msg += ' acc={:.4f}'.format(avg_acc)
+ print_msg += ' avg_reader_cost: {:.5f} sec,'.format(
+ train_reader_cost / config.log_interval)
+ print_msg += ' avg_feat_cost: {:.5f} sec,'.format(
+ train_feat_cost / config.log_interval)
+ print_msg += ' avg_train_cost: {:.5f} sec,'.format(
+ train_run_cost / config.log_interval)
+ print_msg += ' lr={:.4E} step/sec={:.2f} | ETA {}'.format(
+ lr, timer.timing, timer.eta)
+ logger.info(print_msg)
+
+ avg_loss = 0
+ num_corrects = 0
+ num_samples = 0
+ train_reader_cost = 0.0
+ train_feat_cost = 0.0
+ train_run_cost = 0.0
+
+ reader_start = time.time()
+
+ # stage 9-11: save the model parameters only on rank 0, at the end of every save-freq epochs
+ if epoch % config.save_interval == 0 and batch_idx + 1 == steps_per_epoch:
+ if local_rank != 0:
+ paddle.distributed.barrier(
+ ) # Wait for valid step in main process
+ continue # Resume training on the other processes
+
+ # stage 9-12: construct the valid dataset dataloader
+ dev_sampler = BatchSampler(
+ dev_dataset,
+ batch_size=config.batch_size,
+ shuffle=False,
+ drop_last=False)
+ dev_loader = DataLoader(
+ dev_dataset,
+ batch_sampler=dev_sampler,
+ collate_fn=waveform_collate_fn,
+ num_workers=config.num_workers,
+ return_list=True, )
+
+ # set the model to eval mode
+ model.eval()
+ num_corrects = 0
+ num_samples = 0
+
+ # stage 9-13: evaluate on the valid dataset batch data
+ logger.info('Evaluate on validation dataset')
+ with paddle.no_grad():
+ for batch_idx, batch in enumerate(dev_loader):
+ waveforms, labels = batch['waveforms'], batch['labels']
+
+ feats = []
+ for waveform in waveforms.numpy():
+ feat = melspectrogram(
+ x=waveform,
+ sr=config.sr,
+ n_mels=config.n_mels,
+ window_size=config.window_size,
+ hop_length=config.hop_size)
+ feats.append(feat)
+
+ feats = paddle.to_tensor(np.asarray(feats))
+ feats = feature_normalize(
+ feats, mean_norm=True, std_norm=False)
+ logits = model(feats)
+
+ preds = paddle.argmax(logits, axis=1)
+ num_corrects += (preds == labels).numpy().sum()
+ num_samples += feats.shape[0]
+
+ print_msg = '[Evaluation result]'
+ print_msg += ' dev_acc={:.4f}'.format(num_corrects / num_samples)
+ logger.info(print_msg)
+
+ # stage 9-14: Save model parameters
+ save_dir = os.path.join(args.checkpoint_dir,
+ 'epoch_{}'.format(epoch))
+ last_saved_epoch = os.path.join('epoch_{}'.format(epoch),
+ "model.pdparams")
+ logger.info('Saving model checkpoint to {}'.format(save_dir))
+ paddle.save(model.state_dict(),
+ os.path.join(save_dir, 'model.pdparams'))
+ paddle.save(optimizer.state_dict(),
+ os.path.join(save_dir, 'model.pdopt'))
+
+ if nranks > 1:
+ paddle.distributed.barrier() # Main process
+
+ # stage 10: create the final trained model.pdparams with soft link
+ if local_rank == 0:
+ final_model = os.path.join(args.checkpoint_dir, "model.pdparams")
+ logger.info(f"we will create the final model: {final_model}")
+ if os.path.islink(final_model):
+ logger.info(
+ f"An {final_model} already exists, we will rm is and create it again"
+ )
+ os.unlink(final_model)
+ os.symlink(last_saved_epoch, final_model)
+
+
+if __name__ == "__main__":
+ # yapf: disable
+ parser = argparse.ArgumentParser(__doc__)
+ parser.add_argument('--device',
+ choices=['cpu', 'gpu'],
+ default="cpu",
+ help="Select which device to train model, defaults to gpu.")
+ parser.add_argument("--config",
+ default=None,
+ type=str,
+ help="configuration file")
+ parser.add_argument("--data-dir",
+ default="./data/",
+ type=str,
+ help="data directory")
+ parser.add_argument("--load-checkpoint",
+ type=str,
+ default=None,
+ help="Directory to load model checkpoint to contiune trainning.")
+ parser.add_argument("--checkpoint-dir",
+ type=str,
+ default='./checkpoint',
+ help="Directory to save model checkpoints.")
+
+ args = parser.parse_args()
+ # yapf: enable
+
+ # https://yaml.org/type/float.html
+ config = CfgNode(new_allowed=True)
+ if args.config:
+ config.merge_from_file(args.config)
+
+ config.freeze()
+ print(config)
+
+ main(args, config)
diff --git a/paddlespeech/vector/io/__init__.py b/paddlespeech/vector/io/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47
--- /dev/null
+++ b/paddlespeech/vector/io/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/vector/io/augment.py b/paddlespeech/vector/io/augment.py
new file mode 100644
index 0000000000000000000000000000000000000000..3baace13977d0ba4ce324597b1d821850a79119b
--- /dev/null
+++ b/paddlespeech/vector/io/augment.py
@@ -0,0 +1,906 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# this is modified from SpeechBrain
+# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/lobes/augment.py
+import math
+from typing import List
+
+import numpy as np
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+from paddleaudio.datasets.rirs_noises import OpenRIRNoise
+from paddlespeech.s2t.utils.log import Log
+from paddlespeech.vector.io.signal_processing import compute_amplitude
+from paddlespeech.vector.io.signal_processing import convolve1d
+from paddlespeech.vector.io.signal_processing import dB_to_amplitude
+from paddlespeech.vector.io.signal_processing import notch_filter
+from paddlespeech.vector.io.signal_processing import reverberate
+
+logger = Log(__name__).getlog()
+
+
+# TODO: Complete type-hint and doc string.
+class DropFreq(nn.Layer):
+ def __init__(
+ self,
+ drop_freq_low=1e-14,
+ drop_freq_high=1,
+ drop_count_low=1,
+ drop_count_high=2,
+ drop_width=0.05,
+ drop_prob=1, ):
+ super(DropFreq, self).__init__()
+ self.drop_freq_low = drop_freq_low
+ self.drop_freq_high = drop_freq_high
+ self.drop_count_low = drop_count_low
+ self.drop_count_high = drop_count_high
+ self.drop_width = drop_width
+ self.drop_prob = drop_prob
+
+ def forward(self, waveforms):
+ # Don't drop (return early) 1-`drop_prob` portion of the batches
+ dropped_waveform = waveforms.clone()
+ if paddle.rand([1]) > self.drop_prob:
+ return dropped_waveform
+
+ # Add channels dimension
+ if len(waveforms.shape) == 2:
+ dropped_waveform = dropped_waveform.unsqueeze(-1)
+
+ # Pick number of frequencies to drop
+ drop_count = paddle.randint(
+ low=self.drop_count_low, high=self.drop_count_high + 1, shape=[1])
+
+ # Pick a frequency to drop
+ drop_range = self.drop_freq_high - self.drop_freq_low
+ drop_frequency = (
+ paddle.rand([drop_count]) * drop_range + self.drop_freq_low)
+
+ # Filter parameters
+ filter_length = 101
+ pad = filter_length // 2
+
+ # Start with delta function
+ drop_filter = paddle.zeros([1, filter_length, 1])
+ drop_filter[0, pad, 0] = 1
+
+ # Subtract each frequency
+ for frequency in drop_frequency:
+ notch_kernel = notch_filter(frequency, filter_length,
+ self.drop_width)
+ drop_filter = convolve1d(drop_filter, notch_kernel, pad)
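+ # The delta filter initialized above passes the signal through unchanged;
+ # folding one notch kernel per sampled frequency into it yields a single
+ # combined filter, so the waveform only needs to be convolved once below.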
+
+ # Apply filter
+ dropped_waveform = convolve1d(dropped_waveform, drop_filter, pad)
+
+ # Remove channels dimension if added
+ return dropped_waveform.squeeze(-1)
+
+
+class DropChunk(nn.Layer):
+ def __init__(
+ self,
+ drop_length_low=100,
+ drop_length_high=1000,
+ drop_count_low=1,
+ drop_count_high=10,
+ drop_start=0,
+ drop_end=None,
+ drop_prob=1,
+ noise_factor=0.0, ):
+ super(DropChunk, self).__init__()
+ self.drop_length_low = drop_length_low
+ self.drop_length_high = drop_length_high
+ self.drop_count_low = drop_count_low
+ self.drop_count_high = drop_count_high
+ self.drop_start = drop_start
+ self.drop_end = drop_end
+ self.drop_prob = drop_prob
+ self.noise_factor = noise_factor
+
+ # Validate low < high
+ if drop_length_low > drop_length_high:
+ raise ValueError("Low limit must not be more than high limit")
+ if drop_count_low > drop_count_high:
+ raise ValueError("Low limit must not be more than high limit")
+
+ # Make sure the length doesn't exceed end - start
+ if drop_end is not None and drop_end >= 0:
+ if drop_start > drop_end:
+ raise ValueError("Low limit must not be more than high limit")
+
+ drop_range = drop_end - drop_start
+ self.drop_length_low = min(drop_length_low, drop_range)
+ self.drop_length_high = min(drop_length_high, drop_range)
+
+ def forward(self, waveforms, lengths):
+ # Reading input list
+ lengths = (lengths * waveforms.shape[1]).astype('int64')
+ batch_size = waveforms.shape[0]
+ dropped_waveform = waveforms.clone()
+
+ # Don't drop (return early) 1-`drop_prob` portion of the batches
+ if paddle.rand([1]) > self.drop_prob:
+ return dropped_waveform
+
+ # Store original amplitude for computing white noise amplitude
+ clean_amplitude = compute_amplitude(waveforms, lengths.unsqueeze(1))
+
+ # Pick a number of times to drop
+ drop_times = paddle.randint(
+ low=self.drop_count_low,
+ high=self.drop_count_high + 1,
+ shape=[batch_size], )
+
+ # Iterate batch to set mask
+ for i in range(batch_size):
+ if drop_times[i] == 0:
+ continue
+
+ # Pick lengths
+ length = paddle.randint(
+ low=self.drop_length_low,
+ high=self.drop_length_high + 1,
+ shape=[drop_times[i]], )
+
+ # Compute range of starting locations
+ start_min = self.drop_start
+ if start_min < 0:
+ start_min += lengths[i]
+ start_max = self.drop_end
+ if start_max is None:
+ start_max = lengths[i]
+ if start_max < 0:
+ start_max += lengths[i]
+ start_max = max(0, start_max - length.max())
+
+ # Pick starting locations
+ start = paddle.randint(
+ low=start_min,
+ high=start_max + 1,
+ shape=[drop_times[i]], )
+
+ end = start + length
+
+ # Update waveform
+ if not self.noise_factor:
+ for j in range(drop_times[i]):
+ if start[j] < end[j]:
+ dropped_waveform[i, start[j]:end[j]] = 0.0
+ else:
+ # Uniform distribution of -2 to +2 * avg amplitude should
+ # preserve the average for normalization
+ noise_max = 2 * clean_amplitude[i] * self.noise_factor
+ for j in range(drop_times[i]):
+ # zero-center the noise distribution
+ noise_vec = paddle.rand([length[j]], dtype='float32')
+
+ noise_vec = 2 * noise_max * noise_vec - noise_max
+ dropped_waveform[i, int(start[j]):int(end[j])] = noise_vec
+
+ return dropped_waveform
+
+
+class Resample(nn.Layer):
+ def __init__(
+ self,
+ orig_freq=16000,
+ new_freq=16000,
+ lowpass_filter_width=6, ):
+ super(Resample, self).__init__()
+ self.orig_freq = orig_freq
+ self.new_freq = new_freq
+ self.lowpass_filter_width = lowpass_filter_width
+
+ # Compute rate for striding
+ self._compute_strides()
+ assert self.orig_freq % self.conv_stride == 0
+ assert self.new_freq % self.conv_transpose_stride == 0
+
+ def _compute_strides(self):
+ # Compute new unit based on ratio of in/out frequencies
+ base_freq = math.gcd(self.orig_freq, self.new_freq)
+ input_samples_in_unit = self.orig_freq // base_freq
+ self.output_samples = self.new_freq // base_freq
+
+ # Store the appropriate stride based on the new units
+ self.conv_stride = input_samples_in_unit
+ self.conv_transpose_stride = self.output_samples
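+ # e.g. orig_freq=16000, new_freq=8000 -> base_freq=8000, so each polyphase
+ # unit consumes 2 input samples (conv_stride=2) and emits 1 output sample
+ # (conv_transpose_stride=1)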
+
+ def forward(self, waveforms):
+ if not hasattr(self, "first_indices"):
+ self._indices_and_weights(waveforms)
+
+ # Don't do anything if the frequencies are the same
+ if self.orig_freq == self.new_freq:
+ return waveforms
+
+ unsqueezed = False
+ if len(waveforms.shape) == 2:
+ waveforms = waveforms.unsqueeze(1)
+ unsqueezed = True
+ elif len(waveforms.shape) == 3:
+ waveforms = waveforms.transpose([0, 2, 1])
+ else:
+ raise ValueError("Input must be 2 or 3 dimensions")
+
+ # Do resampling
+ resampled_waveform = self._perform_resample(waveforms)
+
+ if unsqueezed:
+ resampled_waveform = resampled_waveform.squeeze(1)
+ else:
+ resampled_waveform = resampled_waveform.transpose([0, 2, 1])
+
+ return resampled_waveform
+
+ def _perform_resample(self, waveforms):
+ # Compute output size and initialize
+ batch_size, num_channels, wave_len = waveforms.shape
+ window_size = self.weights.shape[1]
+ tot_output_samp = self._output_samples(wave_len)
+ resampled_waveform = paddle.zeros((batch_size, num_channels,
+ tot_output_samp))
+
+ # eye size: (num_channels, num_channels, 1)
+ eye = paddle.eye(num_channels).unsqueeze(2)
+
+ # Iterate over the phases in the polyphase filter
+ for i in range(self.first_indices.shape[0]):
+ wave_to_conv = waveforms
+ first_index = int(self.first_indices[i].item())
+ if first_index >= 0:
+ # trim the signal as the filter will not be applied
+ # before the first_index
+ wave_to_conv = wave_to_conv[:, :, first_index:]
+
+ # pad the right of the signal to allow partial convolutions
+ # meaning compute values for partial windows (e.g. end of the
+ # window is outside the signal length)
+ max_index = (tot_output_samp - 1) // self.output_samples
+ end_index = max_index * self.conv_stride + window_size
+ current_wave_len = wave_len - first_index
+ right_padding = max(0, end_index + 1 - current_wave_len)
+ left_padding = max(0, -first_index)
+ wave_to_conv = paddle.nn.functional.pad(
+ wave_to_conv, [left_padding, right_padding], data_format='NCL')
+ conv_wave = paddle.nn.functional.conv1d(
+ x=wave_to_conv,
+ # weight=self.weights[i].repeat(num_channels, 1, 1),
+ weight=self.weights[i].expand((num_channels, 1, -1)),
+ stride=self.conv_stride,
+ groups=num_channels, )
+
+ # we want conv_wave[:, i] to be at
+ # output[:, i + n*conv_transpose_stride]
+ dilated_conv_wave = paddle.nn.functional.conv1d_transpose(
+ conv_wave, eye, stride=self.conv_transpose_stride)
+
+ # pad dilated_conv_wave so it reaches the output length if needed.
+ left_padding = i
+ previous_padding = left_padding + dilated_conv_wave.shape[-1]
+ right_padding = max(0, tot_output_samp - previous_padding)
+ dilated_conv_wave = paddle.nn.functional.pad(
+ dilated_conv_wave, [left_padding, right_padding],
+ data_format='NCL')
+ dilated_conv_wave = dilated_conv_wave[:, :, :tot_output_samp]
+
+ resampled_waveform += dilated_conv_wave
+
+ return resampled_waveform
+
+ def _output_samples(self, input_num_samp):
+ samp_in = int(self.orig_freq)
+ samp_out = int(self.new_freq)
+
+ tick_freq = abs(samp_in * samp_out) // math.gcd(samp_in, samp_out)
+ ticks_per_input_period = tick_freq // samp_in
+
+ # work out the number of ticks in the time interval
+ # [ 0, input_num_samp/samp_in ).
+ interval_length = input_num_samp * ticks_per_input_period
+ if interval_length <= 0:
+ return 0
+ ticks_per_output_period = tick_freq // samp_out
+
+ # Get the last output-sample in the closed interval,
+ # i.e. replacing [ ) with [ ]. Note: integer division rounds down.
+ # See http://en.wikipedia.org/wiki/Interval_(mathematics) for an
+ # explanation of the notation.
+ last_output_samp = interval_length // ticks_per_output_period
+
+ # We need the last output-sample in the open interval, so if it
+ # takes us to the end of the interval exactly, subtract one.
+ if last_output_samp * ticks_per_output_period == interval_length:
+ last_output_samp -= 1
+
+ # First output-sample index is zero, so the number of output samples
+ # is the last output-sample plus one.
+ num_output_samp = last_output_samp + 1
+
+ return num_output_samp
+
+ def _indices_and_weights(self, waveforms):
+ # Lowpass filter frequency depends on smaller of two frequencies
+ min_freq = min(self.orig_freq, self.new_freq)
+ lowpass_cutoff = 0.99 * 0.5 * min_freq
+
+ assert lowpass_cutoff * 2 <= min_freq
+ window_width = self.lowpass_filter_width / (2.0 * lowpass_cutoff)
+
+ assert lowpass_cutoff < min(self.orig_freq, self.new_freq) / 2
+ output_t = paddle.arange(start=0.0, end=self.output_samples)
+ output_t /= self.new_freq
+ min_t = output_t - window_width
+ max_t = output_t + window_width
+
+ min_input_index = paddle.ceil(min_t * self.orig_freq)
+ max_input_index = paddle.floor(max_t * self.orig_freq)
+ num_indices = max_input_index - min_input_index + 1
+
+ max_weight_width = num_indices.max()
+ j = paddle.arange(max_weight_width, dtype='float32')
+ input_index = min_input_index.unsqueeze(1) + j.unsqueeze(0)
+ delta_t = (input_index / self.orig_freq) - output_t.unsqueeze(1)
+
+ weights = paddle.zeros_like(delta_t)
+ inside_window_indices = delta_t.abs().less_than(
+ paddle.to_tensor(window_width))
+
+ # raised-cosine (Hanning) window with width `window_width`
+ weights[inside_window_indices] = 0.5 * (1 + paddle.cos(
+ 2 * math.pi * lowpass_cutoff / self.lowpass_filter_width *
+ delta_t.masked_select(inside_window_indices)))
+
+ t_eq_zero_indices = delta_t.equal(paddle.zeros_like(delta_t))
+ t_not_eq_zero_indices = delta_t.not_equal(paddle.zeros_like(delta_t))
+
+ # sinc filter function
+ weights = paddle.where(
+ t_not_eq_zero_indices,
+ weights * paddle.sin(2 * math.pi * lowpass_cutoff * delta_t) /
+ (math.pi * delta_t), weights)
+
+ # limit of the function at t = 0
+ weights = paddle.where(t_eq_zero_indices, weights * 2 * lowpass_cutoff,
+ weights)
+
+ # size (output_samples, max_weight_width)
+ weights /= self.orig_freq
+
+ self.first_indices = min_input_index
+ self.weights = weights
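+ # In summary, each weight is a Hann-windowed sinc kernel
+ #   w(dt) = 0.5 * (1 + cos(pi * dt / window_width))
+ #           * sin(2 * pi * lowpass_cutoff * dt) / (pi * dt) / orig_freq
+ # (with the limit 2 * lowpass_cutoff at dt == 0), i.e. the standard
+ # band-limited resampling kernel.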
+
+
+class SpeedPerturb(nn.Layer):
+ def __init__(
+ self,
+ orig_freq,
+ speeds=[90, 100, 110],
+ perturb_prob=1.0, ):
+ super(SpeedPerturb, self).__init__()
+ self.orig_freq = orig_freq
+ self.speeds = speeds
+ self.perturb_prob = perturb_prob
+
+ # Initialize index of perturbation
+ self.samp_index = 0
+
+ # Initialize resamplers
+ self.resamplers = []
+ for speed in self.speeds:
+ config = {
+ "orig_freq": self.orig_freq,
+ "new_freq": self.orig_freq * speed // 100,
+ }
+ self.resamplers.append(Resample(**config))
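+ # e.g. with orig_freq=16000 and speeds=[90, 100, 110] this builds
+ # resamplers to 14400, 16000 and 17600 Hz; since the output is still
+ # treated as 16 kHz audio downstream, the effective playback speed
+ # (and pitch) shifts by roughly +/-10%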
+
+ def forward(self, waveform):
+ # Don't perturb (return early) 1-`perturb_prob` portion of the batches
+ if paddle.rand([1]) > self.perturb_prob:
+ return waveform.clone()
+
+ # Perform a random perturbation
+ self.samp_index = paddle.randint(len(self.speeds), shape=[1]).item()
+ perturbed_waveform = self.resamplers[self.samp_index](waveform)
+
+ return perturbed_waveform
+
+
+class AddNoise(nn.Layer):
+ def __init__(
+ self,
+ noise_dataset=None, # None for white noise
+ num_workers=0,
+ snr_low=0,
+ snr_high=0,
+ mix_prob=1.0,
+ start_index=None,
+ normalize=False, ):
+ super(AddNoise, self).__init__()
+
+ self.num_workers = num_workers
+ self.snr_low = snr_low
+ self.snr_high = snr_high
+ self.mix_prob = mix_prob
+ self.start_index = start_index
+ self.normalize = normalize
+ self.noise_dataset = noise_dataset
+ self.noise_dataloader = None
+
+ def forward(self, waveforms, lengths=None):
+ if lengths is None:
+ lengths = paddle.ones([len(waveforms)])
+
+ # Copy clean waveform to initialize noisy waveform
+ noisy_waveform = waveforms.clone()
+ lengths = (lengths * waveforms.shape[1]).astype('int64').unsqueeze(1)
+
+ # Don't add noise (return early) 1-`mix_prob` portion of the batches
+ if paddle.rand([1]) > self.mix_prob:
+ return noisy_waveform
+
+ # Compute the average amplitude of the clean waveforms
+ clean_amplitude = compute_amplitude(waveforms, lengths)
+
+ # Pick an SNR and use it to compute the mixture amplitude factors
+ SNR = paddle.rand((len(waveforms), 1))
+ SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low
+ noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1)
+ new_noise_amplitude = noise_amplitude_factor * clean_amplitude
+
+ # Scale clean signal appropriately
+ noisy_waveform *= 1 - noise_amplitude_factor
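+ # The factors above follow from requiring a clean/noise amplitude ratio of
+ # r = dB_to_amplitude(SNR) (assumed here to be 10**(SNR/20)) while keeping
+ # clean_factor + noise_factor = 1:
+ #   noise_factor = 1 / (r + 1),  clean_factor = 1 - noise_factor = r / (r + 1)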
+
+ # Loop through clean samples and create mixture
+ if self.noise_dataset is None:
+ white_noise = paddle.normal(shape=waveforms.shape)
+ noisy_waveform += new_noise_amplitude * white_noise
+ else:
+ tensor_length = waveforms.shape[1]
+ noise_waveform, noise_length = self._load_noise(
+ lengths,
+ tensor_length, )
+
+ # Rescale and add
+ noise_amplitude = compute_amplitude(noise_waveform, noise_length)
+ noise_waveform *= new_noise_amplitude / (noise_amplitude + 1e-14)
+ noisy_waveform += noise_waveform
+
+ # Normalizing to prevent clipping
+ if self.normalize:
+ abs_max, _ = paddle.max(
+ paddle.abs(noisy_waveform), axis=1, keepdim=True)
+ noisy_waveform = noisy_waveform / abs_max.clip(min=1.0)
+
+ return noisy_waveform
+
+ def _load_noise(self, lengths, max_length):
+ """
+ Load a batch of noises
+
+ Args:
+ lengths (paddle.Tensor): Num samples of waveforms with shape (N, 1).
+ max_length (int): Width of a batch.
+ """
+ lengths = lengths.squeeze(1)
+ batch_size = len(lengths)
+
+ # Load a noise batch
+ if self.noise_dataloader is None:
+
+ def noise_collate_fn(batch):
+ def pad(x, target_length, mode='constant', **kwargs):
+ x = np.asarray(x)
+ w = target_length - x.shape[0]
+ assert w >= 0, f'Target length {target_length} is less than the original length {x.shape[0]}'
+ return np.pad(x, [0, w], mode=mode, **kwargs)
+
+ ids = [item['id'] for item in batch]
+ lengths = np.asarray([item['feat'].shape[0] for item in batch])
+ waveforms = list(
+ map(lambda x: pad(x, max(max_length, lengths.max().item())),
+ [item['feat'] for item in batch]))
+ waveforms = np.stack(waveforms)
+ return {'ids': ids, 'feats': waveforms, 'lengths': lengths}
+
+ # Create noise data loader.
+ self.noise_dataloader = paddle.io.DataLoader(
+ self.noise_dataset,
+ batch_size=batch_size,
+ shuffle=True,
+ num_workers=self.num_workers,
+ collate_fn=noise_collate_fn,
+ return_list=True, )
+ self.noise_data = iter(self.noise_dataloader)
+
+ noise_batch, noise_len = self._load_noise_batch_of_size(batch_size)
+
+ # Select a random starting location in the waveform
+ start_index = self.start_index
+ if self.start_index is None:
+ start_index = 0
+ max_chop = (noise_len - lengths).min().clip(min=1)
+ start_index = paddle.randint(high=max_chop, shape=[1])
+
+ # Truncate noise_batch to max_length
+ noise_batch = noise_batch[:, start_index:start_index + max_length]
+ noise_len = (noise_len - start_index).clip(max=max_length).unsqueeze(1)
+ return noise_batch, noise_len
+
+ def _load_noise_batch_of_size(self, batch_size):
+ """Concatenate noise batches, then chop to correct size"""
+ noise_batch, noise_lens = self._load_noise_batch()
+
+ # Expand
+ while len(noise_batch) < batch_size:
+ noise_batch = paddle.concat((noise_batch, noise_batch))
+ noise_lens = paddle.concat((noise_lens, noise_lens))
+
+ # Contract
+ if len(noise_batch) > batch_size:
+ noise_batch = noise_batch[:batch_size]
+ noise_lens = noise_lens[:batch_size]
+
+ return noise_batch, noise_lens
+
+ def _load_noise_batch(self):
+ """Load a batch of noises, restarting iteration if necessary."""
+ try:
+ batch = next(self.noise_data)
+ except StopIteration:
+ self.noise_data = iter(self.noise_dataloader)
+ batch = next(self.noise_data)
+
+ noises, lens = batch['feats'], batch['lengths']
+ return noises, lens
+
+
+class AddReverb(nn.Layer):
+ def __init__(
+ self,
+ rir_dataset,
+ reverb_prob=1.0,
+ rir_scale_factor=1.0,
+ num_workers=0, ):
+ super(AddReverb, self).__init__()
+ self.rir_dataset = rir_dataset
+ self.reverb_prob = reverb_prob
+ self.rir_scale_factor = rir_scale_factor
+
+ # Create rir data loader.
+ def rir_collate_fn(batch):
+ def pad(x, target_length, mode='constant', **kwargs):
+ x = np.asarray(x)
+ w = target_length - x.shape[0]
+ assert w >= 0, f'Target length {target_length} is less than the original length {x.shape[0]}'
+ return np.pad(x, [0, w], mode=mode, **kwargs)
+
+ ids = [item['id'] for item in batch]
+ lengths = np.asarray([item['feat'].shape[0] for item in batch])
+ waveforms = list(
+ map(lambda x: pad(x, lengths.max().item()),
+ [item['feat'] for item in batch]))
+ waveforms = np.stack(waveforms)
+ return {'ids': ids, 'feats': waveforms, 'lengths': lengths}
+
+ self.rir_dataloader = paddle.io.DataLoader(
+ self.rir_dataset,
+ collate_fn=rir_collate_fn,
+ num_workers=num_workers,
+ shuffle=True,
+ return_list=True, )
+
+ self.rir_data = iter(self.rir_dataloader)
+
+ def forward(self, waveforms, lengths=None):
+ """
+ Arguments
+ ---------
+ waveforms : tensor
+ Shape should be `[batch, time]` or `[batch, time, channels]`.
+ lengths : tensor
+ Shape should be a single dimension, `[batch]`.
+
+ Returns
+ -------
+ Tensor of shape `[batch, time]` or `[batch, time, channels]`.
+ """
+
+ if lengths is None:
+ lengths = paddle.ones([len(waveforms)])
+
+ # Don't add reverb (return early) 1-`reverb_prob` portion of the time
+ if paddle.rand([1]) > self.reverb_prob:
+ return waveforms.clone()
+
+ # Add channels dimension if necessary
+ channel_added = False
+ if len(waveforms.shape) == 2:
+ waveforms = waveforms.unsqueeze(-1)
+ channel_added = True
+
+ # Load and prepare RIR
+ rir_waveform = self._load_rir()
+
+ # Compress or dilate RIR
+ if self.rir_scale_factor != 1:
+ rir_waveform = F.interpolate(
+ rir_waveform.transpose([0, 2, 1]),
+ scale_factor=self.rir_scale_factor,
+ mode="linear",
+ align_corners=False,
+ data_format='NCW', )
+ # (N, C, L) -> (N, L, C)
+ rir_waveform = rir_waveform.transpose([0, 2, 1])
+
+ rev_waveform = reverberate(
+ waveforms,
+ rir_waveform,
+ self.rir_dataset.sample_rate,
+ rescale_amp="avg")
+
+ # Remove channels dimension if added
+ if channel_added:
+ return rev_waveform.squeeze(-1)
+
+ return rev_waveform
+
+ def _load_rir(self):
+ try:
+ batch = next(self.rir_data)
+ except StopIteration:
+ self.rir_data = iter(self.rir_dataloader)
+ batch = next(self.rir_data)
+
+ rir_waveform = batch['feats']
+
+ # Make sure RIR has correct channels
+ if len(rir_waveform.shape) == 2:
+ rir_waveform = rir_waveform.unsqueeze(-1)
+
+ return rir_waveform
+
+
+class AddBabble(nn.Layer):
+ def __init__(
+ self,
+ speaker_count=3,
+ snr_low=0,
+ snr_high=0,
+ mix_prob=1, ):
+ super(AddBabble, self).__init__()
+ self.speaker_count = speaker_count
+ self.snr_low = snr_low
+ self.snr_high = snr_high
+ self.mix_prob = mix_prob
+
+ def forward(self, waveforms, lengths=None):
+ if lengths is None:
+ lengths = paddle.ones([len(waveforms)])
+
+ babbled_waveform = waveforms.clone()
+ lengths = (lengths * waveforms.shape[1]).unsqueeze(1)
+ batch_size = len(waveforms)
+
+        # With probability 1 - mix_prob, skip mixing and return the batch unchanged
+ if paddle.rand([1]) > self.mix_prob:
+ return babbled_waveform
+
+ # Pick an SNR and use it to compute the mixture amplitude factors
+ clean_amplitude = compute_amplitude(waveforms, lengths)
+ SNR = paddle.rand((batch_size, 1))
+ SNR = SNR * (self.snr_high - self.snr_low) + self.snr_low
+ noise_amplitude_factor = 1 / (dB_to_amplitude(SNR) + 1)
+ new_noise_amplitude = noise_amplitude_factor * clean_amplitude
+
+ # Scale clean signal appropriately
+ babbled_waveform *= 1 - noise_amplitude_factor
+
+ # For each speaker in the mixture, roll and add
+ babble_waveform = waveforms.roll((1, ), axis=0)
+ babble_len = lengths.roll((1, ), axis=0)
+ for i in range(1, self.speaker_count):
+ babble_waveform += waveforms.roll((1 + i, ), axis=0)
+ babble_len = paddle.concat(
+ [babble_len, babble_len.roll((1, ), axis=0)], axis=-1).max(
+ axis=-1, keepdim=True)
+
+ # Rescale and add to mixture
+ babble_amplitude = compute_amplitude(babble_waveform, babble_len)
+ babble_waveform *= new_noise_amplitude / (babble_amplitude + 1e-14)
+ babbled_waveform += babble_waveform
+
+ return babbled_waveform
+
+
+class TimeDomainSpecAugment(nn.Layer):
+ def __init__(
+ self,
+ perturb_prob=1.0,
+ drop_freq_prob=1.0,
+ drop_chunk_prob=1.0,
+ speeds=[95, 100, 105],
+ sample_rate=16000,
+ drop_freq_count_low=0,
+ drop_freq_count_high=3,
+ drop_chunk_count_low=0,
+ drop_chunk_count_high=5,
+ drop_chunk_length_low=1000,
+ drop_chunk_length_high=2000,
+ drop_chunk_noise_factor=0, ):
+ super(TimeDomainSpecAugment, self).__init__()
+ self.speed_perturb = SpeedPerturb(
+ perturb_prob=perturb_prob,
+ orig_freq=sample_rate,
+ speeds=speeds, )
+ self.drop_freq = DropFreq(
+ drop_prob=drop_freq_prob,
+ drop_count_low=drop_freq_count_low,
+ drop_count_high=drop_freq_count_high, )
+ self.drop_chunk = DropChunk(
+ drop_prob=drop_chunk_prob,
+ drop_count_low=drop_chunk_count_low,
+ drop_count_high=drop_chunk_count_high,
+ drop_length_low=drop_chunk_length_low,
+ drop_length_high=drop_chunk_length_high,
+ noise_factor=drop_chunk_noise_factor, )
+
+ def forward(self, waveforms, lengths=None):
+ if lengths is None:
+ lengths = paddle.ones([len(waveforms)])
+
+ with paddle.no_grad():
+ # Augmentation
+ waveforms = self.speed_perturb(waveforms)
+ waveforms = self.drop_freq(waveforms)
+ waveforms = self.drop_chunk(waveforms, lengths)
+
+ return waveforms
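+
+# A hedged usage sketch (the batch shape below is an assumption, not part of this
+# module): `TimeDomainSpecAugment` takes raw waveforms of shape (batch, time) plus
+# relative lengths in [0, 1]; speed perturbation may change the time dimension.
+#
+#     augment = TimeDomainSpecAugment(sample_rate=16000, speeds=[95, 100, 105])
+#     wavs = paddle.randn([4, 16000])    # fake 1-second utterances
+#     wavs_aug = augment(wavs, paddle.ones([4]))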
+
+
+class EnvCorrupt(nn.Layer):
+ def __init__(
+ self,
+ reverb_prob=1.0,
+ babble_prob=1.0,
+ noise_prob=1.0,
+ rir_dataset=None,
+ noise_dataset=None,
+ num_workers=0,
+ babble_speaker_count=0,
+ babble_snr_low=0,
+ babble_snr_high=0,
+ noise_snr_low=0,
+ noise_snr_high=0,
+ rir_scale_factor=1.0, ):
+ super(EnvCorrupt, self).__init__()
+
+ # Initialize corrupters
+ if rir_dataset is not None and reverb_prob > 0.0:
+ self.add_reverb = AddReverb(
+ rir_dataset=rir_dataset,
+ num_workers=num_workers,
+ reverb_prob=reverb_prob,
+ rir_scale_factor=rir_scale_factor, )
+
+ if babble_speaker_count > 0 and babble_prob > 0.0:
+ self.add_babble = AddBabble(
+ speaker_count=babble_speaker_count,
+ snr_low=babble_snr_low,
+ snr_high=babble_snr_high,
+ mix_prob=babble_prob, )
+
+ if noise_dataset is not None and noise_prob > 0.0:
+ self.add_noise = AddNoise(
+ noise_dataset=noise_dataset,
+ num_workers=num_workers,
+ snr_low=noise_snr_low,
+ snr_high=noise_snr_high,
+ mix_prob=noise_prob, )
+
+ def forward(self, waveforms, lengths=None):
+ if lengths is None:
+ lengths = paddle.ones([len(waveforms)])
+
+ # Augmentation
+ with paddle.no_grad():
+ if hasattr(self, "add_reverb"):
+ try:
+ waveforms = self.add_reverb(waveforms, lengths)
+ except Exception:
+ pass
+ if hasattr(self, "add_babble"):
+ waveforms = self.add_babble(waveforms, lengths)
+ if hasattr(self, "add_noise"):
+ waveforms = self.add_noise(waveforms, lengths)
+
+ return waveforms
+
+
+def build_augment_pipeline(target_dir=None) -> List[paddle.nn.Layer]:
+    """Build the waveform augmentation pipeline.
+    Note: this pipeline cannot be used inside a paddle.io.DataLoader worker.
+
+    Returns:
+        List[paddle.nn.Layer]: the list of augmentation layers
+    """
+ logger.info("start to build the augment pipeline")
+ noise_dataset = OpenRIRNoise('noise', target_dir=target_dir)
+ rir_dataset = OpenRIRNoise('rir', target_dir=target_dir)
+
+ wavedrop = TimeDomainSpecAugment(
+ sample_rate=16000,
+ speeds=[100], )
+ speed_perturb = TimeDomainSpecAugment(
+ sample_rate=16000,
+ speeds=[95, 100, 105], )
+ add_noise = EnvCorrupt(
+ noise_dataset=noise_dataset,
+ reverb_prob=0.0,
+ noise_prob=1.0,
+ noise_snr_low=0,
+ noise_snr_high=15,
+ rir_scale_factor=1.0, )
+ add_rev = EnvCorrupt(
+ rir_dataset=rir_dataset,
+ reverb_prob=1.0,
+ noise_prob=0.0,
+ rir_scale_factor=1.0, )
+ add_rev_noise = EnvCorrupt(
+ noise_dataset=noise_dataset,
+ rir_dataset=rir_dataset,
+ reverb_prob=1.0,
+ noise_prob=1.0,
+ noise_snr_low=0,
+ noise_snr_high=15,
+ rir_scale_factor=1.0, )
+
+ return [wavedrop, speed_perturb, add_noise, add_rev, add_rev_noise]
+
+
+def waveform_augment(waveforms: paddle.Tensor,
+ augment_pipeline: List[paddle.nn.Layer]) -> paddle.Tensor:
+    """Apply every augmentation in the pipeline and return all the waveforms.
+
+    Args:
+        waveforms (paddle.Tensor): the original batch of waveforms
+        augment_pipeline (List[paddle.nn.Layer]): the augmentation layers to apply
+
+    Returns:
+        paddle.Tensor: the original waveforms concatenated with one augmented copy per pipeline stage
+ """
+ # stage 0: store the original waveforms
+ waveforms_aug_list = [waveforms]
+
+ # augment the original batch waveform
+ for aug in augment_pipeline:
+ # stage 1: augment the data
+ waveforms_aug = aug(waveforms) # (N, L)
+ if waveforms_aug.shape[1] >= waveforms.shape[1]:
+            # Truncate to the original length
+ waveforms_aug = waveforms_aug[:, :waveforms.shape[1]]
+ else:
+ # Pad
+ lengths_to_pad = waveforms.shape[1] - waveforms_aug.shape[1]
+ waveforms_aug = F.pad(
+ waveforms_aug.unsqueeze(-1), [0, lengths_to_pad],
+ data_format='NLC').squeeze(-1)
+ # stage 2: append the augmented waveform into the list
+ waveforms_aug_list.append(waveforms_aug)
+
+    # concatenate the original and augmented waveforms
+ return paddle.concat(waveforms_aug_list, axis=0)
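+
+
+# A minimal, hedged usage sketch of the two helpers above (illustrative only:
+# `target_dir` and the waveform shape are assumptions, and building the pipeline
+# loads the OpenRIR noise/RIR datasets):
+if __name__ == "__main__":
+    pipeline = build_augment_pipeline(target_dir="./data")
+    wavs = paddle.randn([4, 16000])  # fake batch of 1-second 16 kHz waveforms
+    all_wavs = waveform_augment(wavs, pipeline)
+    # one original copy plus one augmented copy per pipeline stage
+    print(all_wavs.shape)  # [4 * (1 + len(pipeline)), 16000]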
diff --git a/paddlespeech/vector/io/batch.py b/paddlespeech/vector/io/batch.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ca990cf2dd83f6a22127e15b50885e6809c21f
--- /dev/null
+++ b/paddlespeech/vector/io/batch.py
@@ -0,0 +1,166 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy
+import numpy as np
+import paddle
+
+
+def waveform_collate_fn(batch):
+ waveforms = np.stack([item['feat'] for item in batch])
+ labels = np.stack([item['label'] for item in batch])
+
+ return {'waveforms': waveforms, 'labels': labels}
+
+
+def feature_normalize(feats: paddle.Tensor,
+ mean_norm: bool=True,
+ std_norm: bool=True,
+ convert_to_numpy: bool=False):
+ # Features normalization if needed
+    # numpy.mean differs slightly from paddle.mean (by about 1e-6)
+ if convert_to_numpy:
+ feats_np = feats.numpy()
+ mean = feats_np.mean(axis=-1, keepdims=True) if mean_norm else 0
+ std = feats_np.std(axis=-1, keepdims=True) if std_norm else 1
+ feats_np = (feats_np - mean) / std
+ feats = paddle.to_tensor(feats_np, dtype=feats.dtype)
+ else:
+ mean = feats.mean(axis=-1, keepdim=True) if mean_norm else 0
+ std = feats.std(axis=-1, keepdim=True) if std_norm else 1
+ feats = (feats - mean) / std
+
+ return feats
+
+
+def pad_right_2d(x, target_length, axis=-1, mode='constant', **kwargs):
+ x = np.asarray(x)
+ assert len(
+ x.shape) == 2, f'Only 2D arrays supported, but got shape: {x.shape}'
+
+ w = target_length - x.shape[axis]
+ assert w >= 0, f'Target length {target_length} is less than origin length {x.shape[axis]}'
+
+ if axis == 0:
+ pad_width = [[0, w], [0, 0]]
+ else:
+ pad_width = [[0, 0], [0, w]]
+
+ return np.pad(x, pad_width, mode=mode, **kwargs)
+
+
+def batch_feature_normalize(batch, mean_norm: bool=True, std_norm: bool=True):
+ ids = [item['id'] for item in batch]
+ lengths = np.asarray([item['feat'].shape[1] for item in batch])
+ feats = list(
+ map(lambda x: pad_right_2d(x, lengths.max()),
+ [item['feat'] for item in batch]))
+ feats = np.stack(feats)
+
+ # Features normalization if needed
+ for i in range(len(feats)):
+ feat = feats[i][:, :lengths[i]] # Excluding pad values.
+ mean = feat.mean(axis=-1, keepdims=True) if mean_norm else 0
+ std = feat.std(axis=-1, keepdims=True) if std_norm else 1
+ feats[i][:, :lengths[i]] = (feat - mean) / std
+ assert feats[i][:, lengths[
+            i]:].sum() == 0  # Padding values should all be 0.
+
+    # Convert lengths into ratios.
+    # The longest utterance needs no padding; the remaining utterances are
+    # padded up to the max length, and each original length is converted to
+    # its ratio of the max length.
+ lengths = (lengths / lengths.max()).astype(np.float32)
+
+ return {'ids': ids, 'feats': feats, 'lengths': lengths}
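+
+# A hedged sketch of how `batch_feature_normalize` can serve as a DataLoader
+# collate_fn (the dataset object and keyword values below are assumptions):
+#
+#     from functools import partial
+#     loader = paddle.io.DataLoader(
+#         dataset,
+#         batch_size=32,
+#         collate_fn=partial(batch_feature_normalize, mean_norm=True, std_norm=False))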
+
+
+def pad_right_to(array, target_shape, mode="constant", value=0):
+ """
+ This function takes a numpy array of arbitrary shape and pads it to target
+ shape by appending values on the right.
+
+ Args:
+ array: input numpy array. Input array whose dimension we need to pad.
+        target_shape : (list, tuple). Target shape we want for the array; its length must equal array.ndim.
+ mode : str. Pad mode, please refer to numpy.pad documentation.
+ value : float. Pad value, please refer to numpy.pad documentation.
+
+ Returns:
+ array: numpy.array. Padded array.
+ valid_vals : list. List containing proportion for each dimension of original, non-padded values.
+ """
+ assert len(target_shape) == array.ndim
+ pads = [] # this contains the abs length of the padding for each dimension.
+ valid_vals = [] # this contains the relative lengths for each dimension.
+ i = 0 # iterating over target_shape ndims
+ while i < len(target_shape):
+ assert (target_shape[i] >= array.shape[i]
+ ), "Target shape must be >= original shape for every dim"
+ pads.append([0, target_shape[i] - array.shape[i]])
+ valid_vals.append(array.shape[i] / target_shape[i])
+ i += 1
+
+ array = numpy.pad(array, pads, mode=mode, constant_values=value)
+
+ return array, valid_vals
+
+
+def batch_pad_right(arrays, mode="constant", value=0):
+ """Given a list of numpy arrays it batches them together by padding to the right
+ on each dimension in order to get same length for all.
+
+ Args:
+        arrays : list. List of arrays we wish to pad together.
+ mode : str. Padding mode see numpy.pad documentation.
+ value : float. Padding value see numpy.pad documentation.
+
+ Returns:
+        batched : numpy.array. Batched, padded array.
+        valid_vals : numpy.array. For each array in the batch, the proportion of original (non-padded) values along the last dimension.
+ """
+
+ if not len(arrays):
+ raise IndexError("arrays list must not be empty")
+
+ if len(arrays) == 1:
+ # if there is only one array in the batch we simply unsqueeze it.
+ return numpy.expand_dims(arrays[0], axis=0), numpy.array([1.0])
+
+    if not all(
+            [arrays[i].ndim == arrays[0].ndim for i in range(1, len(arrays))]):
+ raise IndexError("All arrays must have same number of dimensions")
+
+ # FIXME we limit the support here: we allow padding of only the last dimension
+ # need to remove this when feat extraction is updated to handle multichannel.
+ max_shape = []
+ for dim in range(arrays[0].ndim):
+ if dim != (arrays[0].ndim - 1):
+ if not all(
+ [x.shape[dim] == arrays[0].shape[dim] for x in arrays[1:]]):
+ raise EnvironmentError(
+ "arrays should have same dimensions except for last one")
+ max_shape.append(max([x.shape[dim] for x in arrays]))
+
+ batched = []
+ valid = []
+ for t in arrays:
+ # for each array we apply pad_right_to
+ padded, valid_percent = pad_right_to(
+ t, max_shape, mode=mode, value=value)
+ batched.append(padded)
+ valid.append(valid_percent[-1])
+
+ batched = numpy.stack(batched)
+
+ return batched, numpy.array(valid)
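+
+
+# A small, hedged self-check of the padding helpers above (shapes are illustrative):
+if __name__ == "__main__":
+    a = np.random.randn(80, 120)  # e.g. (mel bins, frames)
+    b = np.random.randn(80, 200)
+    batched, valid = batch_pad_right([a, b])
+    print(batched.shape)  # (2, 80, 200): padded on the last (time) dimension
+    print(valid)          # [0.6, 1.0]: proportion of real (non-padded) frames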
diff --git a/paddlespeech/vector/io/signal_processing.py b/paddlespeech/vector/io/signal_processing.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee939bdb1042d14a653d3bbf1496c20f0336d2ce
--- /dev/null
+++ b/paddlespeech/vector/io/signal_processing.py
@@ -0,0 +1,217 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import numpy as np
+import paddle
+
+# TODO: Complete type-hint and doc string.
+
+
+def blackman_window(win_len, dtype=np.float32):
+ arcs = np.pi * np.arange(win_len) / float(win_len)
+ win = np.asarray(
+ [0.42 - 0.5 * np.cos(2 * arc) + 0.08 * np.cos(4 * arc) for arc in arcs],
+ dtype=dtype)
+ return paddle.to_tensor(win)
+
+
+def compute_amplitude(waveforms, lengths=None, amp_type="avg", scale="linear"):
+ if len(waveforms.shape) == 1:
+ waveforms = waveforms.unsqueeze(0)
+
+ assert amp_type in ["avg", "peak"]
+ assert scale in ["linear", "dB"]
+
+ if amp_type == "avg":
+ if lengths is None:
+ out = paddle.mean(paddle.abs(waveforms), axis=1, keepdim=True)
+ else:
+ wav_sum = paddle.sum(paddle.abs(waveforms), axis=1, keepdim=True)
+ out = wav_sum / lengths
+ elif amp_type == "peak":
+ out = paddle.max(paddle.abs(waveforms), axis=1, keepdim=True)
+ else:
+ raise NotImplementedError
+
+ if scale == "linear":
+ return out
+ elif scale == "dB":
+ return paddle.clip(20 * paddle.log10(out), min=-80)
+ else:
+ raise NotImplementedError
+
+
+def dB_to_amplitude(SNR):
+ return 10**(SNR / 20)
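+
+# Worked example: for SNR = 20 dB, dB_to_amplitude(20) = 10, so the mixing code in
+# AddNoise/AddBabble scales the noise by 1 / (10 + 1) ~= 0.091 and the clean signal
+# by 1 - 0.091 ~= 0.909, i.e. a 10:1 (20 dB) clean-to-noise amplitude ratio.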
+
+
+def convolve1d(
+ waveform,
+ kernel,
+ padding=0,
+ pad_type="constant",
+ stride=1,
+ groups=1, ):
+ if len(waveform.shape) != 3:
+ raise ValueError("Convolve1D expects a 3-dimensional tensor")
+
+    # Padding can be a list [left_pad, right_pad] or an int
+ if isinstance(padding, list):
+ waveform = paddle.nn.functional.pad(
+ x=waveform,
+ pad=padding,
+ mode=pad_type,
+ data_format='NLC', )
+
+ # Move time dimension last, which pad and fft and conv expect.
+ # (N, L, C) -> (N, C, L)
+ waveform = waveform.transpose([0, 2, 1])
+ kernel = kernel.transpose([0, 2, 1])
+
+ convolved = paddle.nn.functional.conv1d(
+ x=waveform,
+ weight=kernel,
+ stride=stride,
+ groups=groups,
+ padding=padding if not isinstance(padding, list) else 0, )
+
+ # Return time dimension to the second dimension.
+ return convolved.transpose([0, 2, 1])
+
+
+def notch_filter(notch_freq, filter_width=101, notch_width=0.05):
+ # Check inputs
+ assert 0 < notch_freq <= 1
+ assert filter_width % 2 != 0
+ pad = filter_width // 2
+ inputs = paddle.arange(filter_width, dtype='float32') - pad
+
+ # Avoid frequencies that are too low
+ notch_freq += notch_width
+
+ # Define sinc function, avoiding division by zero
+ def sinc(x):
+ def _sinc(x):
+ return paddle.sin(x) / x
+
+ # The zero is at the middle index
+ res = paddle.concat(
+ [_sinc(x[:pad]), paddle.ones([1]), _sinc(x[pad + 1:])])
+ return res
+
+ # Compute a low-pass filter with cutoff frequency notch_freq.
+ hlpf = sinc(3 * (notch_freq - notch_width) * inputs)
+ # import torch
+ # hlpf *= paddle.to_tensor(torch.blackman_window(filter_width).detach().numpy())
+ hlpf *= blackman_window(filter_width)
+ hlpf /= paddle.sum(hlpf)
+
+ # Compute a high-pass filter with cutoff frequency notch_freq.
+ hhpf = sinc(3 * (notch_freq + notch_width) * inputs)
+ # hhpf *= paddle.to_tensor(torch.blackman_window(filter_width).detach().numpy())
+ hhpf *= blackman_window(filter_width)
+ hhpf /= -paddle.sum(hhpf)
+ hhpf[pad] += 1
+
+ # Adding filters creates notch filter
+ return (hlpf + hhpf).reshape([1, -1, 1])
+
+
+def reverberate(waveforms,
+ rir_waveform,
+ sample_rate,
+ impulse_duration=0.3,
+ rescale_amp="avg"):
+ orig_shape = waveforms.shape
+
+ if len(waveforms.shape) > 3 or len(rir_waveform.shape) > 3:
+ raise NotImplementedError
+
+    # if inputs are 1-d or 2-d tensors, reshape them to (batch, samples, 1)
+ if len(waveforms.shape) == 1:
+ waveforms = waveforms.unsqueeze(0).unsqueeze(-1)
+ elif len(waveforms.shape) == 2:
+ waveforms = waveforms.unsqueeze(-1)
+
+ if len(rir_waveform.shape) == 1: # convolve1d expects a 3d tensor !
+ rir_waveform = rir_waveform.unsqueeze(0).unsqueeze(-1)
+ elif len(rir_waveform.shape) == 2:
+ rir_waveform = rir_waveform.unsqueeze(-1)
+
+    # Compute the average amplitude of the clean signal
+ orig_amplitude = compute_amplitude(waveforms, waveforms.shape[1],
+ rescale_amp)
+
+ # Compute index of the direct signal, so we can preserve alignment
+ impulse_index_start = rir_waveform.abs().argmax(axis=1).item()
+ impulse_index_end = min(
+ impulse_index_start + int(sample_rate * impulse_duration),
+ rir_waveform.shape[1])
+ rir_waveform = rir_waveform[:, impulse_index_start:impulse_index_end, :]
+ rir_waveform = rir_waveform / paddle.norm(rir_waveform, p=2)
+ rir_waveform = paddle.flip(rir_waveform, [1])
+
+ waveforms = convolve1d(
+ waveform=waveforms,
+ kernel=rir_waveform,
+ padding=[rir_waveform.shape[1] - 1, 0], )
+
+ # Rescale to the peak amplitude of the clean waveform
+ waveforms = rescale(waveforms, waveforms.shape[1], orig_amplitude,
+ rescale_amp)
+
+ if len(orig_shape) == 1:
+ waveforms = waveforms.squeeze(0).squeeze(-1)
+ if len(orig_shape) == 2:
+ waveforms = waveforms.squeeze(-1)
+
+ return waveforms
+
+
+def rescale(waveforms, lengths, target_lvl, amp_type="avg", scale="linear"):
+ assert amp_type in ["peak", "avg"]
+ assert scale in ["linear", "dB"]
+
+ batch_added = False
+ if len(waveforms.shape) == 1:
+ batch_added = True
+ waveforms = waveforms.unsqueeze(0)
+
+ waveforms = normalize(waveforms, lengths, amp_type)
+
+ if scale == "linear":
+ out = target_lvl * waveforms
+ elif scale == "dB":
+ out = dB_to_amplitude(target_lvl) * waveforms
+
+ else:
+ raise NotImplementedError("Invalid scale, choose between dB and linear")
+
+ if batch_added:
+ out = out.squeeze(0)
+
+ return out
+
+
+def normalize(waveforms, lengths=None, amp_type="avg", eps=1e-14):
+ assert amp_type in ["avg", "peak"]
+
+ batch_added = False
+ if len(waveforms.shape) == 1:
+ batch_added = True
+ waveforms = waveforms.unsqueeze(0)
+
+ den = compute_amplitude(waveforms, lengths, amp_type) + eps
+ if batch_added:
+ waveforms = waveforms.squeeze(0)
+ return waveforms / den
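+
+
+# A small, hedged self-check of the helpers above (values are illustrative):
+if __name__ == "__main__":
+    wav = paddle.sin(paddle.linspace(0., 100., 16000)).unsqueeze(0)  # (1, 16000)
+    print(compute_amplitude(wav, amp_type="avg"))   # mean absolute amplitude, shape (1, 1)
+    print(compute_amplitude(wav, amp_type="peak"))  # close to 1.0
+    kernel = notch_filter(0.25, filter_width=101)   # band-reject kernel, shape (1, 101, 1)
+    print(kernel.shape)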
diff --git a/paddlespeech/vector/models/ecapa_tdnn.py b/paddlespeech/vector/models/ecapa_tdnn.py
index e493b8004e2255135a070483f01ca709698fc81c..0e7287cd3614d8964941f6d14179e0ce7f3c4d71 100644
--- a/paddlespeech/vector/models/ecapa_tdnn.py
+++ b/paddlespeech/vector/models/ecapa_tdnn.py
@@ -47,6 +47,19 @@ class Conv1d(nn.Layer):
groups=1,
bias=True,
padding_mode="reflect", ):
+        """1-d convolution layer with SAME-padding support.
+
+        Args:
+            in_channels (int): input channel or input data dimensions
+            out_channels (int): output channel or output data dimensions
+            kernel_size (int): kernel size of the 1-d convolution
+            stride (int, optional): stride of the 1-d convolution. Defaults to 1.
+            padding (str, optional): padding strategy. Defaults to "same".
+            dilation (int, optional): dilation of the 1-d convolution. Defaults to 1.
+            groups (int, optional): groups of the 1-d convolution. Defaults to 1.
+            bias (bool, optional): whether to use a bias in the 1-d convolution. Defaults to True.
+            padding_mode (str, optional): padding mode. Defaults to "reflect".
+ """
super().__init__()
self.kernel_size = kernel_size
@@ -134,6 +147,15 @@ class TDNNBlock(nn.Layer):
kernel_size,
dilation,
activation=nn.ReLU, ):
+ """Implementation of TDNN network
+
+ Args:
+ in_channels (int): input channels or input embedding dimensions
+ out_channels (int): output channels or output embedding dimensions
+ kernel_size (int): the kernel size of the TDNN network block
+ dilation (int): the dilation of the TDNN network block
+ activation (paddle class, optional): the activation layers. Defaults to nn.ReLU.
+ """
super().__init__()
self.conv = Conv1d(
in_channels=in_channels,
@@ -149,6 +171,15 @@ class TDNNBlock(nn.Layer):
class Res2NetBlock(nn.Layer):
def __init__(self, in_channels, out_channels, scale=8, dilation=1):
+ """Implementation of Res2Net Block with dilation
+           The block is described in the paper "Res2Net: A New Multi-scale Backbone Architecture",
+           https://arxiv.org/abs/1904.01169
+        Args:
+            in_channels (int): input channels or input dimensions
+            out_channels (int): output channels or output dimensions
+            scale (int, optional): scale in the res2net block. Defaults to 8.
+ dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1.
+ """
super().__init__()
assert in_channels % scale == 0
assert out_channels % scale == 0
@@ -179,6 +210,14 @@ class Res2NetBlock(nn.Layer):
class SEBlock(nn.Layer):
def __init__(self, in_channels, se_channels, out_channels):
+ """Implementation of SEBlock
+           The block is described in the paper "Squeeze-and-Excitation Networks",
+           https://arxiv.org/abs/1709.01507
+        Args:
+            in_channels (int): input channels or input data dimensions
+            se_channels (int): bottleneck channels of the squeeze-excitation block
+ out_channels (int): output channels or output data dimensions
+ """
super().__init__()
self.conv1 = Conv1d(
@@ -275,6 +314,18 @@ class SERes2NetBlock(nn.Layer):
kernel_size=1,
dilation=1,
activation=nn.ReLU, ):
+        """Implementation of the Squeeze-and-Excitation Res2Net block used in the ECAPA-TDNN model.
+           The squeeze-and-excitation part is described in "Squeeze-and-Excitation Networks",
+           https://arxiv.org/pdf/1709.01507.pdf
+        Args:
+            in_channels (int): input channels or input data dimensions
+            out_channels (int): output channels or output data dimensions
+            res2net_scale (int, optional): scale in the res2net block. Defaults to 8.
+            se_channels (int, optional): bottleneck channels of the squeeze-excitation block. Defaults to 128.
+ kernel_size (int, optional): kernel size of 1-d convolution in TDNN block. Defaults to 1.
+ dilation (int, optional): dilation of 1-d convolution in TDNN block. Defaults to 1.
+ activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU.
+ """
super().__init__()
self.out_channels = out_channels
self.tdnn1 = TDNNBlock(
@@ -326,7 +377,21 @@ class EcapaTdnn(nn.Layer):
res2net_scale=8,
se_channels=128,
global_context=True, ):
-
+        """Implementation of the ECAPA-TDNN backbone network.
+           The model is described in the paper "ECAPA-TDNN: Emphasized Channel Attention, Propagation and Aggregation in TDNN Based Speaker Verification",
+           https://arxiv.org/abs/2005.07143
+        Args:
+            input_size (int): input feature dimension
+            lin_neurons (int, optional): speaker embedding size. Defaults to 192.
+            activation (paddle.nn.class, optional): activation function. Defaults to nn.ReLU.
+            channels (list, optional): intermediate embedding dimensions. Defaults to [512, 512, 512, 512, 1536].
+            kernel_sizes (list, optional): kernel sizes of the 1-d convolutions in the TDNN blocks. Defaults to [5, 3, 3, 3, 1].
+ dilations (list, optional): dilations of 1-d convolution in TDNN block. Defaults to [1, 2, 3, 4, 1].
+ attention_channels (int, optional): attention dimensions. Defaults to 128.
+ res2net_scale (int, optional): scale value in res2net. Defaults to 8.
+ se_channels (int, optional): dimensions of squeeze-excitation block. Defaults to 128.
+ global_context (bool, optional): global context flag. Defaults to True.
+ """
super().__init__()
assert len(channels) == len(kernel_sizes)
assert len(channels) == len(dilations)
diff --git a/paddlespeech/vector/modules/__init__.py b/paddlespeech/vector/modules/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47
--- /dev/null
+++ b/paddlespeech/vector/modules/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/vector/modules/loss.py b/paddlespeech/vector/modules/loss.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c80dda4fc7ccd02aa0c66f5c9a24c5dd4e97a64
--- /dev/null
+++ b/paddlespeech/vector/modules/loss.py
@@ -0,0 +1,93 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# This is modified from SpeechBrain
+# https://github.com/speechbrain/speechbrain/blob/085be635c07f16d42cd1295045bc46c407f1e15b/speechbrain/nnet/losses.py
+import math
+
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class AngularMargin(nn.Layer):
+ def __init__(self, margin=0.0, scale=1.0):
+ """An implementation of Angular Margin (AM) proposed in the following
+ paper: '''Margin Matters: Towards More Discriminative Deep Neural Network
+ Embeddings for Speaker Recognition''' (https://arxiv.org/abs/1906.07317)
+
+ Args:
+            margin (float, optional): The margin for cosine similarity. Defaults to 0.0.
+            scale (float, optional): The scale for cosine similarity. Defaults to 1.0.
+ """
+ super(AngularMargin, self).__init__()
+ self.margin = margin
+ self.scale = scale
+
+ def forward(self, outputs, targets):
+ outputs = outputs - self.margin * targets
+ return self.scale * outputs
+
+
+class AdditiveAngularMargin(AngularMargin):
+ def __init__(self, margin=0.0, scale=1.0, easy_margin=False):
+ """The Implementation of Additive Angular Margin (AAM) proposed
+ in the following paper: '''Margin Matters: Towards More Discriminative Deep Neural Network Embeddings for Speaker Recognition'''
+ (https://arxiv.org/abs/1906.07317)
+
+ Args:
+ margin (float, optional): margin factor. Defaults to 0.0.
+ scale (float, optional): scale factor. Defaults to 1.0.
+ easy_margin (bool, optional): easy_margin flag. Defaults to False.
+ """
+ super(AdditiveAngularMargin, self).__init__(margin, scale)
+ self.easy_margin = easy_margin
+
+ self.cos_m = math.cos(self.margin)
+ self.sin_m = math.sin(self.margin)
+ self.th = math.cos(math.pi - self.margin)
+ self.mm = math.sin(math.pi - self.margin) * self.margin
+
+ def forward(self, outputs, targets):
+ cosine = outputs.astype('float32')
+ sine = paddle.sqrt(1.0 - paddle.pow(cosine, 2))
+ phi = cosine * self.cos_m - sine * self.sin_m # cos(theta + m)
+ if self.easy_margin:
+ phi = paddle.where(cosine > 0, phi, cosine)
+ else:
+ phi = paddle.where(cosine > self.th, phi, cosine - self.mm)
+ outputs = (targets * phi) + ((1.0 - targets) * cosine)
+ return self.scale * outputs
+
+
+class LogSoftmaxWrapper(nn.Layer):
+ def __init__(self, loss_fn):
+        """Speaker identification loss function wrapper that applies the margin
+        transform, log-softmax and KL-divergence loss to the logits.
+        Args:
+            loss_fn (nn.Layer): the margin layer (e.g. AngularMargin) applied to the logits
+        """
+ super(LogSoftmaxWrapper, self).__init__()
+ self.loss_fn = loss_fn
+ self.criterion = paddle.nn.KLDivLoss(reduction="sum")
+
+ def forward(self, outputs, targets, length=None):
+ targets = F.one_hot(targets, outputs.shape[1])
+ try:
+ predictions = self.loss_fn(outputs, targets)
+ except TypeError:
+ predictions = self.loss_fn(outputs)
+
+ predictions = F.log_softmax(predictions, axis=1)
+ loss = self.criterion(predictions, targets) / targets.sum()
+ return loss
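+
+
+# A minimal, hedged usage sketch (batch size, embedding size and class count are
+# assumptions; the logits must be cosine similarities in [-1, 1], as produced by
+# a normalized linear classifier head):
+if __name__ == "__main__":
+    criterion = LogSoftmaxWrapper(AdditiveAngularMargin(margin=0.2, scale=30.0))
+    embeddings = F.normalize(paddle.randn([4, 192]), axis=1)
+    weight = F.normalize(paddle.randn([192, 10]), axis=0)
+    logits = paddle.matmul(embeddings, weight)   # (batch, num_class), values in [-1, 1]
+    targets = paddle.randint(0, 10, [4])         # integer speaker labels
+    print(criterion(logits, targets))            # scalar loss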
diff --git a/paddlespeech/vector/modules/sid_model.py b/paddlespeech/vector/modules/sid_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..4045f75d1286bf2efc5b9a27f9cef25d715a8690
--- /dev/null
+++ b/paddlespeech/vector/modules/sid_model.py
@@ -0,0 +1,87 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+import paddle.nn as nn
+import paddle.nn.functional as F
+
+
+class SpeakerIdetification(nn.Layer):
+ def __init__(
+ self,
+ backbone,
+ num_class,
+ lin_blocks=0,
+ lin_neurons=192,
+ dropout=0.1, ):
+        """The speaker identification model, which includes the speaker backbone network
+        and a linear classifier that maps embeddings to the number of speaker classes during training.
+
+        Args:
+            backbone (paddle.nn.Layer): the speaker identification backbone network model
+            num_class (int): the number of speaker classes in the training dataset
+            lin_blocks (int, optional): the number of linear blocks between the embedding and the final linear layer. Defaults to 0.
+ lin_neurons (int, optional): the output dimension of final linear layer. Defaults to 192.
+ dropout (float, optional): the dropout factor on the embedding. Defaults to 0.1.
+ """
+ super(SpeakerIdetification, self).__init__()
+        # speaker identification backbone network model
+        # the output of the backbone network is the target embedding
+ self.backbone = backbone
+ if dropout > 0:
+ self.dropout = nn.Dropout(dropout)
+ else:
+ self.dropout = None
+
+        # construct the speaker classifier
+ input_size = self.backbone.emb_size
+ self.blocks = nn.LayerList()
+ for i in range(lin_blocks):
+ self.blocks.extend([
+ nn.BatchNorm1D(input_size),
+ nn.Linear(in_features=input_size, out_features=lin_neurons),
+ ])
+ input_size = lin_neurons
+
+ # the final layer
+ self.weight = paddle.create_parameter(
+ shape=(input_size, num_class),
+ dtype='float32',
+ attr=paddle.ParamAttr(initializer=nn.initializer.XavierUniform()), )
+
+ def forward(self, x, lengths=None):
+        """Run the speaker identification forward pass,
+        i.e. the speaker embedding backbone followed by the classifier network.
+
+ Args:
+ x (paddle.Tensor): input audio feats,
+ shape=[batch, dimension, times]
+ lengths (paddle.Tensor, optional): input audio length.
+ shape=[batch, times]
+ Defaults to None.
+
+ Returns:
+ paddle.Tensor: return the logits of the feats
+ """
+ # x.shape: (N, C, L)
+ x = self.backbone(x, lengths).squeeze(
+ -1) # (N, emb_size, 1) -> (N, emb_size)
+ if self.dropout is not None:
+ x = self.dropout(x)
+
+ for fc in self.blocks:
+ x = fc(x)
+
+ logits = F.linear(F.normalize(x), F.normalize(self.weight, axis=0))
+
+ return logits
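+
+
+# A hedged wiring sketch (the backbone import, feature shape and class count are
+# assumptions; the backbone is expected to expose an `emb_size` attribute, as
+# EcapaTdnn does):
+if __name__ == "__main__":
+    from paddlespeech.vector.models.ecapa_tdnn import EcapaTdnn
+
+    backbone = EcapaTdnn(input_size=80)
+    model = SpeakerIdetification(backbone=backbone, num_class=1211)
+    feats = paddle.randn([4, 80, 200])   # (batch, feat dim, frames), as documented above
+    logits = model(feats, paddle.ones([4]))
+    print(logits.shape)                  # [4, 1211]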
diff --git a/paddlespeech/vector/training/__init__.py b/paddlespeech/vector/training/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47
--- /dev/null
+++ b/paddlespeech/vector/training/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/vector/training/scheduler.py b/paddlespeech/vector/training/scheduler.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dcac0576c6b0eb7e76d60afaab410a5971faafa
--- /dev/null
+++ b/paddlespeech/vector/training/scheduler.py
@@ -0,0 +1,45 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddle.optimizer.lr import LRScheduler
+
+
+class CyclicLRScheduler(LRScheduler):
+ def __init__(self,
+ base_lr: float=1e-8,
+ max_lr: float=1e-3,
+ step_size: int=10000):
+
+ super(CyclicLRScheduler, self).__init__()
+
+ self.current_step = -1
+ self.base_lr = base_lr
+ self.max_lr = max_lr
+ self.step_size = step_size
+
+ def step(self):
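+        # the base LRScheduler may call step() inside its __init__, before
+        # current_step has been assigned; skip that very first call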
+ if not hasattr(self, 'current_step'):
+ return
+
+ self.current_step += 1
+ if self.current_step >= 2 * self.step_size:
+ self.current_step %= 2 * self.step_size
+
+ self.last_lr = self.get_lr()
+
+ def get_lr(self):
+ p = self.current_step / (2 * self.step_size) # Proportion in one cycle.
+ if p < 0.5: # Increase
+ return self.base_lr + p / 0.5 * (self.max_lr - self.base_lr)
+ else: # Decrease
+ return self.max_lr - (p / 0.5 - 1) * (self.max_lr - self.base_lr)
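+
+
+# A hedged usage sketch (the model and learning-rate bounds are assumptions):
+if __name__ == "__main__":
+    import paddle
+
+    linear = paddle.nn.Linear(10, 10)
+    lr = CyclicLRScheduler(base_lr=1e-8, max_lr=1e-3, step_size=1000)
+    optimizer = paddle.optimizer.AdamW(
+        learning_rate=lr, parameters=linear.parameters())
+    # call lr.step() once per training step: the LR ramps up for `step_size`
+    # steps, decays back for another `step_size` steps, then the cycle repeats
+    for _ in range(10):
+        lr.step()
+    print(lr.get_lr())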
diff --git a/paddlespeech/vector/training/seeding.py b/paddlespeech/vector/training/seeding.py
new file mode 100644
index 0000000000000000000000000000000000000000..0778a27d61943ad63095a72a045b8ea52d8602d6
--- /dev/null
+++ b/paddlespeech/vector/training/seeding.py
@@ -0,0 +1,28 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from paddlespeech.s2t.utils.log import Log
+
+logger = Log(__name__).getlog()
+import random
+
+import numpy as np
+import paddle
+
+
+def seed_everything(seed: int):
+    """Seed paddle, random and np.random to help reproducibility."""
+ paddle.seed(seed)
+ random.seed(seed)
+ np.random.seed(seed)
+ logger.info(f"Set the seed of paddle, random, np.random to {seed}.")
diff --git a/paddlespeech/vector/utils/__init__.py b/paddlespeech/vector/utils/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..97043fd7ba6885aac81cad5a49924c23c67d4d47
--- /dev/null
+++ b/paddlespeech/vector/utils/__init__.py
@@ -0,0 +1,13 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
diff --git a/paddlespeech/vector/utils/time.py b/paddlespeech/vector/utils/time.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e85b0e120a3f0cf12d2e52aa3c397c873c3a869
--- /dev/null
+++ b/paddlespeech/vector/utils/time.py
@@ -0,0 +1,66 @@
+# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import time
+
+
+class Timer(object):
+    '''Calculate running speed and estimated time of arrival (ETA)'''
+
+ def __init__(self, total_step: int):
+ self.total_step = total_step
+ self.last_start_step = 0
+ self.current_step = 0
+ self._is_running = True
+
+ def start(self):
+ self.last_time = time.time()
+ self.start_time = time.time()
+
+ def stop(self):
+ self._is_running = False
+ self.end_time = time.time()
+
+ def count(self) -> int:
+ if not self.current_step >= self.total_step:
+ self.current_step += 1
+ return self.current_step
+
+ @property
+ def timing(self) -> float:
+ run_steps = self.current_step - self.last_start_step
+ self.last_start_step = self.current_step
+ time_used = time.time() - self.last_time
+ self.last_time = time.time()
+ return time_used / run_steps
+
+ @property
+ def is_running(self) -> bool:
+ return self._is_running
+
+ @property
+ def eta(self) -> str:
+ if not self.is_running:
+ return '00:00:00'
+        # estimate the remaining time from the average time per finished step
+        elapsed_time = time.time() - self.start_time
+        remaining_time = elapsed_time / max(self.current_step, 1) * (
+            self.total_step - self.current_step)
+        return seconds_to_hms(remaining_time)
+
+
+def seconds_to_hms(seconds: int) -> str:
+ '''Convert the number of seconds to hh:mm:ss'''
+ h = math.floor(seconds / 3600)
+ m = math.floor((seconds - h * 3600) / 60)
+ s = int(seconds - h * 3600 - m * 60)
+ hms_str = '{:0>2}:{:0>2}:{:0>2}'.format(h, m, s)
+ return hms_str
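+
+
+# A hedged usage sketch of the Timer (the step count and sleep are illustrative):
+if __name__ == '__main__':
+    timer = Timer(total_step=100)
+    timer.start()
+    for _ in range(3):
+        time.sleep(0.01)  # stands in for one training step
+        step = timer.count()
+        print(step, timer.timing, timer.eta)
+    timer.stop()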
diff --git a/speechx/README.md b/speechx/README.md
index 7d73b61c6fafc5c8f3f4c3c5551da5ae34066fe5..610b88a8fe6a3acb62e74258095677827d729267 100644
--- a/speechx/README.md
+++ b/speechx/README.md
@@ -5,7 +5,7 @@
We develop under:
* docker - registry.baidubce.com/paddlepaddle/paddle:2.1.1-gpu-cuda10.2-cudnn7
* os - Ubuntu 16.04.7 LTS
-* gcc/g++ - 8.2.0
+* gcc/g++/gfortran - 8.2.0
* cmake - 3.16.0
> We make sure all things work fine under docker, and recommend using it to develop and deploy.
@@ -24,11 +24,13 @@ nvidia-docker run --privileged --net=host --ipc=host -it --rm -v $PWD:/workspac
* More `Paddle` docker images you can see [here](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html).
-* If you want only work under cpu, please download corresponded [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and using `docker` instead `nviida-docker`.
+* If you only want to work on CPU, please download the corresponding [image](https://www.paddlepaddle.org.cn/install/quick?docurl=/documentation/docs/zh/install/docker/linux-docker.html), and use `docker` instead of `nvidia-docker`.
2. Build `speechx` and `examples`.
+> Do not source venv.
+
```
pushd /path/to/speechx
./build.sh
diff --git a/speechx/build.sh b/speechx/build.sh
index 3e9600d538c13949fcdddb118d9cdd5d945c96e6..8e36d23367346e2a68168c2954f224c845f75e15 100755
--- a/speechx/build.sh
+++ b/speechx/build.sh
@@ -2,8 +2,7 @@
# the build script had verified in the paddlepaddle docker image.
# please follow the instruction below to install PaddlePaddle image.
-# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
-
+# https://www.paddlepaddle.org.cn/documentation/docs/zh/install/docker/linux-docker.html
boost_SOURCE_DIR=$PWD/fc_patch/boost-src
if [ ! -d ${boost_SOURCE_DIR} ]; then wget -c https://boostorg.jfrog.io/artifactory/main/release/1.75.0/source/boost_1_75_0.tar.gz
tar xzfv boost_1_75_0.tar.gz
@@ -23,6 +22,6 @@ cd build
cmake .. -DBOOST_ROOT:STRING=${boost_SOURCE_DIR}
#cmake ..
-make -j1
+make -j10
cd -
diff --git a/speechx/cmake/FindGFortranLibs.cmake b/speechx/cmake/FindGFortranLibs.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..763f7883335d5582a7dbb89de07469e13ca2b47d
--- /dev/null
+++ b/speechx/cmake/FindGFortranLibs.cmake
@@ -0,0 +1,145 @@
+#.rst:
+# FindGFortranLibs
+# --------
+# https://github.com/Argonne-National-Laboratory/PIPS/blob/master/cmake/Modules/FindGFortranLibs.cmake
+# https://enccs.github.io/cmake-workshop/cxx-fortran/
+#
+# Find gcc Fortran compiler & library paths
+#
+# The module defines the following variables:
+#
+# ::
+#
+#
+# GFORTRANLIBS_FOUND - true if system has gfortran
+# LIBGFORTRAN_LIBRARIES - path to libgfortran
+# LIBQUADMATH_LIBRARIES - path to libquadmath
+#   GFORTRAN_LIBRARIES_DIR  - directory containing libgfortran, libquadmath
+# GFORTRAN_INCLUDE_DIR - directory containing gfortran/gcc headers
+# LIBGOMP_LIBRARIES - path to libgomp
+# LIBGOMP_INCLUDE_DIR - directory containing omp.h header
+# GFORTRAN_VERSION_STRING - version of gfortran found
+#
+set(CMAKE_REQUIRED_QUIET ${LIBIOMP_FIND_QUIETLY})
+
+if(NOT CMAKE_REQUIRED_QUIET)
+ message(STATUS "Looking for gfortran related libraries...")
+endif()
+
+enable_language(Fortran)
+if(CMAKE_Fortran_COMPILER_ID MATCHES "GNU")
+
+ # Basically, call "gfortran -v" to dump compiler info to the string
+ # GFORTRAN_VERBOSE_STR, which will be used to get necessary paths
+ message(STATUS "Extracting library and header information by calling 'gfortran -v'...")
+ execute_process(COMMAND "${CMAKE_Fortran_COMPILER}" "-v" ERROR_VARIABLE
+ GFORTRAN_VERBOSE_STR RESULT_VARIABLE FLAG)
+
+ # For debugging
+ message(STATUS "'gfortran -v' returned:")
+ message(STATUS "${GFORTRAN_VERBOSE_STR}")
+
+ # Detect gfortran version
+ string(REGEX MATCH "gcc version [^\t\n ]+" GFORTRAN_VER_STR "${GFORTRAN_VERBOSE_STR}")
+ string(REGEX REPLACE "gcc version ([^\t\n ]+)" "\\1" GFORTRAN_VERSION_STRING "${GFORTRAN_VER_STR}")
+ message(STATUS "Detected gfortran version ${GFORTRAN_VERSION_STRING}")
+ unset(GFORTRAN_VER_STR)
+
+ set(MATCH_REGEX "[^\t\n ]+[\t\n ]+")
+ set(REPLACE_REGEX "([^\t\n ]+)")
+
+ # Find architecture for compiler
+ string(REGEX MATCH "Target: [^\t\n ]+"
+ GFORTRAN_ARCH_STR "${GFORTRAN_VERBOSE_STR}")
+ message(STATUS "Architecture string: ${GFORTRAN_ARCH_STR}")
+ string(REGEX REPLACE "Target: ([^\t\n ]+)" "\\1"
+ GFORTRAN_ARCH "${GFORTRAN_ARCH_STR}")
+ message(STATUS "Detected gfortran architecture: ${GFORTRAN_ARCH}")
+ unset(GFORTRAN_ARCH_STR)
+
+ # Find install prefix, if it exists; if not, use default
+ string(REGEX MATCH "--prefix=[^\t\n ]+[\t\n ]+"
+ GFORTRAN_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+ if(NOT GFORTRAN_PREFIX_STR)
+ message(STATUS "Detected default gfortran prefix")
+ set(GFORTRAN_PREFIX_DIR "/usr/local") # default prefix for gcc install
+ else()
+ string(REGEX REPLACE "--prefix=([^\t\n ]+)" "\\1"
+ GFORTRAN_PREFIX_DIR "${GFORTRAN_PREFIX_STR}")
+ endif()
+ message(STATUS "Detected gfortran prefix: ${GFORTRAN_PREFIX_DIR}")
+ unset(GFORTRAN_PREFIX_STR)
+
+ # Find install exec-prefix, if it exists; if not, use default
+  string(REGEX MATCH "--exec-prefix=[^\t\n ]+[\t\n ]+"
+    GFORTRAN_EXEC_PREFIX_STR "${GFORTRAN_VERBOSE_STR}")
+ if(NOT GFORTRAN_EXEC_PREFIX_STR)
+ message(STATUS "Detected default gfortran exec-prefix")
+ set(GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_PREFIX_DIR}")
+ else()
+ string(REGEX REPLACE "--exec-prefix=([^\t\n ]+)" "\\1"
+ GFORTRAN_EXEC_PREFIX_DIR "${GFORTRAN_EXEC_PREFIX_STR}")
+ endif()
+ message(STATUS "Detected gfortran exec-prefix: ${GFORTRAN_EXEC_PREFIX_DIR}")
+ UNSET(GFORTRAN_EXEC_PREFIX_STR)
+
+ # Find library directory and include directory, if library directory specified
+ string(REGEX MATCH "--libdir=[^\t\n ]+"
+ GFORTRAN_LIB_DIR_STR "${GFORTRAN_VERBOSE_STR}")
+ if(NOT GFORTRAN_LIB_DIR_STR)
+    message(STATUS "Found --libdir flag -- no")
+ message(STATUS "Using default gfortran library & include directory paths")
+ set(GFORTRAN_LIBRARIES_DIR
+ "${GFORTRAN_EXEC_PREFIX_DIR}/lib/gcc/${GFORTRAN_ARCH}/${GFORTRAN_VERSION_STRING}")
+ string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/include")
+ else()
+ message(STATUS "Found --libdir flag -- yes")
+ string(REGEX REPLACE "--libdir=([^\t\n ]+)" "\\1"
+ GFORTRAN_LIBRARIES_DIR "${GFORTRAN_LIB_DIR_STR}")
+ string(CONCAT GFORTRAN_INCLUDE_DIR "${GFORTRAN_LIBRARIES_DIR}" "/gcc/" "${GFORTRAN_ARCH}" "/" "${GFORTRAN_VERSION_STRING}" "/include")
+ endif()
+ message(STATUS "gfortran libraries path: ${GFORTRAN_LIBRARIES_DIR}")
+ message(STATUS "gfortran include path dir: ${GFORTRAN_INCLUDE_DIR}")
+ unset(GFORTRAN_LIB_DIR_STR)
+
+ # There are lots of other build options for gcc & gfortran. For now, the
+ # options implemented above should cover a lot of common use cases.
+
+  # Clean up by deleting the output string from "gfortran -v"
+ unset(GFORTRAN_VERBOSE_STR)
+
+ # Find paths for libgfortran, libquadmath, libgomp
+ # libgomp needed for OpenMP support without Clang
+ find_library(LIBGFORTRAN_LIBRARIES NAMES gfortran libgfortran
+ HINTS ${GFORTRAN_LIBRARIES_DIR})
+ find_library(LIBQUADMATH_LIBRARIES NAMES quadmath libquadmath
+ HINTS ${GFORTRAN_LIBRARIES_DIR})
+ find_library(LIBGOMP_LIBRARIES NAMES gomp libgomp
+ HINTS ${GFORTRAN_LIBRARIES_DIR})
+
+ # Find OpenMP headers
+ find_path(LIBGOMP_INCLUDE_DIR NAMES omp.h HINTS ${GFORTRAN_INCLUDE_DIR})
+
+else()
+ message(STATUS "CMAKE_Fortran_COMPILER_ID does not match 'GNU'!")
+endif()
+
+include(FindPackageHandleStandardArgs)
+
+# Required: libgfortran, libquadmath, path for gfortran libraries
+# Optional: libgomp, path for OpenMP headers, path for gcc/gfortran headers
+find_package_handle_standard_args(GFortranLibs
+ REQUIRED_VARS LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES GFORTRAN_LIBRARIES_DIR
+ VERSION_VAR GFORTRAN_VERSION_STRING)
+
+if(GFORTRANLIBS_FOUND)
+ message(STATUS "Looking for gfortran libraries -- found")
+ message(STATUS "gfortran version: ${GFORTRAN_VERSION_STRING}")
+else()
+ message(STATUS "Looking for gfortran libraries -- not found")
+endif()
+
+mark_as_advanced(LIBGFORTRAN_LIBRARIES LIBQUADMATH_LIBRARIES
+ LIBGOMP_LIBRARIES LIBGOMP_INCLUDE_DIR
+ GFORTRAN_LIBRARIES_DIR GFORTRAN_INCLUDE_DIR)
+# FindGFortranLIBS.cmake ends here
\ No newline at end of file
diff --git a/speechx/cmake/external/openblas.cmake b/speechx/cmake/external/openblas.cmake
index 3c202f7f689339b30f215dc3af2db68a279a6541..5c196527eadb2f007534755bc476375036dd40cf 100644
--- a/speechx/cmake/external/openblas.cmake
+++ b/speechx/cmake/external/openblas.cmake
@@ -7,6 +7,27 @@ set(OpenBLAS_PREFIX ${fc_patch}/OpenBLAS-prefix)
# OPENBLAS https://github.com/lattice/quda/blob/develop/CMakeLists.txt#L575
# ######################################################################################################################
enable_language(Fortran)
+
+include(FortranCInterface)
+
+# # Clang doesn't have a Fortran compiler in its suite (yet),
+# # so detect libraries for gfortran; we need equivalents to
+# # libgfortran and libquadmath, which are implicitly
+# # linked by flags in CMAKE_Fortran_IMPLICIT_LINK_LIBRARIES
+# include(FindGFortranLibs REQUIRED)
+# # Add directory containing libgfortran and libquadmath to
+# # linker. Should also contain libgomp, if not using
+# # Intel OpenMP runtime
+# link_directories(${GFORTRAN_LIBRARIES_DIR})
+# # gfortan dir in the docker.
+# link_directories(/usr/local/gcc-8.2/lib64)
+# # if you are working with C and Fortran
+# FortranCInterface_VERIFY()
+
+# # if you are working with C++ and Fortran
+# FortranCInterface_VERIFY(CXX)
+
+
#TODO: switch to CPM
include(GNUInstallDirs)
ExternalProject_Add(
diff --git a/speechx/cmake/external/openfst.cmake b/speechx/cmake/external/openfst.cmake
index 07abb18e81d7336b4a6fb2147a4225fb62daffaf..9acf530a19549040e7642a1e6888689bb612469f 100644
--- a/speechx/cmake/external/openfst.cmake
+++ b/speechx/cmake/external/openfst.cmake
@@ -1,19 +1,20 @@
include(FetchContent)
+set(openfst_PREFIX_DIR ${fc_patch}/openfst)
set(openfst_SOURCE_DIR ${fc_patch}/openfst-src)
set(openfst_BINARY_DIR ${fc_patch}/openfst-build)
ExternalProject_Add(openfst
URL https://github.com/mjansche/openfst/archive/refs/tags/1.7.2.zip
URL_HASH SHA256=ffc56931025579a8af3515741c0f3b0fc3a854c023421472c07ca0c6389c75e6
-# #PREFIX ${openfst_PREFIX_DIR}
-# SOURCE_DIR ${openfst_SOURCE_DIR}
-# BINARY_DIR ${openfst_BINARY_DIR}
+ PREFIX ${openfst_PREFIX_DIR}
+ SOURCE_DIR ${openfst_SOURCE_DIR}
+ BINARY_DIR ${openfst_BINARY_DIR}
CONFIGURE_COMMAND ${openfst_SOURCE_DIR}/configure --prefix=${openfst_PREFIX_DIR}
"CPPFLAGS=-I${gflags_BINARY_DIR}/include -I${glog_SOURCE_DIR}/src -I${glog_BINARY_DIR}"
"LDFLAGS=-L${gflags_BINARY_DIR} -L${glog_BINARY_DIR}"
"LIBS=-lgflags_nothreads -lglog -lpthread"
- COMMAND ${CMAKE_COMMAND} -E copy_directory ${CMAKE_CURRENT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR}
+ COMMAND ${CMAKE_COMMAND} -E copy_directory ${PROJECT_SOURCE_DIR}/patch/openfst ${openfst_SOURCE_DIR}
BUILD_COMMAND make -j 4
)
link_directories(${openfst_PREFIX_DIR}/lib)
-include_directories(${openfst_PREFIX_DIR}/include)
+include_directories(${openfst_PREFIX_DIR}/include)
\ No newline at end of file
diff --git a/speechx/examples/CMakeLists.txt b/speechx/examples/CMakeLists.txt
index ef0a72b8838b0d76c420f0219d65604b459beac4..7f1543c2562ea7991348bc06805b65b1c147a66c 100644
--- a/speechx/examples/CMakeLists.txt
+++ b/speechx/examples/CMakeLists.txt
@@ -3,3 +3,5 @@ cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
add_subdirectory(feat)
add_subdirectory(nnet)
add_subdirectory(decoder)
+
+add_subdirectory(glog)
\ No newline at end of file
diff --git a/speechx/examples/README.md b/speechx/examples/README.md
index 941c4272d9a919d6e8005c9b12cfaa650e9b5139..705ca2006c161339c2d28ca922116fa68dd307cf 100644
--- a/speechx/examples/README.md
+++ b/speechx/examples/README.md
@@ -1,8 +1,9 @@
# Examples
-* decoder - online decoder to work as offline
+* glog - glog usage
* feat - mfcc, linear
* nnet - ds2 nn
+* decoder - online decoder to work as offline
## How to run
diff --git a/speechx/examples/decoder/CMakeLists.txt b/speechx/examples/decoder/CMakeLists.txt
index 11e2ca913463aff445d6019834421b46754505ee..d446a67150702f6fc226be2a4dbba326f2a37159 100644
--- a/speechx/examples/decoder/CMakeLists.txt
+++ b/speechx/examples/decoder/CMakeLists.txt
@@ -1,5 +1,9 @@
cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+add_executable(offline_decoder_sliding_chunk_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_sliding_chunk_main.cc)
+target_include_directories(offline_decoder_sliding_chunk_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(offline_decoder_sliding_chunk_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
+
add_executable(offline_decoder_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_decoder_main.cc)
target_include_directories(offline_decoder_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(offline_decoder_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
@@ -7,3 +11,8 @@ target_link_libraries(offline_decoder_main PUBLIC nnet decoder fst utils gflags
add_executable(offline_wfst_decoder_main ${CMAKE_CURRENT_SOURCE_DIR}/offline_wfst_decoder_main.cc)
target_include_directories(offline_wfst_decoder_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
target_link_libraries(offline_wfst_decoder_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util kaldi-decoder ${DEPS})
+
+add_executable(decoder_test_main ${CMAKE_CURRENT_SOURCE_DIR}/decoder_test_main.cc)
+target_include_directories(decoder_test_main PRIVATE ${SPEECHX_ROOT} ${SPEECHX_ROOT}/kaldi)
+target_link_libraries(decoder_test_main PUBLIC nnet decoder fst utils gflags glog kaldi-base kaldi-matrix kaldi-util ${DEPS})
+
diff --git a/speechx/examples/decoder/decoder_test_main.cc b/speechx/examples/decoder/decoder_test_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0e249cc6b93ced6b80e9c7b522ad8bd3819a19e7
--- /dev/null
+++ b/speechx/examples/decoder/decoder_test_main.cc
@@ -0,0 +1,74 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// TODO: refactor, replace with gtest
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "decoder/ctc_beam_search_decoder.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+
+DEFINE_string(nnet_prob_respecifier, "", "test nnet prob rspecifier");
+DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
+DEFINE_string(lm_path, "lm.klm", "language model");
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+// test decoder by feeding nnet posterior probability
+int main(int argc, char* argv[]) {
+ gflags::ParseCommandLineFlags(&argc, &argv, false);
+ google::InitGoogleLogging(argv[0]);
+
+ kaldi::SequentialBaseFloatMatrixReader likelihood_reader(
+ FLAGS_nnet_prob_respecifier);
+ std::string dict_file = FLAGS_dict_file;
+ std::string lm_path = FLAGS_lm_path;
+ LOG(INFO) << "dict path: " << dict_file;
+ LOG(INFO) << "lm path: " << lm_path;
+
+ int32 num_done = 0, num_err = 0;
+
+ ppspeech::CTCBeamSearchOptions opts;
+ opts.dict_file = dict_file;
+ opts.lm_path = lm_path;
+ ppspeech::CTCBeamSearch decoder(opts);
+
+    std::shared_ptr<ppspeech::Decodable> decodable(
+        new ppspeech::Decodable(nullptr, nullptr));
+
+ decoder.InitDecoder();
+
+ for (; !likelihood_reader.Done(); likelihood_reader.Next()) {
+ string utt = likelihood_reader.Key();
+        const kaldi::Matrix<BaseFloat> likelihood = likelihood_reader.Value();
+ LOG(INFO) << "process utt: " << utt;
+ LOG(INFO) << "rows: " << likelihood.NumRows();
+ LOG(INFO) << "cols: " << likelihood.NumCols();
+ decodable->Acceptlikelihood(likelihood);
+ decoder.AdvanceDecode(decodable);
+ std::string result;
+ result = decoder.GetFinalBestPath();
+ KALDI_LOG << " the result of " << utt << " is " << result;
+ decodable->Reset();
+ decoder.Reset();
+ ++num_done;
+ }
+
+ KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+ << " with errors.";
+ return (num_done != 0 ? 0 : 1);
+}
diff --git a/speechx/examples/decoder/local/model.sh b/speechx/examples/decoder/local/model.sh
new file mode 100644
index 0000000000000000000000000000000000000000..5c609a6cf45715106f745dbc2755775734e35558
--- /dev/null
+++ b/speechx/examples/decoder/local/model.sh
@@ -0,0 +1,3 @@
+#!/bin/bash
+
+
diff --git a/speechx/examples/decoder/offline_decoder_main.cc b/speechx/examples/decoder/offline_decoder_main.cc
index 44127c73b4e6ce37328dfe8e8e91984495d3d27e..9a9c14a0cff9f2b239a45ad7a12c2a2f20f4d601 100644
--- a/speechx/examples/decoder/offline_decoder_main.cc
+++ b/speechx/examples/decoder/offline_decoder_main.cc
@@ -17,22 +17,24 @@
#include "base/flags.h"
#include "base/log.h"
#include "decoder/ctc_beam_search_decoder.h"
-#include "frontend/raw_audio.h"
+#include "frontend/audio/data_cache.h"
#include "kaldi/util/table-types.h"
#include "nnet/decodable.h"
#include "nnet/paddle_nnet.h"
-DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(feature_respecifier, "", "feature matrix rspecifier");
DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_int32(chunk_size, 35, "feat chunk size");
using kaldi::BaseFloat;
using kaldi::Matrix;
using std::vector;
+// test decoder by feeding speech feature, deprecated.
int main(int argc, char* argv[]) {
gflags::ParseCommandLineFlags(&argc, &argv, false);
google::InitGoogleLogging(argv[0]);
@@ -43,50 +45,68 @@ int main(int argc, char* argv[]) {
std::string model_params = FLAGS_param_path;
std::string dict_file = FLAGS_dict_file;
std::string lm_path = FLAGS_lm_path;
+ int32 chunk_size = FLAGS_chunk_size;
+ LOG(INFO) << "model path: " << model_graph;
+ LOG(INFO) << "model param: " << model_params;
+ LOG(INFO) << "dict path: " << dict_file;
+ LOG(INFO) << "lm path: " << lm_path;
+ LOG(INFO) << "chunk size (frame): " << chunk_size;
int32 num_done = 0, num_err = 0;
- ppspeech::CTCBeamSearchOptions opts;
- opts.dict_file = dict_file;
- opts.lm_path = lm_path;
- ppspeech::CTCBeamSearch decoder(opts);
-
+ // frontend + nnet is decodable
ppspeech::ModelOptions model_opts;
model_opts.model_path = model_graph;
model_opts.params_path = model_params;
std::shared_ptr<ppspeech::PaddleNnet> nnet(
new ppspeech::PaddleNnet(model_opts));
- std::shared_ptr<ppspeech::RawDataCache> raw_data(
- new ppspeech::RawDataCache());
+ std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
std::shared_ptr<ppspeech::Decodable> decodable(
new ppspeech::Decodable(nnet, raw_data));
+ LOG(INFO) << "Init decodeable.";
- int32 chunk_size = 35;
- decoder.InitDecoder();
+ // init decoder
+ ppspeech::CTCBeamSearchOptions opts;
+ opts.dict_file = dict_file;
+ opts.lm_path = lm_path;
+ ppspeech::CTCBeamSearch decoder(opts);
+ LOG(INFO) << "Init decoder.";
+ decoder.InitDecoder();
for (; !feature_reader.Done(); feature_reader.Next()) {
string utt = feature_reader.Key();
const kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+ LOG(INFO) << "utt: " << utt;
+
+ // feat dim
raw_data->SetDim(feature.NumCols());
+ LOG(INFO) << "dim: " << raw_data->Dim();
+
int32 row_idx = 0;
int32 num_chunks = feature.NumRows() / chunk_size;
+ LOG(INFO) << "n chunks: " << num_chunks;
for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+ // feat chunk
kaldi::Vector<BaseFloat> feature_chunk(chunk_size *
feature.NumCols());
for (int row_id = 0; row_id < chunk_size; ++row_id) {
- kaldi::SubVector<BaseFloat> tmp(feature, row_idx);
+ kaldi::SubVector<BaseFloat> feat_one_row(feature,
+ row_idx);
kaldi::SubVector<BaseFloat> f_chunk_tmp(
feature_chunk.Data() + row_id * feature.NumCols(),
feature.NumCols());
- f_chunk_tmp.CopyFromVec(tmp);
+ f_chunk_tmp.CopyFromVec(feat_one_row);
row_idx++;
}
+ // feed to raw cache
raw_data->Accept(feature_chunk);
if (chunk_idx == num_chunks - 1) {
raw_data->SetFinished();
}
+ // decode step
decoder.AdvanceDecode(decodable);
}
+
std::string result;
result = decoder.GetFinalBestPath();
KALDI_LOG << " the result of " << utt << " is " << result;
diff --git a/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7f6c572ca2ff6675d98474a5eb85381d1703e641
--- /dev/null
+++ b/speechx/examples/decoder/offline_decoder_sliding_chunk_main.cc
@@ -0,0 +1,141 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// todo refactor, replace with gtest
+
+#include "base/flags.h"
+#include "base/log.h"
+#include "decoder/ctc_beam_search_decoder.h"
+#include "frontend/audio/data_cache.h"
+#include "kaldi/util/table-types.h"
+#include "nnet/decodable.h"
+#include "nnet/paddle_nnet.h"
+
+DEFINE_string(feature_respecifier, "", "test feature rspecifier");
+DEFINE_string(model_path, "avg_1.jit.pdmodel", "paddle nnet model");
+DEFINE_string(param_path, "avg_1.jit.pdiparams", "paddle nnet model param");
+DEFINE_string(dict_file, "vocab.txt", "vocabulary of lm");
+DEFINE_string(lm_path, "lm.klm", "language model");
+DEFINE_int32(receptive_field_length,
+ 7,
+ "receptive field of two CNN(kernel=5) downsampling module.");
+DEFINE_int32(downsampling_rate,
+ 4,
+ "two CNN(kernel=5) module downsampling rate.");
+
+using kaldi::BaseFloat;
+using kaldi::Matrix;
+using std::vector;
+
+
+// test ds2 online decoder by feeding speech feature
+int main(int argc, char* argv[]) {
+ gflags::ParseCommandLineFlags(&argc, &argv, false);
+ google::InitGoogleLogging(argv[0]);
+
+ kaldi::SequentialBaseFloatMatrixReader feature_reader(
+ FLAGS_feature_respecifier);
+ std::string model_graph = FLAGS_model_path;
+ std::string model_params = FLAGS_param_path;
+ std::string dict_file = FLAGS_dict_file;
+ std::string lm_path = FLAGS_lm_path;
+ LOG(INFO) << "model path: " << model_graph;
+ LOG(INFO) << "model param: " << model_params;
+ LOG(INFO) << "dict path: " << dict_file;
+ LOG(INFO) << "lm path: " << lm_path;
+
+
+ int32 num_done = 0, num_err = 0;
+
+ ppspeech::CTCBeamSearchOptions opts;
+ opts.dict_file = dict_file;
+ opts.lm_path = lm_path;
+ ppspeech::CTCBeamSearch decoder(opts);
+
+ ppspeech::ModelOptions model_opts;
+ model_opts.model_path = model_graph;
+ model_opts.params_path = model_params;
+ model_opts.cache_shape = "5-1-1024,5-1-1024";
+ std::shared_ptr<ppspeech::PaddleNnet> nnet(
+ new ppspeech::PaddleNnet(model_opts));
+ std::shared_ptr<ppspeech::DataCache> raw_data(new ppspeech::DataCache());
+ std::shared_ptr<ppspeech::Decodable> decodable(
+ new ppspeech::Decodable(nnet, raw_data));
+
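+ // feed chunks of `receptive_field_length` frames (the receptive field of
+ // the two downsampling CNN layers) and slide by `downsampling_rate`
+ // frames, so consecutive chunks line up with consecutive nnet outputs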
+ int32 chunk_size = FLAGS_receptive_field_length;
+ int32 chunk_stride = FLAGS_downsampling_rate;
+ int32 receptive_field_length = FLAGS_receptive_field_length;
+ LOG(INFO) << "chunk size (frame): " << chunk_size;
+ LOG(INFO) << "chunk stride (frame): " << chunk_stride;
+ LOG(INFO) << "receptive field (frame): " << receptive_field_length;
+ decoder.InitDecoder();
+
+ for (; !feature_reader.Done(); feature_reader.Next()) {
+ string utt = feature_reader.Key();
+ kaldi::Matrix<BaseFloat> feature = feature_reader.Value();
+ raw_data->SetDim(feature.NumCols());
+ LOG(INFO) << "process utt: " << utt;
+ LOG(INFO) << "rows: " << feature.NumRows();
+ LOG(INFO) << "cols: " << feature.NumCols();
+
+ int32 row_idx = 0;
+ int32 padding_len = 0;
+ int32 ori_feature_len = feature.NumRows();
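+ // pad the feature matrix so that (rows - chunk_size) is a multiple of
+ // chunk_stride; Resize(kCopyData) zero-fills the appended rows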
+ if ((feature.NumRows() - chunk_size) % chunk_stride != 0) {
+ padding_len =
+ chunk_stride - (feature.NumRows() - chunk_size) % chunk_stride;
+ feature.Resize(feature.NumRows() + padding_len,
+ feature.NumCols(),
+ kaldi::kCopyData);
+ }
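+ // number of sliding windows over the padded feature:
+ // (rows - chunk_size) / chunk_stride + 1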
+ int32 num_chunks = (feature.NumRows() - chunk_size) / chunk_stride + 1;
+ for (int chunk_idx = 0; chunk_idx < num_chunks; ++chunk_idx) {
+ kaldi::Vector<BaseFloat> feature_chunk(chunk_size *
+ feature.NumCols());
+ int32 feature_chunk_size = 0;
+ if (ori_feature_len > chunk_idx * chunk_stride) {
+ feature_chunk_size = std::min(
+ ori_feature_len - chunk_idx * chunk_stride, chunk_size);
+ }
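+ // stop once the remaining valid (un-padded) frames no longer cover
+ // the receptive field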
+ if (feature_chunk_size < receptive_field_length) break;
+
+ int32 start = chunk_idx * chunk_stride;
+ int32 end = start + chunk_size;
+
+ for (int row_id = 0; row_id < chunk_size; ++row_id) {
+ kaldi::SubVector<BaseFloat> tmp(feature, start);
+ kaldi::SubVector<BaseFloat> f_chunk_tmp(
+ feature_chunk.Data() + row_id * feature.NumCols(),
+ feature.NumCols());
+ f_chunk_tmp.CopyFromVec(tmp);
+ ++start;
+ }
+ raw_data->Accept(feature_chunk);
+ if (chunk_idx == num_chunks - 1) {
+ raw_data->SetFinished();
+ }
+ decoder.AdvanceDecode(decodable);
+ }
+ std::string result;
+ result = decoder.GetFinalBestPath();
+ KALDI_LOG << " the result of " << utt << " is " << result;
+ decodable->Reset();
+ decoder.Reset();
+ ++num_done;
+ }
+
+ KALDI_LOG << "Done " << num_done << " utterances, " << num_err
+ << " with errors.";
+ return (num_done != 0 ? 0 : 1);
+}
diff --git a/speechx/examples/decoder/path.sh b/speechx/examples/decoder/path.sh
index 7b4b7545b38b3daf0eefb64b577f6698e6b9c9b1..a0e7c9aed7b8d34e53a8780cf7f758f29675aaea 100644
--- a/speechx/examples/decoder/path.sh
+++ b/speechx/examples/decoder/path.sh
@@ -10,5 +10,5 @@ TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
export LC_AL=C
-SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder
+SPEECHX_BIN=$SPEECHX_EXAMPLES/decoder:$SPEECHX_EXAMPLES/feat
export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/decoder/run.sh b/speechx/examples/decoder/run.sh
index fc5e91824633beaf9d87eda2a255e3bb4ddb9903..ddda89702503204c99481951558e252df6fdcf7e 100755
--- a/speechx/examples/decoder/run.sh
+++ b/speechx/examples/decoder/run.sh
@@ -25,7 +25,10 @@ model_dir=../paddle_asr_model
feat_wspecifier=./feats.ark
cmvn=./cmvn.ark
-# 3. run feat
+
+export GLOG_logtostderr=1
+
+# 3. gen linear feat
linear_spectrogram_main \
--wav_rspecifier=scp:$model_dir/wav.scp \
--feature_wspecifier=ark,t:$feat_wspecifier \
@@ -37,4 +40,4 @@ offline_decoder_main \
--model_path=$model_dir/avg_1.jit.pdmodel \
--param_path=$model_dir/avg_1.jit.pdparams \
--dict_file=$model_dir/vocab.txt \
- --lm_path=$model_dir/avg_1.jit.klm
\ No newline at end of file
+ --lm_path=$model_dir/avg_1.jit.klm
diff --git a/speechx/examples/feat/feature-mfcc-test.cc b/speechx/examples/feat/feature-mfcc-test.cc
index ae32aba9e6ab86bcb089ffe7f0aa98a5f409b6af..48a9e1c29b4ab6ce07e443c6e5da31d653e6043f 100644
--- a/speechx/examples/feat/feature-mfcc-test.cc
+++ b/speechx/examples/feat/feature-mfcc-test.cc
@@ -41,7 +41,6 @@
using namespace kaldi;
-
static void UnitTestReadWave() {
std::cout << "=== UnitTestReadWave() ===\n";
diff --git a/speechx/examples/feat/linear_spectrogram_main.cc b/speechx/examples/feat/linear_spectrogram_main.cc
index 9ed4d6f9344724f892a8d42d54eeb75b5cb79612..2d75bb5df0f1757756729e72caba0ad442090e5f 100644
--- a/speechx/examples/feat/linear_spectrogram_main.cc
+++ b/speechx/examples/feat/linear_spectrogram_main.cc
@@ -14,17 +14,19 @@
// todo refactor, repalce with gtest
-#include "frontend/linear_spectrogram.h"
#include "base/flags.h"
#include "base/log.h"
-#include "frontend/feature_cache.h"
-#include "frontend/feature_extractor_interface.h"
-#include "frontend/normalizer.h"
-#include "frontend/raw_audio.h"
#include "kaldi/feat/wave-reader.h"
#include "kaldi/util/kaldi-io.h"
#include "kaldi/util/table-types.h"
+#include "frontend/audio/audio_cache.h"
+#include "frontend/audio/data_cache.h"
+#include "frontend/audio/feature_cache.h"
+#include "frontend/audio/frontend_itf.h"
+#include "frontend/audio/linear_spectrogram.h"
+#include "frontend/audio/normalizer.h"
+
DEFINE_string(wav_rspecifier, "", "test wav scp path");
DEFINE_string(feature_wspecifier, "", "output feats wspecifier");
DEFINE_string(cmvn_write_path, "./cmvn.ark", "write cmvn");
@@ -149,7 +151,7 @@ void WriteMatrix() {
cmvn_stats(1, idx) = variance_[idx];
}
cmvn_stats(0, mean_.size()) = count_;
- kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, true);
+ kaldi::WriteKaldiObject(cmvn_stats, FLAGS_cmvn_write_path, false);
}
int main(int argc, char* argv[]) {
@@ -161,43 +163,59 @@ int main(int argc, char* argv[]) {
kaldi::BaseFloatMatrixWriter feat_writer(FLAGS_feature_wspecifier);
WriteMatrix();
- // test feature linear_spectorgram: wave --> decibel_normalizer --> hanning
- // window -->linear_spectrogram --> cmvn
+
int32 num_done = 0, num_err = 0;
- // std::unique_ptr data_source(new
- // ppspeech::RawDataCache());
- std::unique_ptr data_source(
- new ppspeech::RawAudioCache());
+
+ // feature pipeline: wave cache --> decibel_normalizer --> hanning
+ // window -->linear_spectrogram --> global cmvn -> feat cache
+
+ // std::unique_ptr<ppspeech::FrontendInterface> data_source(new
+ // ppspeech::DataCache());
+ std::unique_ptr<ppspeech::FrontendInterface> data_source(
+ new ppspeech::AudioCache());
+
+ ppspeech::DecibelNormalizerOptions db_norm_opt;
+ std::unique_ptr<ppspeech::FrontendInterface> db_norm(
+ new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
ppspeech::LinearSpectrogramOptions opt;
opt.frame_opts.frame_length_ms = 20;
opt.frame_opts.frame_shift_ms = 10;
- ppspeech::DecibelNormalizerOptions db_norm_opt;
- std::unique_ptr base_feature_extractor(
- new ppspeech::DecibelNormalizer(db_norm_opt, std::move(data_source)));
+ opt.frame_opts.dither = 0.0;
+ opt.frame_opts.remove_dc_offset = false;
+ opt.frame_opts.window_type = "hanning";
+ opt.frame_opts.preemph_coeff = 0.0;
+ LOG(INFO) << "frame length (ms): " << opt.frame_opts.frame_length_ms;
+ LOG(INFO) << "frame shift (ms): " << opt.frame_opts.frame_shift_ms;
- std::unique_ptr linear_spectrogram(
- new ppspeech::LinearSpectrogram(opt,
- std::move(base_feature_extractor)));
+ std::unique_ptr<ppspeech::FrontendInterface> linear_spectrogram(
+ new ppspeech::LinearSpectrogram(opt, std::move(db_norm)));
- std::unique_ptr cmvn(
- new ppspeech::CMVN(FLAGS_cmvn_write_path,
- std::move(linear_spectrogram)));
+ std::unique_ptr<ppspeech::FrontendInterface> cmvn(new ppspeech::CMVN(
+ FLAGS_cmvn_write_path, std::move(linear_spectrogram)));
ppspeech::FeatureCache feature_cache(kint16max, std::move(cmvn));
+ LOG(INFO) << "feat dim: " << feature_cache.Dim();
- float streaming_chunk = 0.36;
int sample_rate = 16000;
+ float streaming_chunk = 0.36;
int chunk_sample_size = streaming_chunk * sample_rate;
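+ // e.g. 0.36 s x 16000 Hz = 5760 samples per streaming chunk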
+ LOG(INFO) << "sr: " << sample_rate;
+ LOG(INFO) << "chunk size (s): " << streaming_chunk;
+ LOG(INFO) << "chunk size (sample): " << chunk_sample_size;
+
for (; !wav_reader.Done(); wav_reader.Next()) {
std::string utt = wav_reader.Key();
const kaldi::WaveData& wave_data = wav_reader.Value();
+ LOG(INFO) << "process utt: " << utt;
int32 this_channel = 0;
kaldi::SubVector waveform(wave_data.Data(),
this_channel);
int tot_samples = waveform.Dim();
+ LOG(INFO) << "wav len (sample): " << tot_samples;
+
int sample_offset = 0;
std::vector<kaldi::Vector<kaldi::BaseFloat>> feats;
int feature_rows = 0;
@@ -209,6 +227,7 @@ int main(int argc, char* argv[]) {
for (int i = 0; i < cur_chunk_size; ++i) {
wav_chunk(i) = waveform(sample_offset + i);
}
+
kaldi::Vector features;
feature_cache.Accept(wav_chunk);
if (cur_chunk_size < chunk_sample_size) {
diff --git a/speechx/examples/feat/run.sh b/speechx/examples/feat/run.sh
index bd21bd7f4e1c4262a5af339c68d92e989f0bd5c2..29c49d3252f11c0681af4c5c1cfcab638550953b 100755
--- a/speechx/examples/feat/run.sh
+++ b/speechx/examples/feat/run.sh
@@ -25,6 +25,7 @@ feat_wspecifier=./feats.ark
cmvn=./cmvn.ark
# 3. run feat
+export GLOG_logtostderr=1
linear_spectrogram_main \
--wav_rspecifier=scp:$model_dir/wav.scp \
--feature_wspecifier=ark,t:$feat_wspecifier \
diff --git a/speechx/examples/glog/CMakeLists.txt b/speechx/examples/glog/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b4b0e6358b4ca7c2ac27056a34d1d0b9ed1b6399
--- /dev/null
+++ b/speechx/examples/glog/CMakeLists.txt
@@ -0,0 +1,8 @@
+cmake_minimum_required(VERSION 3.14 FATAL_ERROR)
+
+add_executable(glog_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_test.cc)
+target_link_libraries(glog_test glog)
+
+
+add_executable(glog_logtostderr_test ${CMAKE_CURRENT_SOURCE_DIR}/glog_logtostderr_test.cc)
+target_link_libraries(glog_logtostderr_test glog)
\ No newline at end of file
diff --git a/speechx/examples/glog/README.md b/speechx/examples/glog/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..996e192e9abaf6733746b6ea06561da6be627f91
--- /dev/null
+++ b/speechx/examples/glog/README.md
@@ -0,0 +1,25 @@
+# [GLOG](https://rpg.ifi.uzh.ch/docs/glog.html)
+
+Unless otherwise specified, glog writes to the filename `/tmp/...log....` (e.g., "/tmp/hello_world.example.com.hamaji.log.INFO.20080709-222411.10474"). By default, glog copies the log messages of severity level ERROR or FATAL to standard error (stderr) in addition to log files.
+
+Several flags influence glog's output behavior. If the Google gflags library is installed on your machine, the configure script (see the INSTALL file in the package for detail of this script) will automatically detect and use it, allowing you to pass flags on the command line. For example, if you want to turn the flag --logtostderr on, you can start your application with the following command line:
+
+ `./your_application --logtostderr=1`
+
+If the Google gflags library isn't installed, you can set flags via environment variables instead, prefixing the flag name with "GLOG_", e.g.
+
+ `GLOG_logtostderr=1 ./your_application`
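+
+Other glog flags can be set the same way; for example, to write the log files into a custom directory:
+
+ `GLOG_log_dir=/some/log/directory ./your_application`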
+
+You can also change flag values in your program by modifying the global variables `FLAGS_*`. Most settings take effect immediately after you update `FLAGS_*`; the exceptions are the flags related to destination files. For example, you might want to set `FLAGS_log_dir` before calling `google::InitGoogleLogging`. Here is an example:
+
+```c++
+ LOG(INFO) << "file";
+ // Most flags work immediately after updating values.
+ FLAGS_logtostderr = 1;
+ LOG(INFO) << "stderr";
+ FLAGS_logtostderr = 0;
+ // This won't change the log destination. If you want to set this
+ // value, you should do this before google::InitGoogleLogging .
+ FLAGS_log_dir = "/some/log/directory";
+ LOG(INFO) << "the same file";
+```
diff --git a/speechx/speechx/frontend/feature_extractor_controller.h b/speechx/examples/glog/glog_logtostderr_test.cc
similarity index 69%
rename from speechx/speechx/frontend/feature_extractor_controller.h
rename to speechx/examples/glog/glog_logtostderr_test.cc
index 0544a1e298b8e7dc871d13f546398a5c28308b0e..b0616a7de3f1cad4aca75d9511c1cc5a6e6d5e9a 100644
--- a/speechx/speechx/frontend/feature_extractor_controller.h
+++ b/speechx/examples/glog/glog_logtostderr_test.cc
@@ -11,3 +11,15 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+
+#include <glog/logging.h>
+
+int main(int argc, char* argv[]) {
+ // Initialize Google’s logging library.
+ google::InitGoogleLogging(argv[0]);
+
+ FLAGS_logtostderr = 1;
+
+ LOG(INFO) << "Found " << 10 << " cookies";
+ LOG(ERROR) << "Found " << 10 << " error";
+}
\ No newline at end of file
diff --git a/speechx/speechx/frontend/feature_extractor_controller_impl.h b/speechx/examples/glog/glog_test.cc
similarity index 71%
rename from speechx/speechx/frontend/feature_extractor_controller_impl.h
rename to speechx/examples/glog/glog_test.cc
index 0544a1e298b8e7dc871d13f546398a5c28308b0e..b6275119e1997a597bb074e741ae4bcb943b38c7 100644
--- a/speechx/speechx/frontend/feature_extractor_controller_impl.h
+++ b/speechx/examples/glog/glog_test.cc
@@ -11,3 +11,13 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
+
+#include <glog/logging.h>
+
+int main(int argc, char* argv[]) {
+ // Initialize Google’s logging library.
+ google::InitGoogleLogging(argv[0]);
+
+ LOG(INFO) << "Found " << 10 << " cookies";
+ LOG(ERROR) << "Found " << 10 << " error";
+}
\ No newline at end of file
diff --git a/speechx/examples/glog/path.sh b/speechx/examples/glog/path.sh
new file mode 100644
index 0000000000000000000000000000000000000000..e2c7b2fcfa4ed6171d68dda49fb9bcdd7e433302
--- /dev/null
+++ b/speechx/examples/glog/path.sh
@@ -0,0 +1,14 @@
+# This contains the locations of the binaries built for running the examples.
+
+SPEECHX_ROOT=$PWD/../..
+SPEECHX_EXAMPLES=$SPEECHX_ROOT/build/examples
+
+SPEECHX_TOOLS=$SPEECHX_ROOT/tools
+TOOLS_BIN=$SPEECHX_TOOLS/valgrind/install/bin
+
+[ -d $SPEECHX_EXAMPLES ] || { echo "Error: 'build/examples' directory not found. Please ensure that the project was built successfully."; }
+
+export LC_AL=C
+
+SPEECHX_BIN=$SPEECHX_EXAMPLES/glog
+export PATH=$PATH:$SPEECHX_BIN:$TOOLS_BIN
diff --git a/speechx/examples/glog/run.sh b/speechx/examples/glog/run.sh
new file mode 100755
index 0000000000000000000000000000000000000000..d3fcdb643902201ff1f354c15cb8d5a26b9f9fb4
--- /dev/null
+++ b/speechx/examples/glog/run.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+set +x
+set -e
+
+. ./path.sh
+
+# 1. compile
+if [ ! -d ${SPEECHX_EXAMPLES} ]; then
+ pushd ${SPEECHX_ROOT}
+ bash build.sh
+ popd
+fi
+
+# 2. run
+glog_test
+
+echo "------"
+export FLAGS_logtostderr=1
+glog_test
+
+echo "------"
+glog_logtostderr_test
diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.cc b/speechx/speechx/decoder/ctc_beam_search_decoder.cc
index 84f1453c0f6098f1c0fd41b235580f80dd2d6deb..5d7a4f77acd69411227bd5985419b749c5c79dd2 100644
--- a/speechx/speechx/decoder/ctc_beam_search_decoder.cc
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder.cc
@@ -38,8 +38,10 @@ CTCBeamSearch::CTCBeamSearch(const CTCBeamSearchOptions& opts)
<< vocabulary_.size();
LOG(INFO) << "language model path: " << opts_.lm_path;
- init_ext_scorer_ = std::make_shared(
- opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_);
+ if (opts_.lm_path != "") {
+ init_ext_scorer_ = std::make_shared(
+ opts_.alpha, opts_.beta, opts_.lm_path, vocabulary_);
+ }
blank_id_ = 0;
auto it = std::find(vocabulary_.begin(), vocabulary_.end(), " ");
diff --git a/speechx/speechx/decoder/ctc_beam_search_decoder.h b/speechx/speechx/decoder/ctc_beam_search_decoder.h
index cf1824c6f7863db3821455bb0da2e398c624d595..9d0a5d1422c09630b6150b4c1f9bf4341fb64ef2 100644
--- a/speechx/speechx/decoder/ctc_beam_search_decoder.h
+++ b/speechx/speechx/decoder/ctc_beam_search_decoder.h
@@ -33,13 +33,13 @@ struct CTCBeamSearchOptions {
int num_proc_bsearch;
CTCBeamSearchOptions()
: dict_file("vocab.txt"),
- lm_path("lm.klm"),
+ lm_path(""),
alpha(1.9f),
beta(5.0),
beam_size(300),
cutoff_prob(0.99f),
cutoff_top_n(40),
- num_proc_bsearch(0) {}
+ num_proc_bsearch(10) {}
void Register(kaldi::OptionsItf* opts) {
opts->Register("dict", &dict_file, "dict file ");
diff --git a/speechx/speechx/frontend/CMakeLists.txt b/speechx/speechx/frontend/CMakeLists.txt
index 44ca52cdc088073a2a88ada64ddf5c9c5f80c1af..7d10fdec9e1d5529c1b5d9a83bc2d8e798abc9ee 100644
--- a/speechx/speechx/frontend/CMakeLists.txt
+++ b/speechx/speechx/frontend/CMakeLists.txt
@@ -1,10 +1,2 @@
-project(frontend)
-add_library(frontend STATIC
- normalizer.cc
- linear_spectrogram.cc
- raw_audio.cc
- feature_cache.cc
-)
-
-target_link_libraries(frontend PUBLIC kaldi-matrix)
+add_subdirectory(audio)
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/CMakeLists.txt b/speechx/speechx/frontend/audio/CMakeLists.txt
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..35243b6e3ce2a39333c99f77f709dea2d06e47e5 100644
--- a/speechx/speechx/frontend/audio/CMakeLists.txt
+++ b/speechx/speechx/frontend/audio/CMakeLists.txt
@@ -0,0 +1,11 @@
+project(frontend)
+
+add_library(frontend STATIC
+ cmvn.cc
+ db_norm.cc
+ linear_spectrogram.cc
+ audio_cache.cc
+ feature_cache.cc
+)
+
+target_link_libraries(frontend PUBLIC kaldi-matrix)
\ No newline at end of file
diff --git a/speechx/speechx/frontend/raw_audio.cc b/speechx/speechx/frontend/audio/audio_cache.cc
similarity index 64%
rename from speechx/speechx/frontend/raw_audio.cc
rename to speechx/speechx/frontend/audio/audio_cache.cc
index 21f643628251e642b545d680e00e0fca01b460d2..c3233e595d874adc4b21000e94881f670892b525 100644
--- a/speechx/speechx/frontend/raw_audio.cc
+++ b/speechx/speechx/frontend/audio/audio_cache.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "frontend/raw_audio.h"
+#include "frontend/audio/audio_cache.h"
#include "kaldi/base/timer.h"
namespace ppspeech {
@@ -21,38 +21,43 @@ using kaldi::BaseFloat;
using kaldi::VectorBase;
using kaldi::Vector;
-RawAudioCache::RawAudioCache(int buffer_size)
- : finished_(false), data_length_(0), start_(0), timeout_(1) {
- ring_buffer_.resize(buffer_size);
+AudioCache::AudioCache(int buffer_size)
+ : finished_(false),
+ capacity_(buffer_size),
+ size_(0),
+ offset_(0),
+ timeout_(1) {
+ ring_buffer_.resize(capacity_);
}
-void RawAudioCache::Accept(const VectorBase& waves) {
+void AudioCache::Accept(const VectorBase<BaseFloat>& waves) {
std::unique_lock lock(mutex_);
- while (data_length_ + waves.Dim() > ring_buffer_.size()) {
+ while (size_ + waves.Dim() > ring_buffer_.size()) {
ready_feed_condition_.wait(lock);
}
for (size_t idx = 0; idx < waves.Dim(); ++idx) {
- int32 buffer_idx = (idx + start_) % ring_buffer_.size();
+ int32 buffer_idx = (idx + offset_) % ring_buffer_.size();
ring_buffer_[buffer_idx] = waves(idx);
}
- data_length_ += waves.Dim();
+ size_ += waves.Dim();
}
-bool RawAudioCache::Read(Vector* waves) {
+bool AudioCache::Read(Vector<BaseFloat>* waves) {
size_t chunk_size = waves->Dim();
kaldi::Timer timer;
std::unique_lock lock(mutex_);
- while (chunk_size > data_length_) {
+ while (chunk_size > size_) {
// when audio is empty and no more data feed
- // ready_read_condition will block in dead lock. so replace with
- // timeout_
+ // ready_read_condition will block in dead lock,
+ // so replace with timeout_
// ready_read_condition_.wait(lock);
int32 elapsed = static_cast(timer.Elapsed() * 1000);
if (elapsed > timeout_) {
- if (finished_ == true) { // read last chunk data
+ if (finished_ == true) {
+ // read last chunk data
break;
}
- if (chunk_size > data_length_) {
+ if (chunk_size > size_) {
return false;
}
}
@@ -60,17 +65,17 @@ bool RawAudioCache::Read(Vector* waves) {
}
// read last chunk data
- if (chunk_size > data_length_) {
- chunk_size = data_length_;
+ if (chunk_size > size_) {
+ chunk_size = size_;
waves->Resize(chunk_size);
}
for (size_t idx = 0; idx < chunk_size; ++idx) {
- int buff_idx = (start_ + idx) % ring_buffer_.size();
+ int buff_idx = (offset_ + idx) % ring_buffer_.size();
waves->Data()[idx] = ring_buffer_[buff_idx];
}
- data_length_ -= chunk_size;
- start_ = (start_ + chunk_size) % ring_buffer_.size();
+ size_ -= chunk_size;
+ offset_ = (offset_ + chunk_size) % ring_buffer_.size();
ready_feed_condition_.notify_one();
return true;
}
diff --git a/speechx/speechx/frontend/audio/audio_cache.h b/speechx/speechx/frontend/audio/audio_cache.h
new file mode 100644
index 0000000000000000000000000000000000000000..17e1a83895b9f24eb3a6a24d5b73bd72bb53b11b
--- /dev/null
+++ b/speechx/speechx/frontend/audio/audio_cache.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#pragma once
+
+#include "base/common.h"
+#include "frontend/audio/frontend_itf.h"
+
+namespace ppspeech {
+
+// waves cache
+class AudioCache : public FrontendInterface {
+ public:
+ explicit AudioCache(int buffer_size = kint16max);
+
+ virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& waves);
+
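+ // Blocks until `waves->Dim()` samples are buffered; once the stream is
+ // finished it returns the remaining samples, and it returns false if the
+ // internal timeout expires before enough data arrives.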
+ virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* waves);
+
+ // the audio dim is 1, one sample
+ virtual size_t Dim() const { return 1; }
+
+ virtual void SetFinished() {
+ std::lock_guard<std::mutex> lock(mutex_);
+ finished_ = true;
+ }
+
+ virtual bool IsFinished() const { return finished_; }
+
+ virtual void Reset() {
+ offset_ = 0;
+ size_ = 0;
+ finished_ = false;
+ }
+
+ private:
+ std::vector<kaldi::BaseFloat> ring_buffer_;
+ size_t offset_; // offset in ring_buffer_
+ size_t size_; // samples in ring_buffer_ now
+ size_t capacity_; // capacity of ring_buffer_
+ bool finished_; // reach audio end
+ mutable std::mutex mutex_;
+ std::condition_variable ready_feed_condition_;
+ kaldi::int32 timeout_; // millisecond
+
+ DISALLOW_COPY_AND_ASSIGN(AudioCache);
+};
+
+} // namespace ppspeech
diff --git a/speechx/speechx/frontend/normalizer.cc b/speechx/speechx/frontend/audio/cmvn.cc
similarity index 67%
rename from speechx/speechx/frontend/normalizer.cc
rename to speechx/speechx/frontend/audio/cmvn.cc
index 524125619a74f8e12b7d9932b7529a4ab3bfde0f..c7e446c92655cad25ff8ad39eb4920deb742af65 100644
--- a/speechx/speechx/frontend/normalizer.cc
+++ b/speechx/speechx/frontend/audio/cmvn.cc
@@ -13,7 +13,7 @@
// limitations under the License.
-#include "frontend/normalizer.h"
+#include "frontend/audio/cmvn.h"
#include "kaldi/feat/cmvn.h"
#include "kaldi/util/kaldi-io.h"
@@ -26,73 +26,8 @@ using std::vector;
using kaldi::SubVector;
using std::unique_ptr;
-DecibelNormalizer::DecibelNormalizer(
- const DecibelNormalizerOptions& opts,
- std::unique_ptr base_extractor) {
- base_extractor_ = std::move(base_extractor);
- opts_ = opts;
- dim_ = 1;
-}
-
-void DecibelNormalizer::Accept(const kaldi::VectorBase& waves) {
- base_extractor_->Accept(waves);
-}
-
-bool DecibelNormalizer::Read(kaldi::Vector* waves) {
- if (base_extractor_->Read(waves) == false || waves->Dim() == 0) {
- return false;
- }
- Compute(waves);
- return true;
-}
-
-bool DecibelNormalizer::Compute(VectorBase* waves) const {
- // calculate db rms
- BaseFloat rms_db = 0.0;
- BaseFloat mean_square = 0.0;
- BaseFloat gain = 0.0;
- BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
-
- vector samples;
- samples.resize(waves->Dim());
- for (size_t i = 0; i < samples.size(); ++i) {
- samples[i] = (*waves)(i);
- }
-
- // square
- for (auto& d : samples) {
- if (opts_.convert_int_float) {
- d = d * wave_float_normlization;
- }
- mean_square += d * d;
- }
-
- // mean
- mean_square /= samples.size();
- rms_db = 10 * std::log10(mean_square);
- gain = opts_.target_db - rms_db;
-
- if (gain > opts_.max_gain_db) {
- LOG(ERROR)
- << "Unable to normalize segment to " << opts_.target_db << "dB,"
- << "because the the probable gain have exceeds opts_.max_gain_db"
- << opts_.max_gain_db << "dB.";
- return false;
- }
-
- // Note that this is an in-place transformation.
- for (auto& item : samples) {
- // python item *= 10.0 ** (gain / 20.0)
- item *= std::pow(10.0, gain / 20.0);
- }
-
- std::memcpy(
- waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size());
- return true;
-}
-CMVN::CMVN(std::string cmvn_file,
- unique_ptr base_extractor)
+CMVN::CMVN(std::string cmvn_file, unique_ptr<FrontendInterface> base_extractor)
: var_norm_(true) {
base_extractor_ = std::move(base_extractor);
bool binary;
diff --git a/speechx/speechx/frontend/audio/cmvn.h b/speechx/speechx/frontend/audio/cmvn.h
new file mode 100644
index 0000000000000000000000000000000000000000..50ef5649be6602fefef3373018bc77cec45c86d8
--- /dev/null
+++ b/speechx/speechx/frontend/audio/cmvn.h
@@ -0,0 +1,48 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "base/common.h"
+#include "frontend/audio/frontend_itf.h"
+#include "kaldi/matrix/kaldi-matrix.h"
+#include "kaldi/util/options-itf.h"
+
+namespace ppspeech {
+
+class CMVN : public FrontendInterface {
+ public:
+ explicit CMVN(std::string cmvn_file,
+ std::unique_ptr<FrontendInterface> base_extractor);
+ virtual void Accept(const kaldi::VectorBase<kaldi::BaseFloat>& inputs);
+
+ // the length of feats = feature_row * feature_dim,
+ // the Matrix is squashed into Vector
+ virtual bool Read(kaldi::Vector<kaldi::BaseFloat>* feats);
+ // the dim_ is the feautre dim.
+ virtual size_t Dim() const { return dim_; }
+ virtual void SetFinished() { base_extractor_->SetFinished(); }
+ virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+ virtual void Reset() { base_extractor_->Reset(); }
+
+ private:
+ void Compute(kaldi::VectorBase<kaldi::BaseFloat>* feats) const;
+ void ApplyCMVN(kaldi::MatrixBase<kaldi::BaseFloat>* feats);
+ kaldi::Matrix<double> stats_;
+ std::unique_ptr<FrontendInterface> base_extractor_;
+ size_t dim_;
+ bool var_norm_;
+};
+
+} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/raw_audio.h b/speechx/speechx/frontend/audio/data_cache.h
similarity index 52%
rename from speechx/speechx/frontend/raw_audio.h
rename to speechx/speechx/frontend/audio/data_cache.h
index ce75c137cf332eaaeb23c53781343bc7c6eaf1b6..a812278ce2e1aa8fb66c57885e36f324e25fe078 100644
--- a/speechx/speechx/frontend/raw_audio.h
+++ b/speechx/speechx/frontend/audio/data_cache.h
@@ -15,51 +15,22 @@
#pragma once
+
#include "base/common.h"
-#include "frontend/feature_extractor_interface.h"
+#include "frontend/audio/frontend_itf.h"
-#pragma once
namespace ppspeech {
-
-class RawAudioCache : public FeatureExtractorInterface {
+// A data source for testing different frontend module.
+// It accepts waves or feats.
+class DataCache : public FrontendInterface {
public:
- explicit RawAudioCache(int buffer_size = kint16max);
- virtual void Accept(const kaldi::VectorBase& waves);
- virtual bool Read(kaldi::Vector* waves);
- // the audio dim is 1
- virtual size_t Dim() const { return 1; }
- virtual void SetFinished() {
- std::lock_guard lock(mutex_);
- finished_ = true;
- }
- virtual bool IsFinished() const { return finished_; }
- virtual void Reset() {
- start_ = 0;
- data_length_ = 0;
- finished_ = false;
- }
-
- private:
- std::vector ring_buffer_;
- size_t start_;
- size_t data_length_;
- bool finished_;
- mutable std::mutex mutex_;
- std::condition_variable ready_feed_condition_;
- kaldi::int32 timeout_;
+ explicit DataCache() { finished_ = false; }
- DISALLOW_COPY_AND_ASSIGN(RawAudioCache);
-};
-
-// it is a datasource for testing different frontend module.
-// it accepts waves or feats.
-class RawDataCache : public FeatureExtractorInterface {
- public:
- explicit RawDataCache() { finished_ = false; }
virtual void Accept(const kaldi::VectorBase& inputs) {
data_ = inputs;
}
+
virtual bool Read(kaldi::Vector* feats) {
if (data_.Dim() == 0) {
return false;
@@ -68,9 +39,10 @@ class RawDataCache : public FeatureExtractorInterface {
data_.Resize(0);
return true;
}
- virtual size_t Dim() const { return dim_; }
+
virtual void SetFinished() { finished_ = true; }
virtual bool IsFinished() const { return finished_; }
+ virtual size_t Dim() const { return dim_; }
void SetDim(int32 dim) { dim_ = dim; }
virtual void Reset() { finished_ = true; }
@@ -79,7 +51,6 @@ class RawDataCache : public FeatureExtractorInterface {
bool finished_;
int32 dim_;
- DISALLOW_COPY_AND_ASSIGN(RawDataCache);
+ DISALLOW_COPY_AND_ASSIGN(DataCache);
};
-
-} // namespace ppspeech
+} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/audio/db_norm.cc b/speechx/speechx/frontend/audio/db_norm.cc
new file mode 100644
index 0000000000000000000000000000000000000000..931e932d6d6aa725655ddc2cccf933529425c6f8
--- /dev/null
+++ b/speechx/speechx/frontend/audio/db_norm.cc
@@ -0,0 +1,95 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+
+#include "frontend/audio/db_norm.h"
+#include "kaldi/feat/cmvn.h"
+#include "kaldi/util/kaldi-io.h"
+
+namespace ppspeech {
+
+using kaldi::Vector;
+using kaldi::VectorBase;
+using kaldi::BaseFloat;
+using std::vector;
+using kaldi::SubVector;
+using std::unique_ptr;
+
+DecibelNormalizer::DecibelNormalizer(
+ const DecibelNormalizerOptions& opts,
+ std::unique_ptr<FrontendInterface> base_extractor) {
+ base_extractor_ = std::move(base_extractor);
+ opts_ = opts;
+ dim_ = 1;
+}
+
+void DecibelNormalizer::Accept(const kaldi::VectorBase<BaseFloat>& waves) {
+ base_extractor_->Accept(waves);
+}
+
+bool DecibelNormalizer::Read(kaldi::Vector<BaseFloat>* waves) {
+ if (base_extractor_->Read(waves) == false || waves->Dim() == 0) {
+ return false;
+ }
+ Compute(waves);
+ return true;
+}
+
+bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
+ // calculate db rms
+ BaseFloat rms_db = 0.0;
+ BaseFloat mean_square = 0.0;
+ BaseFloat gain = 0.0;
+ BaseFloat wave_float_normlization = 1.0f / (std::pow(2, 16 - 1));
+
+ vector<BaseFloat> samples;
+ samples.resize(waves->Dim());
+ for (size_t i = 0; i < samples.size(); ++i) {
+ samples[i] = (*waves)(i);
+ }
+
+ // square
+ for (auto& d : samples) {
+ if (opts_.convert_int_float) {
+ d = d * wave_float_normlization;
+ }
+ mean_square += d * d;
+ }
+
+ // mean
+ mean_square /= samples.size();
+ rms_db = 10 * std::log10(mean_square);
+ gain = opts_.target_db - rms_db;
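+ // rms_db is the RMS level in dB (10*log10 of the mean square); gain is
+ // the dB change needed to reach target_db, applied below as a linear
+ // factor of 10^(gain/20)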
+
+ if (gain > opts_.max_gain_db) {
+ LOG(ERROR)
+ << "Unable to normalize segment to " << opts_.target_db << "dB, "
+ << "because the required gain exceeds opts_.max_gain_db ("
+ << opts_.max_gain_db << " dB).";
+ return false;
+ }
+
+ // Note that this is an in-place transformation.
+ for (auto& item : samples) {
+ // python item *= 10.0 ** (gain / 20.0)
+ item *= std::pow(10.0, gain / 20.0);
+ }
+
+ std::memcpy(
+ waves->Data(), samples.data(), sizeof(BaseFloat) * samples.size());
+ return true;
+}
+
+
+} // namespace ppspeech
diff --git a/speechx/speechx/frontend/normalizer.h b/speechx/speechx/frontend/audio/db_norm.h
similarity index 62%
rename from speechx/speechx/frontend/normalizer.h
rename to speechx/speechx/frontend/audio/db_norm.h
index 352d1e1677e7e9566f3c3ea08d65235ae897a73a..425971437d1d2b154d645727720c460140d5e117 100644
--- a/speechx/speechx/frontend/normalizer.h
+++ b/speechx/speechx/frontend/audio/db_norm.h
@@ -16,7 +16,7 @@
#pragma once
#include "base/common.h"
-#include "frontend/feature_extractor_interface.h"
+#include "frontend/audio/frontend_itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
@@ -40,11 +40,11 @@ struct DecibelNormalizerOptions {
}
};
-class DecibelNormalizer : public FeatureExtractorInterface {
+class DecibelNormalizer : public FrontendInterface {
public:
explicit DecibelNormalizer(
const DecibelNormalizerOptions& opts,
- std::unique_ptr base_extractor);
+ std::unique_ptr<FrontendInterface> base_extractor);
virtual void Accept(const kaldi::VectorBase& waves);
virtual bool Read(kaldi::Vector* waves);
// noramlize audio, the dim is 1.
@@ -57,33 +57,9 @@ class DecibelNormalizer : public FeatureExtractorInterface {
bool Compute(kaldi::VectorBase* waves) const;
DecibelNormalizerOptions opts_;
size_t dim_;
- std::unique_ptr base_extractor_;
+ std::unique_ptr<FrontendInterface> base_extractor_;
kaldi::Vector waveform_;
};
-class CMVN : public FeatureExtractorInterface {
- public:
- explicit CMVN(std::string cmvn_file,
- std::unique_ptr base_extractor);
- virtual void Accept(const kaldi::VectorBase& inputs);
-
- // the length of feats = feature_row * feature_dim,
- // the Matrix is squashed into Vector
- virtual bool Read(kaldi::Vector* feats);
- // the dim_ is the feautre dim.
- virtual size_t Dim() const { return dim_; }
- virtual void SetFinished() { base_extractor_->SetFinished(); }
- virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
- virtual void Reset() { base_extractor_->Reset(); }
-
- private:
- void Compute(kaldi::VectorBase* feats) const;
- void ApplyCMVN(kaldi::MatrixBase* feats);
- kaldi::Matrix stats_;
- std::unique_ptr base_extractor_;
- size_t dim_;
- bool var_norm_;
-};
-
} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/fbank.h b/speechx/speechx/frontend/audio/fbank.h
similarity index 88%
rename from speechx/speechx/frontend/fbank.h
rename to speechx/speechx/frontend/audio/fbank.h
index 7d9cf4221254acb1f0eec4430b4f778cd8e1f255..68267b3d0efee480c89b0816489c6014baceb13f 100644
--- a/speechx/speechx/frontend/fbank.h
+++ b/speechx/speechx/frontend/audio/fbank.h
@@ -20,10 +20,10 @@
namespace ppspeech {
-class FbankExtractor : FeatureExtractorInterface {
+class FbankExtractor : FrontendInterface {
public:
explicit FbankExtractor(const FbankOptions& opts,
- share_ptr pre_extractor);
+ share_ptr<FrontendInterface> pre_extractor);
virtual void AcceptWaveform(
const kaldi::Vector& input) = 0;
virtual void Read(kaldi::Vector* feat) = 0;
diff --git a/speechx/speechx/frontend/feature_cache.cc b/speechx/speechx/frontend/audio/feature_cache.cc
similarity index 91%
rename from speechx/speechx/frontend/feature_cache.cc
rename to speechx/speechx/frontend/audio/feature_cache.cc
index d23b3a8b29d5ab66ca06b1517ce0f34d7efaeb36..3f7f6502b4074f55e10324d791b69763f1db0a72 100644
--- a/speechx/speechx/frontend/feature_cache.cc
+++ b/speechx/speechx/frontend/audio/feature_cache.cc
@@ -12,7 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-#include "frontend/feature_cache.h"
+#include "frontend/audio/feature_cache.h"
namespace ppspeech {
@@ -23,8 +23,8 @@ using std::vector;
using kaldi::SubVector;
using std::unique_ptr;
-FeatureCache::FeatureCache(
- int max_size, unique_ptr base_extractor) {
+FeatureCache::FeatureCache(int max_size,
+ unique_ptr<FrontendInterface> base_extractor) {
max_size_ = max_size;
base_extractor_ = std::move(base_extractor);
}
@@ -41,6 +41,7 @@ void FeatureCache::Accept(const kaldi::VectorBase& inputs) {
// pop feature chunk
bool FeatureCache::Read(kaldi::Vector* feats) {
kaldi::Timer timer;
+
std::unique_lock lock(mutex_);
while (cache_.empty() && base_extractor_->IsFinished() == false) {
ready_read_condition_.wait(lock);
@@ -64,10 +65,13 @@ bool FeatureCache::Compute() {
// compute and feed
Vector feature_chunk;
bool result = base_extractor_->Read(&feature_chunk);
+
std::unique_lock lock(mutex_);
while (cache_.size() >= max_size_) {
ready_feed_condition_.wait(lock);
}
+
+ // feed cache
if (feature_chunk.Dim() != 0) {
cache_.push(feature_chunk);
}
diff --git a/speechx/speechx/frontend/feature_cache.h b/speechx/speechx/frontend/audio/feature_cache.h
similarity index 82%
rename from speechx/speechx/frontend/feature_cache.h
rename to speechx/speechx/frontend/audio/feature_cache.h
index e52d8b2981aa546ba9e4aad6998310c931e86cfe..99961b5e28252691ecb81c60a5e8448a541e52f0 100644
--- a/speechx/speechx/frontend/feature_cache.h
+++ b/speechx/speechx/frontend/audio/feature_cache.h
@@ -15,26 +15,33 @@
#pragma once
#include "base/common.h"
-#include "frontend/feature_extractor_interface.h"
+#include "frontend/audio/frontend_itf.h"
namespace ppspeech {
-class FeatureCache : public FeatureExtractorInterface {
+class FeatureCache : public FrontendInterface {
public:
explicit FeatureCache(
int32 max_size = kint16max,
- std::unique_ptr base_extractor = NULL);
+ std::unique_ptr<FrontendInterface> base_extractor = NULL);
+
+ // Feed feats or waves
virtual void Accept(const kaldi::VectorBase& inputs);
- // feats dim = num_frames * feature_dim
+
+ // feats size = num_frames * feat_dim
virtual bool Read(kaldi::Vector* feats);
- // feature cache only cache feature which from base extractor
+
+ // feat dim
virtual size_t Dim() const { return base_extractor_->Dim(); }
+
virtual void SetFinished() {
base_extractor_->SetFinished();
// read the last chunk data
Compute();
}
+
virtual bool IsFinished() const { return base_extractor_->IsFinished(); }
+
virtual void Reset() {
base_extractor_->Reset();
while (!cache_.empty()) {
@@ -45,12 +52,14 @@ class FeatureCache : public FeatureExtractorInterface {
private:
bool Compute();
- std::mutex mutex_;
size_t max_size_;
+ std::unique_ptr<FrontendInterface> base_extractor_;
+
+ std::mutex mutex_;
std::queue<kaldi::Vector<kaldi::BaseFloat>> cache_;
- std::unique_ptr base_extractor_;
std::condition_variable ready_feed_condition_;
std::condition_variable ready_read_condition_;
+
// DISALLOW_COPY_AND_ASSGIN(FeatureCache);
};
diff --git a/speechx/speechx/frontend/feature_extractor_interface.h b/speechx/speechx/frontend/audio/frontend_itf.h
similarity index 66%
rename from speechx/speechx/frontend/feature_extractor_interface.h
rename to speechx/speechx/frontend/audio/frontend_itf.h
index 3668fbda76953eaaf544f0fedbbae4cd017369ea..7913cc7c086564bd01981f46eac5701a1a799ff1 100644
--- a/speechx/speechx/frontend/feature_extractor_interface.h
+++ b/speechx/speechx/frontend/audio/frontend_itf.h
@@ -19,19 +19,28 @@
namespace ppspeech {
-class FeatureExtractorInterface {
+class FrontendInterface {
public:
- // accept input data, accept feature or raw waves which decided
- // by the base_extractor
+ // Feed inputs: features(2D saved in 1D) or waveforms(1D).
virtual void Accept(const kaldi::VectorBase& inputs) = 0;
- // get the processed result
- // the length of output = feature_row * feature_dim,
- // the Matrix is squashed into Vector
+
+ // Fetch processed data: features or waveforms.
+ // For features(2D saved in 1D), the Matrix is squashed into Vector,
+ // the length of output = feature_row * feature_dim.
+ // For waveforms(1D), samples saved in vector.
virtual bool Read(kaldi::Vector* outputs) = 0;
- // the Dim is the feature dim
+
+ // Dim is the feature dim. For waveforms (1D) it is zero; otherwise it is
+ // the per-frame feature dim, e.g. 80 for fbank.
virtual size_t Dim() const = 0;
+
+ // End Flag for Streaming Data.
virtual void SetFinished() = 0;
+
+ // Whether the streaming data has ended.
virtual bool IsFinished() const = 0;
+
+ // Reset to start state.
virtual void Reset() = 0;
};
diff --git a/speechx/speechx/frontend/audio/linear_spectrogram.cc b/speechx/speechx/frontend/audio/linear_spectrogram.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d6ae3d012c63bf62a5fdaf3500bb7bfe182c4635
--- /dev/null
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.cc
@@ -0,0 +1,109 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "frontend/audio/linear_spectrogram.h"
+#include "kaldi/base/kaldi-math.h"
+#include "kaldi/feat/feature-common.h"
+#include "kaldi/feat/feature-functions.h"
+#include "kaldi/matrix/matrix-functions.h"
+
+namespace ppspeech {
+
+using kaldi::int32;
+using kaldi::BaseFloat;
+using kaldi::Vector;
+using kaldi::SubVector;
+using kaldi::VectorBase;
+using kaldi::Matrix;
+using std::vector;
+
+LinearSpectrogram::LinearSpectrogram(
+ const LinearSpectrogramOptions& opts,
+ std::unique_ptr<FrontendInterface> base_extractor)
+ : opts_(opts), feature_window_funtion_(opts.frame_opts) {
+ base_extractor_ = std::move(base_extractor);
+ int32 window_size = opts.frame_opts.WindowSize();
+ int32 window_shift = opts.frame_opts.WindowShift();
+ dim_ = window_size / 2 + 1;
+ chunk_sample_size_ =
+ static_cast<int32>(opts.streaming_chunk * opts.frame_opts.samp_freq);
+ hanning_window_energy_ = kaldi::VecVec(feature_window_funtion_.window,
+ feature_window_funtion_.window);
+}
+
+void LinearSpectrogram::Accept(const VectorBase<BaseFloat>& inputs) {
+ base_extractor_->Accept(inputs);
+}
+
+bool LinearSpectrogram::Read(Vector<BaseFloat>* feats) {
+ Vector<BaseFloat> input_feats(chunk_sample_size_);
+ bool flag = base_extractor_->Read(&input_feats);
+ if (flag == false || input_feats.Dim() == 0) return false;
+
+ int32 feat_len = input_feats.Dim();
+ int32 left_len = reminded_wav_.Dim();
+ Vector<BaseFloat> waves(feat_len + left_len);
+ waves.Range(0, left_len).CopyFromVec(reminded_wav_);
+ waves.Range(left_len, feat_len).CopyFromVec(input_feats);
+ Compute(waves, feats);
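+ // keep the tail samples that are not consumed by a whole frame shift;
+ // they are prepended to the next chunk (streaming feature extraction)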
+ int32 frame_shift = opts_.frame_opts.WindowShift();
+ int32 num_frames = kaldi::NumFrames(waves.Dim(), opts_.frame_opts);
+ int32 left_samples = waves.Dim() - frame_shift * num_frames;
+ reminded_wav_.Resize(left_samples);
+ reminded_wav_.CopyFromVec(
+ waves.Range(frame_shift * num_frames, left_samples));
+ return true;
+}
+
+// Compute spectrogram feat
+bool LinearSpectrogram::Compute(const Vector<BaseFloat>& waves,
+ Vector<BaseFloat>* feats) {
+ int32 num_samples = waves.Dim();
+ int32 frame_length = opts_.frame_opts.WindowSize();
+ int32 sample_rate = opts_.frame_opts.samp_freq;
+ BaseFloat scale = 2.0 / (hanning_window_energy_ * sample_rate);
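+ // spectral density scaling: 2 / (sum(window^2) * fs); the DC and Nyquist
+ // bins are halved below since they are not doubled in a one-sided spectrum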
+
+ if (num_samples < frame_length) {
+ return true;
+ }
+
+ int32 num_frames = kaldi::NumFrames(num_samples, opts_.frame_opts);
+ feats->Resize(num_frames * dim_);
+ Vector<BaseFloat> window;
+
+ for (int frame_idx = 0; frame_idx < num_frames; ++frame_idx) {
+ kaldi::ExtractWindow(0,
+ waves,
+ frame_idx,
+ opts_.frame_opts,
+ feature_window_funtion_,
+ &window,
+ NULL);
+
+ SubVector<BaseFloat> output_row(feats->Data() + frame_idx * dim_, dim_);
+ window.Resize(frame_length, kaldi::kCopyData);
+ RealFft(&window, true);
+ kaldi::ComputePowerSpectrum(&window);
+ SubVector<BaseFloat> power_spectrum(window, 0, dim_);
+ power_spectrum.Scale(scale);
+ power_spectrum(0) = power_spectrum(0) / 2;
+ power_spectrum(dim_ - 1) = power_spectrum(dim_ - 1) / 2;
+ power_spectrum.Add(1e-14);
+ power_spectrum.ApplyLog();
+ output_row.CopyFromVec(power_spectrum);
+ }
+ return true;
+}
+
+} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/frontend/linear_spectrogram.h b/speechx/speechx/frontend/audio/linear_spectrogram.h
similarity index 66%
rename from speechx/speechx/frontend/linear_spectrogram.h
rename to speechx/speechx/frontend/audio/linear_spectrogram.h
index ffdfbbe9281e6ad62616ca5e52044377eaa24262..896c494dd5d67ad8fb66cbae549c08e9f4f5747c 100644
--- a/speechx/speechx/frontend/linear_spectrogram.h
+++ b/speechx/speechx/frontend/audio/linear_spectrogram.h
@@ -16,28 +16,30 @@
#pragma once
#include "base/common.h"
-#include "frontend/feature_extractor_interface.h"
+#include "frontend/audio/frontend_itf.h"
#include "kaldi/feat/feature-window.h"
namespace ppspeech {
struct LinearSpectrogramOptions {
kaldi::FrameExtractionOptions frame_opts;
- kaldi::BaseFloat streaming_chunk;
+ kaldi::BaseFloat streaming_chunk; // second
+
LinearSpectrogramOptions() : streaming_chunk(0.36), frame_opts() {}
void Register(kaldi::OptionsItf* opts) {
- opts->Register(
- "streaming-chunk", &streaming_chunk, "streaming chunk size");
+ opts->Register("streaming-chunk",
+ &streaming_chunk,
+ "streaming chunk size, default: 0.36 sec");
frame_opts.Register(opts);
}
};
-class LinearSpectrogram : public FeatureExtractorInterface {
+class LinearSpectrogram : public FrontendInterface {
public:
explicit LinearSpectrogram(
const LinearSpectrogramOptions& opts,
- std::unique_ptr base_extractor);
+ std::unique_ptr<FrontendInterface> base_extractor);
virtual void Accept(const kaldi::VectorBase& inputs);
virtual bool Read(kaldi::Vector* feats);
// the dim_ is the dim of single frame feature
@@ -47,19 +49,15 @@ class LinearSpectrogram : public FeatureExtractorInterface {
virtual void Reset() { base_extractor_->Reset(); }
private:
- void Hanning(std::vector* data) const;
- bool Compute(const std::vector& waves,
- std::vector>& feats);
- bool NumpyFft(std::vector* v,
- std::vector* real,
- std::vector* img) const;
+ bool Compute(const kaldi::Vector<kaldi::BaseFloat>& waves,
+ kaldi::Vector<kaldi::BaseFloat>* feats);
- kaldi::int32 fft_points_;
size_t dim_;
- std::vector hanning_window_;
+ kaldi::FeatureWindowFunction feature_window_funtion_;
kaldi::BaseFloat hanning_window_energy_;
LinearSpectrogramOptions opts_;
- std::unique_ptr base_extractor_;
+ std::unique_ptr<FrontendInterface> base_extractor_;
+ kaldi::Vector<kaldi::BaseFloat> reminded_wav_;
int chunk_sample_size_;
DISALLOW_COPY_AND_ASSIGN(LinearSpectrogram);
};
diff --git a/speechx/speechx/frontend/mfcc.h b/speechx/speechx/frontend/audio/mfcc.h
similarity index 100%
rename from speechx/speechx/frontend/mfcc.h
rename to speechx/speechx/frontend/audio/mfcc.h
diff --git a/speechx/speechx/frontend/window.h b/speechx/speechx/frontend/audio/normalizer.h
similarity index 88%
rename from speechx/speechx/frontend/window.h
rename to speechx/speechx/frontend/audio/normalizer.h
index 70d6307ec0c5d0bbe3a0e8847247ce7f26bdc5db..dcf721dd2b286edfc2379eb9b3819dd751c13263 100644
--- a/speechx/speechx/frontend/window.h
+++ b/speechx/speechx/frontend/audio/normalizer.h
@@ -12,4 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
-// extract the window of kaldi feat.
+#pragma once
+
+#include "frontend/audio/cmvn.h"
+#include "frontend/audio/db_norm.h"
\ No newline at end of file
diff --git a/speechx/speechx/frontend/linear_spectrogram.cc b/speechx/speechx/frontend/linear_spectrogram.cc
deleted file mode 100644
index 41bc8743a939e4f45b5607d71435e246fc480ca7..0000000000000000000000000000000000000000
--- a/speechx/speechx/frontend/linear_spectrogram.cc
+++ /dev/null
@@ -1,156 +0,0 @@
-// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-// http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#include "frontend/linear_spectrogram.h"
-#include "kaldi/base/kaldi-math.h"
-#include "kaldi/matrix/matrix-functions.h"
-
-namespace ppspeech {
-
-using kaldi::int32;
-using kaldi::BaseFloat;
-using kaldi::Vector;
-using kaldi::VectorBase;
-using kaldi::Matrix;
-using std::vector;
-
-LinearSpectrogram::LinearSpectrogram(
- const LinearSpectrogramOptions& opts,
- std::unique_ptr base_extractor) {
- opts_ = opts;
- base_extractor_ = std::move(base_extractor);
- int32 window_size = opts.frame_opts.WindowSize();
- int32 window_shift = opts.frame_opts.WindowShift();
- fft_points_ = window_size;
- chunk_sample_size_ =
- static_cast(opts.streaming_chunk * opts.frame_opts.samp_freq);
- hanning_window_.resize(window_size);
-
- double a = M_2PI / (window_size - 1);
- hanning_window_energy_ = 0;
- for (int i = 0; i < window_size; ++i) {
- hanning_window_[i] = 0.5 - 0.5 * cos(a * i);
- hanning_window_energy_ += hanning_window_[i] * hanning_window_[i];
- }
-
- dim_ = fft_points_ / 2 + 1; // the dimension is Fs/2 Hz
-}
-
-void LinearSpectrogram::Accept(const VectorBase& inputs) {
- base_extractor_->Accept(inputs);
-}
-
-bool LinearSpectrogram::Read(Vector* feats) {
- Vector input_feats(chunk_sample_size_);
- bool flag = base_extractor_->Read(&input_feats);
- if (flag == false || input_feats.Dim() == 0) return false;
-
- vector input_feats_vec(input_feats.Dim());
- std::memcpy(input_feats_vec.data(),
- input_feats.Data(),
- input_feats.Dim() * sizeof(BaseFloat));
- vector> result;
- Compute(input_feats_vec, result);
- int32 feat_size = 0;
- if (result.size() != 0) {
- feat_size = result.size() * result[0].size();
- }
- feats->Resize(feat_size);
- // todo refactor (SimleGoat)
- for (size_t idx = 0; idx < feat_size; ++idx) {
- (*feats)(idx) = result[idx / dim_][idx % dim_];
- }
- return true;
-}
-
-void LinearSpectrogram::Hanning(vector* data) const {
- CHECK_GE(data->size(), hanning_window_.size());
-
- for (size_t i = 0; i < hanning_window_.size(); ++i) {
- data->at(i) *= hanning_window_[i];
- }
-}
-
-bool LinearSpectrogram::NumpyFft(vector* v,
- vector* real,
- vector* img) const {
- Vector v_tmp;
- v_tmp.Resize(v->size());
- std::memcpy(v_tmp.Data(), v->data(), sizeof(BaseFloat) * (v->size()));
- RealFft(&v_tmp, true);
- v->resize(v_tmp.Dim());
- std::memcpy(v->data(), v_tmp.Data(), sizeof(BaseFloat) * (v->size()));
-
- real->push_back(v->at(0));
- img->push_back(0);
- for (int i = 1; i < v->size() / 2; i++) {
- real->push_back(v->at(2 * i));
- img->push_back(v->at(2 * i + 1));
- }
- real->push_back(v->at(1));
- img->push_back(0);
-
- return true;
-}
-
-// Compute spectrogram feat
-// todo: refactor later (SmileGoat)
-bool LinearSpectrogram::Compute(const vector& waves,
- vector>& feats) {
- int num_samples = waves.size();
- const int& frame_length = opts_.frame_opts.WindowSize();
- const int& sample_rate = opts_.frame_opts.samp_freq;
- const int& frame_shift = opts_.frame_opts.WindowShift();
- const int& fft_points = fft_points_;
- const float scale = hanning_window_energy_ * sample_rate;
-
- if (num_samples < frame_length) {
- return true;
- }
-
- int num_frames = 1 + ((num_samples - frame_length) / frame_shift);
- feats.resize(num_frames);
- vector<BaseFloat> fft_real((fft_points_ / 2 + 1), 0);
- vector<BaseFloat> fft_img((fft_points_ / 2 + 1), 0);
- vector<BaseFloat> v(frame_length, 0);
- vector<BaseFloat> power((fft_points / 2 + 1));
-
- for (int i = 0; i < num_frames; ++i) {
- vector<BaseFloat> data(waves.data() + i * frame_shift,
- waves.data() + i * frame_shift + frame_length);
- Hanning(&data);
- fft_img.clear();
- fft_real.clear();
- v.assign(data.begin(), data.end());
- NumpyFft(&v, &fft_real, &fft_img);
-
- feats[i].resize(fft_points / 2 + 1); // the last dimension is Fs/2 Hz
- for (int j = 0; j < (fft_points / 2 + 1); ++j) {
- power[j] = fft_real[j] * fft_real[j] + fft_img[j] * fft_img[j];
- feats[i][j] = power[j];
-
- if (j == 0 || j == feats[0].size() - 1) {
- feats[i][j] /= scale;
- } else {
- feats[i][j] *= (2.0 / scale);
- }
-
- // log added eps=1e-14
- feats[i][j] = std::log(feats[i][j] + 1e-14);
- }
- }
- return true;
-}
-
-} // namespace ppspeech
\ No newline at end of file
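The deleted Compute() above is the streaming linear spectrogram: frame each chunk, apply a Hanning window, take an FFT, and turn the one-sided power spectrum into log energies scaled by the window energy times the sample rate. A minimal NumPy sketch of the same math follows; the 16 kHz rate and 400/160-sample window setup are assumed defaults here, since the C++ reads them from LinearSpectrogramOptions.

```python
# A sketch only (not part of the diff): NumPy re-statement of the log power
# spectrogram that the deleted Compute() produced. Window size/shift and the
# sample rate are assumed; the C++ takes them from LinearSpectrogramOptions.
import numpy as np


def linear_spectrogram(waves, sample_rate=16000, window_size=400, window_shift=160):
    # Hanning window and its energy, as in the constructor above
    window = 0.5 - 0.5 * np.cos(2 * np.pi * np.arange(window_size) / (window_size - 1))
    scale = np.sum(window * window) * sample_rate
    dim = window_size // 2 + 1                      # fft_points / 2 + 1 bins
    if len(waves) < window_size:                    # mirrors the early return above
        return np.zeros((0, dim), dtype=np.float32)
    num_frames = 1 + (len(waves) - window_size) // window_shift
    feats = np.zeros((num_frames, dim), dtype=np.float32)
    for i in range(num_frames):
        frame = waves[i * window_shift:i * window_shift + window_size] * window
        spec = np.fft.rfft(frame, n=window_size)
        power = spec.real ** 2 + spec.imag ** 2
        power[1:-1] *= 2.0 / scale                  # one-sided bins are doubled ...
        power[0] /= scale                           # ... except DC and Nyquist
        power[-1] /= scale
        feats[i] = np.log(power + 1e-14)            # eps keeps the log finite
    return feats
```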
diff --git a/speechx/speechx/nnet/decodable.cc b/speechx/speechx/nnet/decodable.cc
index b0f82e0c1d00d2e2abfc08fe75c395f72aaea7c5..de15c4f842177e34be2a4678751a9c1185aa0eeb 100644
--- a/speechx/speechx/nnet/decodable.cc
+++ b/speechx/speechx/nnet/decodable.cc
@@ -22,10 +22,11 @@ using std::vector;
using kaldi::Vector;
Decodable::Decodable(const std::shared_ptr<NnetInterface>& nnet,
- const std::shared_ptr<FeatureExtractorInterface>& frontend)
+ const std::shared_ptr<FrontendInterface>& frontend)
: frontend_(frontend), nnet_(nnet), frame_offset_(0), frames_ready_(0) {}
void Decodable::Acceptlikelihood(const Matrix<BaseFloat>& likelihood) {
+ nnet_cache_ = likelihood;
frames_ready_ += likelihood.NumRows();
}
@@ -59,7 +60,7 @@ bool Decodable::EnsureFrameHaveComputed(int32 frame) {
bool Decodable::AdvanceChunk() {
Vector<BaseFloat> features;
- if (frontend_->Read(&features) == false) {
+ if (frontend_ == NULL || frontend_->Read(&features) == false) {
return false;
}
int32 nnet_dim = 0;
@@ -83,10 +84,11 @@ bool Decodable::FrameLogLikelihood(int32 frame, vector<BaseFloat>* likelihood) {
}
void Decodable::Reset() {
- frontend_->Reset();
- nnet_->Reset();
+ if (frontend_ != nullptr) frontend_->Reset();
+ if (nnet_ != nullptr) nnet_->Reset();
frame_offset_ = 0;
frames_ready_ = 0;
+ nnet_cache_.Resize(0, 0);
}
} // namespace ppspeech
\ No newline at end of file
diff --git a/speechx/speechx/nnet/decodable.h b/speechx/speechx/nnet/decodable.h
index 9a480d21ea978ab433bdbf0c3de03ba777789d7e..5b687f3dcb018b5cbae7d8fd9172d862c6044eec 100644
--- a/speechx/speechx/nnet/decodable.h
+++ b/speechx/speechx/nnet/decodable.h
@@ -13,10 +13,10 @@
// limitations under the License.
#include "base/common.h"
-#include "frontend/feature_extractor_interface.h"
+#include "frontend/audio/frontend_itf.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/decoder/decodable-itf.h"
-#include "nnet/nnet_interface.h"
+#include "nnet/nnet_itf.h"
namespace ppspeech {
@@ -24,9 +24,8 @@ struct DecodableOpts;
class Decodable : public kaldi::DecodableInterface {
public:
- explicit Decodable(
- const std::shared_ptr<NnetInterface>& nnet,
- const std::shared_ptr<FeatureExtractorInterface>& frontend);
+ explicit Decodable(const std::shared_ptr<NnetInterface>& nnet,
+ const std::shared_ptr<FrontendInterface>& frontend);
// void Init(DecodableOpts config);
virtual kaldi::BaseFloat LogLikelihood(int32 frame, int32 index);
virtual bool IsLastFrame(int32 frame);
@@ -43,7 +42,7 @@ class Decodable : public kaldi::DecodableInterface {
private:
bool AdvanceChunk();
- std::shared_ptr<FeatureExtractorInterface> frontend_;
+ std::shared_ptr<FrontendInterface> frontend_;
std::shared_ptr<NnetInterface> nnet_;
kaldi::Matrix<kaldi::BaseFloat> nnet_cache_;
// std::vector<std::vector<kaldi::BaseFloat>> nnet_cache_;
diff --git a/speechx/speechx/nnet/nnet_interface.h b/speechx/speechx/nnet/nnet_itf.h
similarity index 100%
rename from speechx/speechx/nnet/nnet_interface.h
rename to speechx/speechx/nnet/nnet_itf.h
diff --git a/speechx/speechx/nnet/paddle_nnet.h b/speechx/speechx/nnet/paddle_nnet.h
index 30fbac9f1b5bd432c2a338bd1edfe163a2803bc5..906994d06c81b753689e5172d7e61fda60b27fbf 100644
--- a/speechx/speechx/nnet/paddle_nnet.h
+++ b/speechx/speechx/nnet/paddle_nnet.h
@@ -15,13 +15,14 @@
#pragma once
-#include "base/common.h"
-#include "nnet/nnet_interface.h"
-#include "paddle_inference_api.h"
#include "kaldi/matrix/kaldi-matrix.h"
#include "kaldi/util/options-itf.h"
+#include "base/common.h"
+#include "nnet/nnet_itf.h"
+#include "paddle_inference_api.h"
+
#include
namespace ppspeech {
diff --git a/tests/unit/cli/test_cli.sh b/tests/unit/cli/test_cli.sh
index 9852b0695e5dabcec48e5510ff19274b81669670..96ab84d65312d4ea7d4974fa86ab85e991108f27 100755
--- a/tests/unit/cli/test_cli.sh
+++ b/tests/unit/cli/test_cli.sh
@@ -21,6 +21,7 @@ paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨
paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
+paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
@@ -42,3 +43,16 @@ paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
paddlespeech stats --task asr
paddlespeech stats --task tts
paddlespeech stats --task cls
+
+# Speaker Verification
+wget -c https://paddlespeech.bj.bcebos.com/vector/audio/85236145389.wav
+paddlespeech vector --task spk --input 85236145389.wav
+
+echo -e "demo1 85236145389.wav \n demo2 85236145389.wav" > vec.job
+paddlespeech vector --task spk --input vec.job
+
+echo -e "demo3 85236145389.wav \n demo4 85236145389.wav" | paddlespeech vector --task spk
+rm 85236145389.wav
+rm vec.job
+
+
diff --git a/tests/unit/vector/conftest.py b/tests/unit/vector/conftest.py
new file mode 100644
index 0000000000000000000000000000000000000000..cc5dccd196fd47e04adbde689dd8980ca92e9224
--- /dev/null
+++ b/tests/unit/vector/conftest.py
@@ -0,0 +1,25 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+def pytest_addoption(parser):
+ parser.addoption("--device", action="store", default="cpu")
+
+
+def pytest_generate_tests(metafunc):
+ # This is called for every test. Only get/set command line arguments
+ # if the argument is specified in the list of test "fixturenames".
+ option_value = metafunc.config.option.device
+ if "device" in metafunc.fixturenames and option_value is not None:
+ metafunc.parametrize("device", [option_value])
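The new conftest.py turns the --device pytest option into a `device` fixture: every test under tests/unit/vector that declares a `device` argument is parametrized with the value given on the command line (default `cpu`). A minimal sketch of a test consuming it, with a hypothetical test name, is:

```python
# A sketch only (not part of the diff): how a test under tests/unit/vector
# consumes the `device` fixture wired up by conftest.py. The test name is
# hypothetical.
import paddle


def test_runs_on_requested_device(device):
    paddle.device.set_device(device)            # "cpu" by default, e.g. "gpu:0" if passed
    x = paddle.ones([2, 16000], dtype="float32")
    assert int(x.sum().item()) == 2 * 16000
```

Run it with, for example, `pytest tests/unit/vector -v --device gpu:0`; without the flag the tests fall back to `cpu`.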
diff --git a/tests/unit/vector/test_augment.py b/tests/unit/vector/test_augment.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ae01da410fff6501a0b6e4df73b1cbe896f6864
--- /dev/null
+++ b/tests/unit/vector/test_augment.py
@@ -0,0 +1,130 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import paddle
+
+
+def test_add_noise(tmpdir, device):
+ paddle.device.set_device(device)
+ from paddlespeech.vector.io.augment import AddNoise
+
+ test_waveform = paddle.sin(
+ paddle.arange(16000.0, dtype="float32")).unsqueeze(0)
+ test_noise = paddle.cos(
+ paddle.arange(16000.0, dtype="float32")).unsqueeze(0)
+ wav_lens = paddle.ones([1], dtype="float32")
+
+ # Edge cases
+ no_noise = AddNoise(mix_prob=0.0)
+ assert no_noise(test_waveform, wav_lens).allclose(test_waveform)
+
+
+def test_speed_perturb(device):
+ paddle.device.set_device(device)
+ from paddlespeech.vector.io.augment import SpeedPerturb
+
+ test_waveform = paddle.sin(
+ paddle.arange(16000.0, dtype="float32")).unsqueeze(0)
+
+ # Edge cases
+ no_perturb = SpeedPerturb(16000, perturb_prob=0.0)
+ assert no_perturb(test_waveform).allclose(test_waveform)
+ no_perturb = SpeedPerturb(16000, speeds=[100])
+ assert no_perturb(test_waveform).allclose(test_waveform)
+
+ # Half speed
+ half_speed = SpeedPerturb(16000, speeds=[50])
+ assert half_speed(test_waveform).allclose(test_waveform[:, ::2], atol=3e-1)
+
+
+def test_babble(device):
+ paddle.device.set_device(device)
+ from paddlespeech.vector.io.augment import AddBabble
+
+ test_waveform = paddle.stack(
+ (paddle.sin(paddle.arange(16000.0, dtype="float32")),
+ paddle.cos(paddle.arange(16000.0, dtype="float32")), ))
+ lengths = paddle.ones([2])
+
+ # Edge cases
+ no_babble = AddBabble(mix_prob=0.0)
+ assert no_babble(test_waveform, lengths).allclose(test_waveform)
+ no_babble = AddBabble(speaker_count=1, snr_low=1000, snr_high=1000)
+ assert no_babble(test_waveform, lengths).allclose(test_waveform)
+
+ # One babbler just averages the two speakers
+ babble = AddBabble(speaker_count=1).to(device)
+ expected = (test_waveform + test_waveform.roll(1, 0)) / 2
+ assert babble(test_waveform, lengths).allclose(expected, atol=1e-4)
+
+
+def test_drop_freq(device):
+ paddle.device.set_device(device)
+ from paddlespeech.vector.io.augment import DropFreq
+
+ test_waveform = paddle.sin(
+ paddle.arange(16000.0, dtype="float32")).unsqueeze(0)
+
+ # Edge cases
+ no_drop = DropFreq(drop_prob=0.0)
+ assert no_drop(test_waveform).allclose(test_waveform)
+ no_drop = DropFreq(drop_count_low=0, drop_count_high=0)
+ assert no_drop(test_waveform).allclose(test_waveform)
+
+ # Check case where frequency range *does not* include signal frequency
+ drop_diff_freq = DropFreq(drop_freq_low=0.5, drop_freq_high=0.9)
+ assert drop_diff_freq(test_waveform).allclose(test_waveform, atol=1e-1)
+
+ # Check case where frequency range *does* include signal frequency
+ drop_same_freq = DropFreq(drop_freq_low=0.28, drop_freq_high=0.28)
+ assert drop_same_freq(test_waveform).allclose(
+ paddle.zeros([1, 16000]), atol=4e-1)
+
+
+def test_drop_chunk(device):
+ paddle.device.set_device(device)
+ from paddlespeech.vector.io.augment import DropChunk
+
+ test_waveform = paddle.sin(
+ paddle.arange(16000.0, dtype="float32")).unsqueeze(0)
+ lengths = paddle.ones([1])
+
+ # Edge cases
+ no_drop = DropChunk(drop_prob=0.0)
+ assert no_drop(test_waveform, lengths).allclose(test_waveform)
+ no_drop = DropChunk(drop_length_low=0, drop_length_high=0)
+ assert no_drop(test_waveform, lengths).allclose(test_waveform)
+ no_drop = DropChunk(drop_count_low=0, drop_count_high=0)
+ assert no_drop(test_waveform, lengths).allclose(test_waveform)
+ no_drop = DropChunk(drop_start=0, drop_end=0)
+ assert no_drop(test_waveform, lengths).allclose(test_waveform)
+
+ # Specify all parameters to ensure it is deterministic
+ dropper = DropChunk(
+ drop_length_low=100,
+ drop_length_high=100,
+ drop_count_low=1,
+ drop_count_high=1,
+ drop_start=100,
+ drop_end=200,
+ noise_factor=0.0, )
+ expected_waveform = test_waveform.clone()
+ expected_waveform[:, 100:200] = 0.0
+
+ assert dropper(test_waveform, lengths).allclose(expected_waveform)
+
+ # Make sure amplitude is similar before and after
+ dropper = DropChunk(noise_factor=1.0)
+ drop_amplitude = dropper(test_waveform, lengths).abs().mean()
+ orig_amplitude = test_waveform.abs().mean()
+ assert drop_amplitude.allclose(orig_amplitude, atol=1e-2)
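The tests above exercise each augmentation in isolation; in practice they are chained on a batch. A rough sketch that composes them, using only constructors and call signatures shown in the tests (the chosen speeds, drop counts, and the 16 kHz rate are illustrative), is:

```python
# A sketch only (not part of the diff): chaining the augmentations exercised
# above on a single batch. Only constructors and call signatures that appear
# in the tests are used; the specific parameter values are illustrative.
import paddle
from paddlespeech.vector.io.augment import DropChunk, DropFreq, SpeedPerturb

paddle.device.set_device("cpu")

waveforms = paddle.sin(paddle.arange(16000.0, dtype="float32")).unsqueeze(0)
lengths = paddle.ones([1], dtype="float32")     # relative lengths, as in the tests

speed = SpeedPerturb(16000, speeds=[90, 100, 110])
drop_freq = DropFreq(drop_count_low=1, drop_count_high=3)
drop_chunk = DropChunk(drop_length_low=100, drop_length_high=1000, noise_factor=0.0)

augmented = speed(waveforms)                    # waveform-only call (test_speed_perturb)
augmented = drop_freq(augmented)                # waveform-only call (test_drop_freq)
augmented = drop_chunk(augmented, lengths)      # (waveform, lengths) call (test_drop_chunk)
print(augmented.shape)
```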
diff --git a/third_party/ctc_decoders/setup.py b/third_party/ctc_decoders/setup.py
index 4a11b890dc7c205164ea543831f552cad1c8cc1a..ce2787e3fa5531b19976c5cd7fa70c56e49caf71 100644
--- a/third_party/ctc_decoders/setup.py
+++ b/third_party/ctc_decoders/setup.py
@@ -127,7 +127,7 @@ decoders_module = [
setup(
name='paddlespeech_ctcdecoders',
- version='0.1.1',
+ version='0.2.0',
description="CTC decoders in paddlespeech",
author="PaddlePaddle Speech and Language Team",
author_email="paddlesl@baidu.com",
diff --git a/utils/DER.py b/utils/DER.py
index d6ab695d8f498dd9aafebe6b43b645cc5de709e3..59bcbec473489e0f7930caed5838c152d3d0f874 100755
--- a/utils/DER.py
+++ b/utils/DER.py
@@ -26,9 +26,9 @@ import argparse
import os
import re
import subprocess
-from distutils.util import strtobool
import numpy as np
+from distutils.util import strtobool
FILE_IDS = re.compile(r"(?<=Speaker Diarization for).+(?=\*\*\*)")
SCORED_SPEAKER_TIME = re.compile(r"(?<=SCORED SPEAKER TIME =)[\d.]+")
diff --git a/utils/addjson.py b/utils/addjson.py
index 013d14727f40d8b95a9d54f1f60a0682e8bf3977..e1be7ab316830ccca699be13372def2a039c0f74 100755
--- a/utils/addjson.py
+++ b/utils/addjson.py
@@ -10,8 +10,8 @@ import codecs
import json
import logging
import sys
-from distutils.util import strtobool
+from distutils.util import strtobool
from espnet.utils.cli_utils import get_commandline_args
is_python2 = sys.version_info[0] == 2
diff --git a/utils/apply-cmvn.py b/utils/apply-cmvn.py
index b92e58f456d13677769e2a9ef07e062d4a2288a6..cf91bdfcda9248e1bd8604d4af1b510044fc05c6 100755
--- a/utils/apply-cmvn.py
+++ b/utils/apply-cmvn.py
@@ -1,10 +1,10 @@
#!/usr/bin/env python3
import argparse
import logging
-from distutils.util import strtobool
import kaldiio
import numpy
+from distutils.util import strtobool
from paddlespeech.s2t.transform.cmvn import CMVN
from paddlespeech.s2t.utils.cli_readers import file_reader_helper
diff --git a/utils/copy-feats.py b/utils/copy-feats.py
index 2e120881468bd97c88fe1c85e668386ebc012c99..dc7a70b45f0ed9a8b8a0a594a3d8b6a17b00b9a3 100755
--- a/utils/copy-feats.py
+++ b/utils/copy-feats.py
@@ -1,6 +1,7 @@
#!/usr/bin/env python3
import argparse
import logging
+
from distutils.util import strtobool
from paddlespeech.s2t.transform.transformation import Transformation
diff --git a/utils/merge_scp2json.py b/utils/merge_scp2json.py
index 650e46698046465e384eae4e21a880a7279802c5..99db6bac8a6b5977f91f908c6a8f7f2395dd8706 100755
--- a/utils/merge_scp2json.py
+++ b/utils/merge_scp2json.py
@@ -5,9 +5,10 @@ import codecs
import json
import logging
import sys
-from distutils.util import strtobool
from io import open
+from distutils.util import strtobool
+
from paddlespeech.s2t.utils.cli_utils import get_commandline_args
PY2 = sys.version_info[0] == 2