...
 
Commits (4)
[s2t] mv dataset into paddlespeech.dataset (#3183)
Hui Zhang <zhtclz@foxmail.com>, 2023-04-21T11:33:17+08:00
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/35d874c5321e16eb57d8d9d77e7cbaec1ff3058d
* mv dataset into paddlespeech.dataset
* add aidatatang
* fix import

Fix some typos. (#3178)
Shuangchi He <34329208+Yulv-git@users.noreply.github.com>, 2023-04-21T13:06:20+08:00
Signed-off-by: Yulv-git <yulvchi@qq.com>
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/8c7859d3bc10f9f01eac4c1ee9099fb0b5e2c50f

[s2t] move s2t data preprocess into paddlespeech.dataset (#3189)
Hui Zhang <zhtclz@foxmail.com>, 2023-04-23T10:47:22+08:00
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/df3be4acae5ac109c839927eef5dd64dfc759e5a
* move s2t data preprocess into paddlespeech.dataset
* avg model, compute wer, format rsl into paddlespeech.dataset
* fix format rsl
* fix avg ckpts

Update pretrained model in README (#3193)
ljhzxc <33015549+ljhzxc@users.noreply.github.com>, 2023-04-23T19:40:06+08:00
https://gitcode.net/paddlepaddle/DeepSpeech/-/commit/84cc5fc98f339250ff11fc509705fa0a9929eb5d
......@@ -27,4 +27,4 @@ git commit -m "xxxxxx, test=doc"
1. Even though CI is skipped, the job still has to wait its turn in the queue before it can be skipped, so don't worry when jobs outside your own area show as pending 🤣
2. Adding `test=xxx` only when running `git commit --amend` may not take effect
3. If a PR contains multiple commits, remember to add `test=xxx` to every commit message, because each commit triggers CI
4. Remove any paddlespeech already installed in the Python environment, otherwise it may affect the import order of `import paddlespeech`
......@@ -191,7 +191,7 @@ def soundfile_save(y: np.ndarray, sr: int, file: os.PathLike) -> None:
if sr <= 0:
raise ParameterError(
f'Sample rate should be larger than 0, recieved sr = {sr}')
f'Sample rate should be larger than 0, received sr = {sr}')
if y.dtype not in ['int16', 'int8']:
warnings.warn(
......
......@@ -18,139 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/aidatatang_200zh",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aidatatang_200_zh_transcript.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'corpus/', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
if not fname.endswith('.wav'):
continue
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, subset)
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'corpus')
for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
for sub in dirlist:
print(f"unpack dir {sub}...")
for folder, _, filelist in sorted(
os.walk(os.path.join(subfolder, sub))):
for ftar in filelist:
unpack(os.path.join(folder, ftar), folder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
subset='aidatatang_200zh')
print("Data download and manifest prepare done!")
from paddlespeech.dataset.aidatatang_200zh import aidatatang_200zh_main
if __name__ == '__main__':
main()
aidatatang_200zh_main()
......@@ -18,143 +18,7 @@ Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'wav', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path=None):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
if manifest_path:
create_manifest(data_dir, manifest_path)
def main():
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix)
prepare_dataset(
url=RESOURCE_URL,
md5sum=MD5_RESOURCE,
target_dir=args.target_dir,
manifest_path=None)
print("Data download and manifest prepare done!")
from paddlespeech.dataset.aishell import aishell_main
if __name__ == '__main__':
main()
aishell_main()
......@@ -28,8 +28,8 @@ from multiprocessing.pool import Pool
import distutils.util
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/12"
#URL_ROOT = "https://openslr.magicdatatech.com/resources/12"
......
......@@ -27,8 +27,8 @@ from multiprocessing.pool import Pool
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
URL_ROOT = "http://openslr.elda.org/resources/31"
URL_TRAIN_CLEAN = URL_ROOT + "/train-clean-5.tar.gz"
......
......@@ -29,8 +29,8 @@ import os
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -29,8 +29,8 @@ import os
import soundfile
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -27,8 +27,8 @@ from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -28,7 +28,7 @@ from pathlib import Path
import soundfile
from utils.utility import unzip
from paddlespeech.dataset.download import unzip
URL_ROOT = ""
MD5_DATA = "45c68037c7fdfe063a43c851f181fb2d"
......
......@@ -31,9 +31,9 @@ from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
# by default, all the data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
......
......@@ -27,9 +27,9 @@ from pathlib import Path
import soundfile
from utils.utility import check_md5sum
from utils.utility import download
from utils.utility import unzip
from paddlespeech.dataset.download import check_md5sum
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unzip
# by default, all the data will be downloaded into the current data/voxceleb directory
DATA_HOME = os.path.expanduser('.')
......
......@@ -28,9 +28,9 @@ import subprocess
import soundfile
from utils.utility import download_multi
from utils.utility import getfile_insensitive
from utils.utility import unpack
from paddlespeech.dataset.download import download_multi
from paddlespeech.dataset.download import getfile_insensitive
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
# Text-to-Speech Java API Demo Usage Guide
This demo implements text-to-speech on Android. It is easy to use and open to extension; for example, you can run your own trained model in the demo.
This document mainly describes how to run the text-to-speech demo.
......
......@@ -6,13 +6,13 @@
--jieba_stop_word_path=./dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--separate_tone=false
--word2phone_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--separate_tone=true
#--word2phone_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
......
......@@ -6,13 +6,13 @@
--jieba_stop_word_path=./front_demo/dict/jieba/stop_words.utf8
# dict conf fastspeech2_0.4
--seperate_tone=false
--separate_tone=false
--word2phone_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
--phone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/phone_id_map.txt
--tone2id_path=./front_demo/dict/fastspeech2_nosil_baker_ckpt_0.4/word2phone_fs2.dict
# dict conf speedyspeech_0.5
#--seperate_tone=true
#--separate_tone=true
#--word2phone_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/word2phone.dict
#--phone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/phone_id_map.txt
#--tone2id_path=./front_demo/dict/speedyspeech_nosil_baker_ckpt_0.5/tone_id_map.txt
......
......@@ -20,7 +20,7 @@
DEFINE_string(sentence, "你好,欢迎使用语音合成服务", "Text to be synthesized");
DEFINE_string(front_conf, "./front_demo/front.conf", "Front conf file");
// DEFINE_string(seperate_tone, "true", "If true, get phoneids and tonesid");
// DEFINE_string(separate_tone, "true", "If true, get phoneids and tonesid");
int main(int argc, char** argv) {
......
......@@ -20,7 +20,7 @@ worddict = "./dict/jieba_part.dict.utf8"
newdict = "./dict/word_phones.dict"
def GenPhones(initials, finals, seperate=True):
def GenPhones(initials, finals, separate=True):
phones = []
for c, v in zip(initials, finals):
......@@ -30,9 +30,9 @@ def GenPhones(initials, finals, seperate=True):
elif c in ['zh', 'ch', 'sh', 'r']:
v = re.sub('i', 'iii', v)
if c:
if seperate is True:
if separate is True:
phones.append(c + '0')
elif seperate is False:
elif separate is False:
phones.append(c)
else:
print("Not sure whether phone and tone need to be separated")
......
......@@ -126,7 +126,7 @@ int FrontEngineInterface::init() {
}
// Generate the tone dictionary (mapping tones to tone ids)
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
if (0 != GenDict(_tone2id_path, &tone_id_map)) {
LOG(ERROR) << "Genarate tone2id dict failed";
return -1;
......@@ -168,7 +168,7 @@ int FrontEngineInterface::ReadConfFile() {
_jieba_stop_word_path = conf_map["jieba_stop_word_path"];
// dict path
_seperate_tone = conf_map["seperate_tone"];
_separate_tone = conf_map["separate_tone"];
_word2phone_path = conf_map["word2phone_path"];
_phone2id_path = conf_map["phone2id_path"];
_tone2id_path = conf_map["tone2id_path"];
......@@ -295,7 +295,7 @@ int FrontEngineInterface::GetWordsIds(
}
}
} else { // punctuation
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
phone = "sp0"; // speedyspeech
} else {
phone = "sp"; // fastspeech2
......@@ -354,7 +354,7 @@ int FrontEngineInterface::Phone2Phoneid(const std::string &phone,
std::string temp_phone;
for (int i = 0; i < phone_vec.size(); i++) {
temp_phone = phone_vec[i];
if (_seperate_tone == "true") {
if (_separate_tone == "true") {
phoneid->push_back(atoi(
(phone_id_map[temp_phone.substr(0, temp_phone.length() - 1)])
.c_str()));
......
......@@ -182,7 +182,7 @@ class FrontEngineInterface : public TextNormalizer {
std::string _jieba_idf_path;
std::string _jieba_stop_word_path;
std::string _seperate_tone;
std::string _separate_tone;
std::string _word2phone_path;
std::string _phone2id_path;
std::string _tone2id_path;
......
......@@ -14,8 +14,8 @@
from audio_search import app
from fastapi.testclient import TestClient
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
client = TestClient(app)
......
......@@ -14,8 +14,8 @@
from fastapi.testclient import TestClient
from vpr_search import app
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
client = TestClient(app)
......
......@@ -23,7 +23,7 @@ Paddle Speech Demo 是一个以 PaddleSpeech 的语音交互功能为主体开
+ ERNIE-SAT: a visualization demo of ERNIE-SAT, a cross-modal (language-speech) large model. It supports personalized synthesis, cross-lingual synthesis (for Chinese audio, input English text to synthesize), and speech editing (modifying words in the middle of an audio clip). For more implementation details of ERNIE-SAT, see:
+ [【ERNIE-SAT with AISHELL-3 dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3/ernie_sat)
+ [【ERNIE-SAT with with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with AISHELL3 and VCTK datasets】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/aishell3_vctk/ernie_sat)
+ [【ERNIE-SAT with VCTK dataset】](https://github.com/PaddlePaddle/PaddleSpeech/tree/develop/examples/vctk/ernie_sat)
What it looks like when running:
......
......@@ -260,7 +260,7 @@ async def websocket_endpoint_online(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
# do something at begining here
# do something at beginning here
# create the instance to process the audio
# connection_handler = chatbot.asr.connection_handler
connection_handler = PaddleASRConnectionHanddler(engine)
......
......@@ -62,7 +62,7 @@
"collapsed": false
},
"source": [
"# 使用Transformer进行端到端语音翻译的基本流程\n",
"# 使用Transformer进行端到端语音翻译的基本流程\n",
"## 基础模型\n",
"由于 ASR 章节已经介绍了 Transformer 以及语音特征抽取,在此便不做过多介绍,感兴趣的同学可以去相关章节进行了解。\n",
"\n",
......
......@@ -464,7 +464,7 @@
"<br><center> FastSpeech2 网络结构图</center></br>\n",
"\n",
"\n",
"PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
"PaddleSpeech TTS 实现的 FastSpeech2 与论文不同的地方在于,我们使用的是 phone 级别的 `pitch` 和 `energy`(与 FastPitch 类似),这样的合成结果可以更加**稳定**。\n",
"<center><img src=\"https://ai-studio-static-online.cdn.bcebos.com/862c21456c784c41a83a308b7d9707f0810cc3b3c6f94ed48c60f5d32d0072f0\"></center>\n",
"<br><center> FastPitch 网络结构图</center></br>\n",
"\n",
......
#!/bin/bash
if [ $# != 3 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1
fi
set -e
stage=0
stop_stage=100
source utils/parse_options.sh || exit 1;
ngpu=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}')
echo "using $ngpu gpus..."
if [ $# != 3 ];then
echo "usage: ${0} config_path decode_config_path ckpt_path_prefix"
exit -1
fi
config_path=$1
decode_config_path=$2
ckpt_prefix=$3
......@@ -92,6 +98,7 @@ if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
fi
if [ ${stage} -le 101 ] && [ ${stop_stage} -ge 101 ]; then
echo "using sclite to compute cer..."
# format the reference test file for sclite
python utils/format_rsl.py \
--origin_ref data/manifest.test.raw \
......
......@@ -96,3 +96,13 @@ CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize.sh ${conf_path} ${train_output_p
```bash
CUDA_VISIBLE_DEVICES=${gpus} ./local/synthesize_e2e.sh ${conf_path} ${train_output_path} ${ckpt_name}
```
## Pretrained Model
The pretrained model can be downloaded here:
- [jets_csmsc_ckpt_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_ckpt_1.5.0.zip)
The static model can be downloaded here:
- [jets_csmsc_static_1.5.0.zip](https://paddlespeech.bj.bcebos.com/Parakeet/jets_csmsc_static_1.5.0.zip)
......@@ -153,7 +153,7 @@ After training the model, we need to get the final model for testing and inferen
```bash
if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
# avg n best model
avg.sh lastest exp/${ckpt}/checkpoints ${avg_num}
avg.sh latest exp/${ckpt}/checkpoints ${avg_num}
fi
```
The `avg.sh` script is in `../../../utils/`, which is defined in `path.sh`.
......
......@@ -48,7 +48,7 @@ def rule(C, V, R, T):
'i' is distinguished when it appears in phonemes, and is separated into 3 categories: 'i', 'ii' and 'iii'.
Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
Erhua may be applied to every final, except for finals that already end with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
......
......@@ -37,7 +37,7 @@ It will support the way of using `--variable value` in the shell scripts.
Some local variables are set in `run.sh`.
`gpus` denotes the GPU number you want to use. If you set `gpus=`, it means you only use CPU.
`stage` denotes the stage you want to start from in the experiments.
`stop stage` denotes the number of stage you want the stop at in the expriments.
`stop stage` denotes the stage you want to stop at in the experiments.
`conf_path` denotes the config path of the model.
`avg_num` denotes the number K of top-K models you want to average to get the final model.
`ckpt` denotes the checkpoint prefix of the model, e.g. "transformer"
......
......@@ -13,3 +13,7 @@
# limitations under the License.
import _locale
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
__version__ = '0.0.0'
__commit__ = '9cf8c1985a98bb380c183116123672976bdfe5c9'
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aidatatang_200zh import main as aidatatang_200zh_main
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare aidatatang_200zh mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://www.openslr.org/resources/62'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/62'
DATA_URL = URL_ROOT + '/aidatatang_200zh.tgz'
MD5_DATA = '6e0f4f39cd5f667a7ee53c397c8d0949'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/aidatatang_200zh",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % manifest_path_prefix)
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aidatatang_200_zh_transcript.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'corpus/', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
if not fname.endswith('.wav'):
continue
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text,
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
print(f"{dtype}:", file=f)
print(f"{total_num} utts", file=f)
print(f"{total_sec / (60*60)} h", file=f)
print(f"{total_text} text", file=f)
print(f"{total_text / total_sec} text/sec", file=f)
print(f"{total_sec / total_num} sec/utt", file=f)
def prepare_dataset(url, md5sum, target_dir, manifest_path, subset):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, subset)
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'corpus')
for subfolder, dirlist, filelist in sorted(os.walk(audio_dir)):
for sub in dirlist:
print(f"unpack dir {sub}...")
for folder, _, filelist in sorted(
os.walk(os.path.join(subfolder, sub))):
for ftar in filelist:
unpack(os.path.join(folder, ftar), folder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
target_dir)
create_manifest(data_dir, manifest_path)
def main():
print_arguments(args, globals())
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
subset='aidatatang_200zh')
print("Data download and manifest prepare done!")
if __name__ == '__main__':
main()
# [Aishell1](http://openslr.elda.org/33/)
This Open Source Mandarin Speech Corpus, AISHELL-ASR0009-OS1, is 178 hours long. It is a part of AISHELL-ASR0009, whose utterances cover 11 domains, including smart home, autonomous driving, and industrial production. All recording took place in a quiet indoor environment, using 3 different devices at the same time: a high-fidelity microphone (44.1kHz, 16-bit), an Android-system mobile phone (16kHz, 16-bit), and an iOS-system mobile phone (16kHz, 16-bit). The high-fidelity audio was re-sampled to 16kHz to build AISHELL-ASR0009-OS1. 400 speakers from different accent areas in China were invited to participate in the recording. Through professional speech annotation and strict quality inspection, the manual transcription accuracy is above 95%. The corpus is divided into training, development and testing sets. (This database is free for academic research; it may not be used commercially without permission.)
## Dataset Architecture
```bash
data_aishell
├── transcript # transcript directory
└── wav # wav directory
├── dev # dev split directory
│ ├── S0724 # speaker directory
│ ├── S0725
│ ├── S0726
├── train
│ ├── S0724
│ ├── S0725
│ ├── S0726
├── test
│ ├── S0724
│ ├── S0725
│ ├── S0726
data_aishell
├── transcript
│ └── aishell_transcript_v0.8.txt # transcript annotation file
└── wav
├── dev
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav # audio of speaker S0724
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
├── test
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
├── train
│ ├── S0724
│ │ ├── BAC009S0724W0121.wav
│ │ ├── BAC009S0724W0122.wav
│ │ ├── BAC009S0724W0123.wav
Transcript file format: <utt> <tokens>
> head data_aishell/transcript/aishell_transcript_v0.8.txt
BAC009S0002W0122 而 对 楼市 成交 抑制 作用 最 大 的 限 购
BAC009S0002W0123 也 成为 地方 政府 的 眼中 钉
BAC009S0002W0124 自 六月 底 呼和浩特 市 率先 宣布 取消 限 购 后
BAC009S0002W0125 各地 政府 便 纷纷 跟进
BAC009S0002W0126 仅 一 个 多 月 的 时间 里
BAC009S0002W0127 除了 北京 上海 广州 深圳 四 个 一 线 城市 和 三亚 之外
BAC009S0002W0128 四十六 个 限 购 城市 当中
BAC009S0002W0129 四十一 个 已 正式 取消 或 变相 放松 了 限 购
BAC009S0002W0130 财政 金融 政策 紧随 其后 而来
BAC009S0002W0131 显示 出 了 极 强 的 威力
```
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from .aishell import check_dataset
from .aishell import create_manifest
from .aishell import download_dataset
from .aishell import main as aishell_main
from .aishell import prepare_dataset
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Prepare Aishell mandarin dataset
Download, unpack and create manifest files.
Manifest file is a json-format file with each line containing the
meta data (i.e. audio filepath, transcript and audio duration)
of each audio file in the data set.
"""
import argparse
import codecs
import json
import os
from pathlib import Path
import soundfile
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
from paddlespeech.utils.argparse import print_arguments
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
URL_ROOT = 'http://openslr.elda.org/resources/33'
# URL_ROOT = 'https://openslr.magicdatatech.com/resources/33'
DATA_URL = URL_ROOT + '/data_aishell.tgz'
MD5_DATA = '2f494334227864a8a8fec932999db9d8'
RESOURCE_URL = URL_ROOT + '/resource_aishell.tgz'
MD5_RESOURCE = '957d480a0fcac85fc18e550756f624e5'
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument(
"--target_dir",
default=DATA_HOME + "/Aishell",
type=str,
help="Directory to save the dataset. (default: %(default)s)")
parser.add_argument(
"--manifest_prefix",
default="manifest",
type=str,
help="Filepath prefix for output manifests. (default: %(default)s)")
args = parser.parse_args()
def create_manifest(data_dir, manifest_path_prefix):
print("Creating manifest %s ..." % os.path.join(data_dir,
manifest_path_prefix))
json_lines = []
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
data_metas = dict()
data_types = ['train', 'dev', 'test']
for dtype in data_types:
del json_lines[:]
total_sec = 0.0
total_text = 0.0
total_num = 0
audio_dir = os.path.join(data_dir, 'wav', dtype)
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
duration = float(len(audio_data) / samplerate)
text = transcript_dict[audio_id]
json_lines.append(
json.dumps(
{
'utt': audio_id,
'utt2spk': str(utt2spk),
'feat': audio_path,
'feat_shape': (duration, ), # second
'text': text
},
ensure_ascii=False))
total_sec += duration
total_text += len(text)
total_num += 1
manifest_path = manifest_path_prefix + '.' + dtype
with codecs.open(manifest_path, 'w', 'utf-8') as fout:
for line in json_lines:
fout.write(line + '\n')
meta = dict()
meta["dtype"] = dtype # train, dev, test
meta["utts"] = total_num
meta["hours"] = total_sec / (60 * 60)
meta["text"] = total_text
meta["text/sec"] = total_text / total_sec
meta["sec/utt"] = total_sec / total_num
data_metas[dtype] = meta
manifest_dir = os.path.dirname(manifest_path_prefix)
meta_path = os.path.join(manifest_dir, dtype) + '.meta'
with open(meta_path, 'w') as f:
for key, val in meta.items():
print(f"{key}: {val}", file=f)
return data_metas
def download_dataset(url, md5sum, target_dir):
"""Download, unpack and create manifest file."""
data_dir = os.path.join(target_dir, 'data_aishell')
if not os.path.exists(data_dir):
filepath = download(url, md5sum, target_dir)
unpack(filepath, target_dir)
# unpack all audio tar files
audio_dir = os.path.join(data_dir, 'wav')
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for ftar in filelist:
unpack(os.path.join(subfolder, ftar), subfolder, True)
else:
print("Skip downloading and unpacking. Data already exists in %s." %
os.path.abspath(target_dir))
return os.path.abspath(data_dir)
def check_dataset(data_dir):
print(f"check dataset {os.path.abspath(data_dir)} ...")
transcript_path = os.path.join(data_dir, 'transcript',
'aishell_transcript_v0.8.txt')
if not os.path.exists(transcript_path):
raise FileNotFoundError(f"no transcript file found in {data_dir}.")
transcript_dict = {}
for line in codecs.open(transcript_path, 'r', 'utf-8'):
line = line.strip()
if line == '':
continue
audio_id, text = line.split(' ', 1)
# remove whitespace; character-level text
text = ''.join(text.split())
transcript_dict[audio_id] = text
no_label = 0
data_types = ['train', 'dev', 'test']
for dtype in data_types:
audio_dir = os.path.join(data_dir, 'wav', dtype)
if not os.path.exists(audio_dir):
raise IOError(f"{audio_dir} does not exist.")
for subfolder, _, filelist in sorted(os.walk(audio_dir)):
for fname in filelist:
audio_path = os.path.abspath(os.path.join(subfolder, fname))
audio_id = os.path.basename(fname)[:-4]
# skip audio that has no transcription
if audio_id not in transcript_dict:
print(f"Warning: {audio_id} not has transcript.")
no_label += 1
continue
utt2spk = Path(audio_path).parent.name
audio_data, samplerate = soundfile.read(audio_path)
assert samplerate == 16000, f"{audio_path} sample rate is {samplerate} not 16k, please check."
print(f"Warning: {dtype} has {no_label} audio does not has transcript.")
def prepare_dataset(url, md5sum, target_dir, manifest_path=None, check=False):
"""Download, unpack and create manifest file."""
data_dir = download_dataset(url, md5sum, target_dir)
if check:
try:
check_dataset(data_dir)
except Exception as e:
raise ValueError(
f"{data_dir} dataset format is not right, please check it.") from e
meta = None
if manifest_path:
meta = create_manifest(data_dir, manifest_path)
return data_dir, meta
def main():
print_arguments(args, globals())
if args.target_dir.startswith('~'):
args.target_dir = os.path.expanduser(args.target_dir)
data_dir, meta = prepare_dataset(
url=DATA_URL,
md5sum=MD5_DATA,
target_dir=args.target_dir,
manifest_path=args.manifest_prefix,
check=True)
resource_dir, _ = prepare_dataset(
url=RESOURCE_URL,
md5sum=MD5_RESOURCE,
target_dir=args.target_dir,
manifest_path=None)
print("Data download and manifest prepare done!")
if __name__ == '__main__':
main()
......@@ -19,91 +19,16 @@ import zipfile
from typing import Text
__all__ = [
"check_md5sum", "getfile_insensitive", "download_multi", "download",
"unpack", "unzip", "md5file", "print_arguments", "add_arguments",
"get_commandline_args"
"check_md5sum",
"getfile_insensitive",
"download_multi",
"download",
"unpack",
"unzip",
"md5file",
]
def get_commandline_args():
extra_chars = [
" ",
";",
"&",
"(",
")",
"|",
"^",
"<",
">",
"?",
"*",
"[",
"]",
"$",
"`",
'"',
"\\",
"!",
"{",
"}",
]
# Escape the extra characters for shell
argv = [
arg.replace("'", "'\\''") if all(char not in arg
for char in extra_chars) else
"'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
]
return sys.executable + " " + " ".join(argv)
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Configuration Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def md5file(fname):
hash_md5 = hashlib.md5()
f = open(fname, "rb")
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# s2t utils binaries.
from .avg_model import main as avg_ckpts_main
from .build_vocab import main as build_vocab_main
from .compute_mean_std import main as compute_mean_std_main
from .compute_wer import main as compute_wer_main
from .format_data import main as format_data_main
from .format_rsl import main as format_rsl_main
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
import numpy as np
import paddle
def define_argparse():
parser = argparse.ArgumentParser(description='average model')
parser.add_argument('--dst_model', required=True, help='averaged model')
parser.add_argument(
'--ckpt_dir', required=True, help='ckpt model dir for average')
parser.add_argument(
'--val_best', action="store_true", help='average the checkpoints with the best validation loss')
parser.add_argument(
'--num', default=5, type=int, help='number of checkpoints to average')
parser.add_argument(
'--min_epoch',
default=0,
type=int,
help='min epoch used for averaging model')
parser.add_argument(
'--max_epoch',
default=65536, # Big enough
type=int,
help='max epoch used for averaging model')
args = parser.parse_args()
return args
def average_checkpoints(dst_model="",
ckpt_dir="",
val_best=True,
num=5,
min_epoch=0,
max_epoch=65536):
paddle.set_device('cpu')
val_scores = []
jsons = glob.glob(f'{ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= min_epoch and epoch <= max_epoch:
val_scores.append((epoch, loss))
assert val_scores, f"No valid checkpoints found in {ckpt_dir}"
val_scores = np.array(val_scores)
if val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
else:
sorted_val_scores = val_scores
best_val_scores = sorted_val_scores[:num, 1]
selected_epochs = sorted_val_scores[:num, 0].astype(np.int64)
avg_val_score = np.mean(best_val_scores)
print("selected val scores = " + str(best_val_scores))
print("selected epochs = " + str(selected_epochs))
print("averaged val score = " + str(avg_val_score))
path_list = [
ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:num, 0]
]
print(path_list)
avg = None
# `num` is a parameter of this function; do not re-read it from a global `args`
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
states = paddle.load(path)
if avg is None:
avg = states
else:
for k in avg.keys():
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:
avg[k] /= num
paddle.save(avg, dst_model)
print(f'Saving to {dst_model}')
meta_path = os.path.splitext(dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if args.val_best else 'latest',
"avg_ckpt": args.dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
"val_losses": beat_val_scores.tolist(),
})
f.write(data + "\n")
def main():
args = define_argparse()
average_checkpoints(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
import argparse
import functools
import os
import tempfile
from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
tokens = text_feature.tokenize(
line_json['text'], replace_space=False)
counter.update(tokens)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
tokens = text_feature.tokenize(text, replace_space=False)
counter.update(tokens)
def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def build_vocab(manifest_paths="",
vocab_path="examples/librispeech/data/vocab.txt",
unit_type="char",
count_threshold=0,
text_keys='text',
spm_mode="unigram",
spm_vocab_size=0,
spm_model_prefix="",
spm_character_coverage=0.9995):
fout = open(vocab_path, 'w', encoding='utf-8')
fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
fout.write(UNK + '\n') # <unk> must be 1
if unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in manifest_paths:
_text_keys = [text_keys] if type(
text_keys) is not list else text_keys
for text_key in _text_keys:
dump_text_manifest(fp, manifest_path, key=text_key)
fp.close()
# train
spm.SentencePieceTrainer.Train(
input=fp.name,
vocab_size=spm_vocab_size,
model_type=spm_mode,
model_prefix=spm_model_prefix,
input_sentence_size=100000000,
character_coverage=spm_character_coverage)
os.unlink(fp.name)
# encode
text_feature = TextFeaturizer(unit_type, "", spm_model_prefix)
counter = Counter()
for manifest_path in manifest_paths:
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
tokens = []
for token, count in count_sorted:
if count < count_threshold:
break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token)
tokens = sorted(tokens)
for token in tokens:
fout.write(token + '\n')
fout.write(SOS + "\n") # <sos/eos>
fout.close()
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('count_threshold', int, 0,
"Truncation threshold for char/word counts.Default 0, no truncate.")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('text_keys', str,
'text',
"keys of the text in manifest for building vocabulary. "
"You can provide multiple k.",
nargs='+')
# bpe
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, bpe, char, word; only needed when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm model prefix, e.g. spm_model_%(spm_mode)_%(count_threshold); only needed when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
build_vocab(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
import argparse
import functools
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def compute_cmvn(manifest_path="data/librispeech/manifest.train",
output_path="data/librispeech/mean_std.npz",
num_samples=2000,
num_workers=0,
spectrum_type="linear",
feat_dim=13,
delta_delta=False,
stride_ms=10,
window_ms=20,
sample_rate=16000,
use_dB_normalization=True,
target_dB=-20):
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
spectrum_type=spectrum_type,
feat_dim=feat_dim,
delta_delta=delta_delta,
stride_ms=float(stride_ms),
window_ms=float(window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=sample_rate,
use_dB_normalization=use_dB_normalization,
target_dB=target_dB,
dither=0.0)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=manifest_path,
featurize_func=augment_and_featurize,
num_samples=num_samples,
num_workers=num_workers)
normalizer.write_to_file(output_path)
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('output_path', str,
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
add_arg('num_samples', int, 2000, "# of samples used for statistics.")
add_arg('num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "window length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
# yapf: disable
args = parser.parse_args()
return args
def main():
args = define_argparse()
print_arguments(args, globals())
compute_cmvn(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def define_argparse():
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('output_path', str, None, "filepath of the formatted manifest.", required=True)
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
# yapf: disable
args = parser.parse_args()
return args
def format_data(
manifest_paths="",
output_path="",
cmvn_path="examples/librispeech/data/mean_std.json",
unit_type="char",
vocab_path="examples/librispeech/data/vocab.txt",
spm_model_prefix=""):
fout = open(output_path, 'w', encoding='utf-8')
# get feat dim
filetype = cmvn_path.split(".")[-1]
mean, istd = load_cmvn(cmvn_path, filetype=filetype)
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(unit_type, vocab_path, spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
# each jsonlines record looks like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in manifest_paths:
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('kaldi feat is not supported yet!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"{manifest_paths} Examples number: {count}")
fout.close()
def main():
args = define_argparse()
print_arguments(args, globals())
format_data(**vars(args))
if __name__ == '__main__':
main()
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Format ref/hyp files into the `utt text` format to compute CER/WER/MER.
norm:
BAC009S0764W0196 明确了发展目标和重点任务
BAC009S0764W0186 实现我国房地产市场的平稳运行
sclite:
加大对结构机械化环境和收集谈控机制力度(BAC009S0906W0240.wav)
河南省新乡市丰秋县刘光镇政府东五零左右(BAC009S0770W0441.wav)
"""
import argparse
import jsonlines
from paddlespeech.utils.argparse import print_arguments
def transform_hyp(origin, trans, trans_sclite):
"""
Args:
origin: The input json file which contains the model output
trans: The output file for calculating CER/WER
trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r+", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["hyps"][0]
if trans:
with open(trans, "w+", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def transform_ref(origin, trans, trans_sclite):
"""
Args:
origin: The input json file which contains the reference text
trans: The output file for calculating CER/WER
trans_sclite: The output file for calculating CER/WER using sclite
"""
input_dict = {}
with open(origin, "r", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["text"]
if trans:
with open(trans, "w", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
print(f"transform_hyp output: {trans}")
if trans_sclite:
with open(trans_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
print(f"transform_hyp output: {trans_sclite}")
def define_argparse():
parser = argparse.ArgumentParser(
prog='format ref/hyp file to compute CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp', type=str, default="", help='origin hyp file')
parser.add_argument(
'--trans_hyp',
type=str,
default="",
help='hyp file for calculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite',
type=str,
default="",
help='hyp file for calculating CER/WER by sclite')
parser.add_argument(
'--origin_ref', type=str, default="", help='origin ref file')
parser.add_argument(
'--trans_ref',
type=str,
default="",
help='ref file for calculating CER/WER')
parser.add_argument(
'--trans_ref_sclite',
type=str,
default="",
help='ref file for calculating CER/WER by sclite')
parser_args = parser.parse_args()
return parser_args
def format_result(origin_hyp="",
trans_hyp="",
trans_hyp_sclite="",
origin_ref="",
trans_ref="",
trans_ref_sclite=""):
if origin_hyp:
transform_hyp(
origin=origin_hyp, trans=trans_hyp, trans_sclite=trans_hyp_sclite)
if origin_ref:
transform_ref(
origin=origin_ref, trans=trans_ref, trans_sclite=trans_ref_sclite)
def main():
args = define_argparse()
print_arguments(args, globals())
format_result(**vars(args))
if __name__ == "__main__":
main()
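A minimal usage sketch of `format_result` defined above; the file names are hypothetical:
# Hypothetical paths. origin_hyp is a jsonlines decode result with entries like
#   {"utt": "BAC009S0764W0196", "hyps": ["..."]}
format_result(
    origin_hyp="exp/decode.rsl",
    trans_hyp="exp/decode.hyp",             # `utt text` format
    trans_hyp_sclite="exp/decode.hyp.trn",  # `text(utt.wav)` format for sclite
    origin_ref="data/manifest.test",
    trans_ref="exp/decode.ref",
    trans_ref_sclite="exp/decode.ref.trn")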
......@@ -267,7 +267,7 @@ def to(x: paddle.Tensor, *args, **kwargs) -> paddle.Tensor:
if not hasattr(paddle.Tensor, 'to'):
logger.debug("register user to to paddle.Tensor, remove this when fixed!")
logger.debug("register user to paddle.Tensor, remove this when fixed!")
setattr(paddle.Tensor, 'to', to)
setattr(paddle.static.Variable, 'to', to)
......
......@@ -28,8 +28,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def init_predictor(args):
......
......@@ -26,8 +26,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.socket_server import AsrRequestHandler
from paddlespeech.s2t.utils.socket_server import AsrTCPServer
from paddlespeech.s2t.utils.socket_server import warm_up_test
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
def start_server(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2ExportTester as ExportTester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -27,8 +27,8 @@ from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils import mp_tools
from paddlespeech.s2t.utils.checkpoint import Checkpoint
from paddlespeech.s2t.utils.log import Log
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.s2t.utils.utility import UpdateConfig
from paddlespeech.utils.argparse import print_arguments
logger = Log(__name__).getlog()
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.deepspeech2.model import DeepSpeech2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Tester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2.model import U2Trainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
# from paddlespeech.s2t.exps.u2.trainer import U2Trainer as Trainer
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
model_test_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Tester",
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.dynamic_import import dynamic_import
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
model_train_alias = {
"u2": "paddlespeech.s2t.exps.u2.model:U2Trainer",
......
......@@ -16,7 +16,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.u2_st.model import U2STTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -18,7 +18,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTester as Tester
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -19,7 +19,7 @@ from yacs.config import CfgNode
from paddlespeech.s2t.exps.wav2vec2.model import Wav2Vec2ASRTrainer as Trainer
from paddlespeech.s2t.training.cli import default_argument_parser
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import print_arguments
def main_sp(config, args):
......
......@@ -45,7 +45,7 @@ class AugmentationPipeline():
samples to make the model invariant to certain types of perturbations in the
real world, improving model's generalization ability.
The pipeline is built according the the augmentation configuration in json
The pipeline is built according to the augmentation configuration in json
string, e.g.
.. code-block::
......
......@@ -48,13 +48,16 @@ class TextFeaturizer():
self.unit_type = unit_type
self.unk = UNK
self.maskctc = maskctc
self.vocab_path_or_list = vocab
if vocab:
if self.vocab_path_or_list:
self.vocab_dict, self._id2token, self.vocab_list, self.unk_id, self.eos_id, self.blank_id = self._load_vocabulary_from_file(
vocab, maskctc)
self.vocab_size = len(self.vocab_list)
else:
logger.warning("TextFeaturizer: not have vocab file or vocab list.")
logger.warning(
"TextFeaturizer: not have vocab file or vocab list. Only Tokenizer can use, can not convert to token idx"
)
if unit_type == 'spm':
spm_model = spm_model_prefix + '.model'
......@@ -62,6 +65,7 @@ class TextFeaturizer():
self.sp.Load(spm_model)
def tokenize(self, text, replace_space=True):
"""tokenizer split text into text tokens"""
if self.unit_type == 'char':
tokens = self.char_tokenize(text, replace_space)
elif self.unit_type == 'word':
......@@ -71,6 +75,7 @@ class TextFeaturizer():
return tokens
def detokenize(self, tokens):
"""tokenizer convert text tokens back to text"""
if self.unit_type == 'char':
text = self.char_detokenize(tokens)
elif self.unit_type == 'word':
......@@ -88,6 +93,7 @@ class TextFeaturizer():
Returns:
List[int]: List of token indices.
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = self.tokenize(text)
ids = []
for token in tokens:
......@@ -107,6 +113,7 @@ class TextFeaturizer():
Returns:
str: Text.
"""
assert self.vocab_path_or_list, "toidx need vocab path or vocab list"
tokens = []
for idx in idxs:
if idx == self.eos_id:
......@@ -127,10 +134,10 @@ class TextFeaturizer():
"""
text = text.strip()
if replace_space:
text_list = [SPACE if item == " " else item for item in list(text)]
tokens = [SPACE if item == " " else item for item in list(text)]
else:
text_list = list(text)
return text_list
tokens = list(text)
return tokens
def char_detokenize(self, tokens):
"""Character detokenizer.
......
......@@ -283,7 +283,7 @@ class DynamicBatchSampler(Sampler):
num_quantiles, )
# get quantiles using lognormal distribution
quantiles = lognorm.ppf(latent_boundaries, 1)
# scale up to to max_batch_length
# scale up to max_batch_length
bucket_boundaries = quantiles * max_batch_length / quantiles[-1]
# compute resulting bucket length multipliers
length_multipliers = [
......
......@@ -560,7 +560,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
[len(hyp[0]) for hyp in hyps], place=device,
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.sos, self.eos, self.ignore_id)
hyps_lens = hyps_lens + 1 # Add <sos> at begining
hyps_lens = hyps_lens + 1 # Add <sos> at beginning
logger.debug(
f"hyps pad: {hyps_pad} {self.sos} {self.eos} {self.ignore_id}")
......@@ -709,7 +709,7 @@ class U2BaseModel(ASRInterface, nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
pad sos at the begining, (B, T)
pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
......
......@@ -455,7 +455,7 @@ class U2STBaseModel(nn.Layer):
hypothesis from ctc prefix beam search and one encoder output
Args:
hyps (paddle.Tensor): hyps from ctc prefix beam search, already
pad sos at the begining, (B, T)
pad sos at the beginning, (B, T)
hyps_lens (paddle.Tensor): length of each hyp in hyps, (B)
encoder_out (paddle.Tensor): corresponding encoder output, (B=1, T, D)
Returns:
......
......@@ -29,10 +29,7 @@ from paddlespeech.s2t.utils.log import Log
logger = Log(__name__).getlog()
__all__ = [
"all_version", "UpdateConfig", "seed_all", 'print_arguments',
'add_arguments', "log_add"
]
__all__ = ["all_version", "UpdateConfig", "seed_all", "log_add"]
def all_version():
......@@ -60,51 +57,6 @@ def seed_all(seed: int=20210329):
paddle.seed(seed)
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
def log_add(args: List[int]) -> float:
"""Stable log add
......
......@@ -609,7 +609,7 @@ class PaddleASRConnectionHanddler:
dtype=paddle.long) # (beam_size,)
hyps_pad, _ = add_sos_eos(hyps_pad, self.model.sos, self.model.eos,
self.model.ignore_id)
hyps_lens = hyps_lens + 1 # Add <sos> at begining
hyps_lens = hyps_lens + 1 # Add <sos> at beginning
# ctc score in ln domain
# (beam_size, max_hyps_len, vocab_size)
......
......@@ -67,7 +67,7 @@ async def websocket_endpoint(websocket: WebSocket):
# and we break the loop
if message['signal'] == 'start':
resp = {"status": "ok", "signal": "server_ready"}
# do something at begining here
# do something at beginning here
# create the instance to process the audio
#connection_handler = PaddleASRConnectionHanddler(asr_model)
connection_handler = asr_model.new_handler()
......
......@@ -45,7 +45,7 @@ def rule(C, V, R, T):
'u' in syllables when certain conditions are satisfied.
'i' is distinguished when appeared in phonemes, and separated into 3 categories, 'i', 'ii' and 'iii'.
Erhua is is possibly applied to every finals, except for finals that already ends with 'r'.
Erhua is possibly applied to every finals, except for finals that already ends with 'r'.
When a syllable is impossible or does not have any characters with this pronunciation, return None
to filter it out.
"""
......
......@@ -236,7 +236,7 @@ class ResidualBlock(nn.Layer):
Returns:
res (Tensor):
A row of the the residual output. shape=(batch_size, channel, 1, width)
A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
......@@ -343,7 +343,7 @@ class ResidualNet(nn.LayerList):
Returns:
res (Tensor):
A row of the the residual output. shape=(batch_size, channel, 1, width)
A row of the residual output. shape=(batch_size, channel, 1, width)
skip (Tensor):
A row of the skip output. shape=(batch_size, channel, 1, width)
......@@ -465,7 +465,7 @@ class Flow(nn.Layer):
self.resnet.start_sequence()
def inverse(self, z, condition):
"""Sampling from the the distrition p(X). It is done by sample form
"""Sampling from the distrition p(X). It is done by sample form
p(Z) and transform the sample. It is a auto regressive transformation.
Args:
......@@ -600,7 +600,7 @@ class WaveFlow(nn.LayerList):
return z, log_det_jacobian
def inverse(self, z, condition):
"""Sampling from the the distrition p(X).
"""Sampling from the distrition p(X).
It is done by sample a ``z`` form p(Z) and transform it into ``x``.
Each Flow transform .. math:: `z_{i-1}` to .. math:: `z_{i}` in an
......
......@@ -110,7 +110,7 @@ class LightweightConvolution(nn.Layer):
(batch, time1, time2) mask
Return:
Tensor: ouput. (batch, time1, d_model)
Tensor: output. (batch, time1, d_model)
"""
# linear -> GLU -> lightconv -> linear
......
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import hashlib
import os
import sys
from typing import Text
import distutils.util
__all__ = ["print_arguments", "add_arguments", "get_commandline_args"]
def get_commandline_args():
extra_chars = [
" ",
";",
"&",
"(",
")",
"|",
"^",
"<",
">",
"?",
"*",
"[",
"]",
"$",
"`",
'"',
"\\",
"!",
"{",
"}",
]
# Escape the extra characters for shell
argv = [
arg.replace("'", "'\\''") if all(char not in arg
for char in extra_chars) else
"'" + arg.replace("'", "'\\''") + "'" for arg in sys.argv
]
return sys.executable + " " + " ".join(argv)
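For illustration, an argument containing a shell metacharacter comes back single-quoted; the argv below is a made-up example:
import sys

sys.argv = ["train.py", "--tag", "exp 1"]  # hypothetical argv
print(get_commandline_args())
# prints something like: /usr/bin/python3 train.py --tag 'exp 1'
# (the space in "exp 1" triggers the quoting branch above)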
def print_arguments(args, info=None):
"""Print argparse's arguments.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
parser.add_argument("name", default="Jonh", type=str, help="User name.")
args = parser.parse_args()
print_arguments(args)
:param args: Input argparse.Namespace for printing.
:type args: argparse.Namespace
"""
filename = ""
if info:
filename = info["__file__"]
filename = os.path.basename(filename)
print(f"----------- {filename} Configuration Arguments -----------")
for arg, value in sorted(vars(args).items()):
print("%s: %s" % (arg, value))
print("-----------------------------------------------------------")
def add_arguments(argname, type, default, help, argparser, **kwargs):
"""Add argparse's argument.
Usage:
.. code-block:: python
parser = argparse.ArgumentParser()
add_argument("name", str, "Jonh", "User name.", parser)
args = parser.parse_args()
"""
type = distutils.util.strtobool if type == bool else type
argparser.add_argument(
"--" + argname,
default=default,
type=type,
help=help + ' Default: %(default)s.',
**kwargs)
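Callers in this changeset bind these helpers with `functools.partial`; a minimal sketch of that pattern:
import argparse
import functools

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
add_arg('unit_type', str, 'char', "Unit type, e.g. char, word, spm")
args = parser.parse_args()
print_arguments(args, globals())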
......@@ -51,7 +51,7 @@ def main(args, config):
# stage0: set the training device, cpu or gpu
paddle.set_device(args.device)
# stage1: we must call the paddle.distributed.init_parallel_env() api at the begining
# stage1: we must call the paddle.distributed.init_parallel_env() api at the beginning
paddle.distributed.init_parallel_env()
nranks = paddle.distributed.get_world_size()
rank = paddle.distributed.get_rank()
......@@ -146,7 +146,7 @@ def main(args, config):
timer.start()
for epoch in range(start_epoch + 1, config.epochs + 1):
# at the begining, model must set to train mode
# at the beginning, model must set to train mode
model.train()
avg_loss = 0
......
......@@ -42,7 +42,7 @@ if __name__ == "__main__":
parser.add_argument(
"--skip_existing",
action="store_true",
help="Whether to skip ouput files with the same name. Useful if this script was interrupted."
help="Whether to skip output files with the same name. Useful if this script was interrupted."
)
parser.add_argument(
"--no_trim",
......
......@@ -2078,7 +2078,7 @@ class SymbolicShapeInference:
output_tensor_ranks = get_attribute(node, 'output_tensor_ranks')
assert output_tensor_ranks
# set the context output seperately.
# set the context output separately.
# The first output is autograd's context.
vi = self.known_vi_[node.output[0]]
vi.CopyFrom(
......
......@@ -76,7 +76,7 @@ bool DecibelNormalizer::Compute(VectorBase<BaseFloat>* waves) const {
if (gain > opts_.max_gain_db) {
LOG(ERROR)
<< "Unable to normalize segment to " << opts_.target_db << "dB,"
<< "because the the probable gain have exceeds opts_.max_gain_db"
<< "because the probable gain has exceeded opts_.max_gain_db"
<< opts_.max_gain_db << "dB.";
return false;
}
......
......@@ -40,7 +40,7 @@ typedef float BaseFloat;
#include <stdint.h>
// for discussion on what to do if you need compile kaldi
// without OpenFST, see the bottom of this this file
// without OpenFST, see the bottom of this file
#ifndef COMPILE_WITHOUT_OPENFST
......
......@@ -746,7 +746,7 @@ OnlinePitchFeatureImpl::OnlinePitchFeatureImpl(
Vector<BaseFloat> lags_offset(lags_);
// lags_offset equals lags_ (which are the log-spaced lag values we want to
// measure the NCCF at) with nccf_first_lag_ / opts.resample_freq subtracted
// from each element, so we can treat the measured NCCF values as as starting
// from each element, so we can treat the measured NCCF values as starting
// from sample zero in a signal that starts at the point start /
// opts.resample_freq. This is necessary because the ArbitraryResample code
// assumes that the input signal starts from sample zero.
......
......@@ -355,12 +355,12 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
//
// /// This function returns the number of words in the longest sentence in a
// /// CompactLattice (i.e. the the maximum of any path, of the count of
// /// CompactLattice (i.e. the maximum of any path, of the count of
// /// olabels on that path).
// int32 LongestSentenceLength(const Lattice &lat);
//
// /// This function returns the number of words in the longest sentence in a
// /// CompactLattice, i.e. the the maximum of any path, of the count of
// /// CompactLattice, i.e. the maximum of any path, of the count of
// /// labels on that path... note, in CompactLattice, the ilabels and olabels
// /// are identical because it is an acceptor.
// int32 LongestSentenceLength(const CompactLattice &lat);
......@@ -408,7 +408,7 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
//
// /// This function computes the mapping from the pair
// /// (frame-index, transition-id) to the pair
// /// (sum-of-acoustic-scores, num-of-occurences) over all occurences of the
// /// (sum-of-acoustic-scores, num-of-occurrences) over all occurrences of the
// /// transition-id in that frame.
// /// frame-index in the lattice.
// /// This function is useful for retaining the acoustic scores in a
......@@ -422,13 +422,13 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// /// @param [out] acoustic_scores
// /// Pointer to a map from the pair (frame-index,
// /// transition-id) to a pair (sum-of-acoustic-scores,
// /// num-of-occurences).
// /// num-of-occurrences).
// /// Usually the acoustic scores for a pdf-id (and hence
// /// transition-id) on a frame will be the same for all the
// /// occurences of the pdf-id in that frame.
// /// occurrences of the pdf-id in that frame.
// /// But if not, we will take the average of the acoustic
// /// scores. Hence, we store both the sum-of-acoustic-scores
// /// and the num-of-occurences of the transition-id in that
// /// and the num-of-occurrences of the transition-id in that
// /// frame.
// void ComputeAcousticScoresMap(
// const Lattice &lat,
......@@ -440,8 +440,8 @@ bool PruneLattice(BaseFloat beam, LatticeType *lat);
// ///
// /// @param [in] acoustic_scores
// /// A map from the pair (frame-index, transition-id) to a
// /// pair (sum-of-acoustic-scores, num-of-occurences) of
// /// the occurences of the transition-id in that frame.
// /// pair (sum-of-acoustic-scores, num-of-occurrences) of
// /// the occurrences of the transition-id in that frame.
// /// See the comments for ComputeAcousticScoresMap for
// /// details.
// /// @param [out] lat Pointer to the output lattice.
......
......@@ -1646,7 +1646,7 @@ SubMatrix<Real>::SubMatrix(const MatrixBase<Real> &M,
static_cast<UnsignedMatrixIndexT>(M.num_rows_ - ro) &&
static_cast<UnsignedMatrixIndexT>(c) <=
static_cast<UnsignedMatrixIndexT>(M.num_cols_ - co));
// point to the begining of window
// point to the beginning of window
MatrixBase<Real>::num_rows_ = r;
MatrixBase<Real>::num_cols_ = c;
MatrixBase<Real>::stride_ = M.Stride();
......
......@@ -998,7 +998,7 @@ void FilterCompressedMatrixRows(const CompressedMatrix &in,
// iterating row-wise versus column-wise in compressed-matrix uncompression.
if (num_kept_rows > heuristic * in.NumRows()) {
// if quite a few of the the rows are kept, it may be more efficient
// if quite a few of the rows are kept, it may be more efficient
// to uncompress the entire compressed matrix, since per-column operation
// is more efficient.
Matrix<BaseFloat> full_mat(in);
......
......@@ -1587,7 +1587,7 @@ template<class Holder> class RandomAccessTableReaderImplBase {
// this from a pipe. In principle we could read it on-demand as for the
// archives, but this would probably be overkill.
// Note: the code for this this class is similar to TableWriterScriptImpl:
// Note: the code for this class is similar to TableWriterScriptImpl:
// try to keep them in sync.
template<class Holder>
class RandomAccessTableReaderScriptImpl:
......
......@@ -105,7 +105,7 @@ paddle_infer::Predictor* PaddleNnet::GetPredictor() {
while (pred_id < pool_usages.size()) {
if (pool_usages[pred_id] == false) {
predictor = pool->Retrive(pred_id);
predictor = pool->Retrieve(pred_id);
break;
}
++pred_id;
......
......@@ -32,14 +32,14 @@ void ConnectionHandler::OnSpeechStart() {
decode_thread_ = std::make_shared<std::thread>(
&ConnectionHandler::DecodeThreadFunc, this);
got_start_tag_ = true;
LOG(INFO) << "Server: Recieved speech start signal, start reading speech";
LOG(INFO) << "Server: Received speech start signal, start reading speech";
json::value rv = {{"status", "ok"}, {"type", "server_ready"}};
ws_.text(true);
ws_.write(asio::buffer(json::serialize(rv)));
}
void ConnectionHandler::OnSpeechEnd() {
LOG(INFO) << "Server: Recieved speech end signal";
LOG(INFO) << "Server: Received speech end signal";
if (recognizer_ != nullptr) {
recognizer_->SetFinished();
}
......@@ -70,8 +70,8 @@ void ConnectionHandler::OnSpeechData(const beast::flat_buffer& buffer) {
pcm_data(i) = static_cast<float>(*pdata);
pdata++;
}
VLOG(2) << "Server: Recieved " << num_samples << " samples";
LOG(INFO) << "Server: Recieved " << num_samples << " samples";
VLOG(2) << "Server: Received " << num_samples << " samples";
LOG(INFO) << "Server: Received " << num_samples << " samples";
CHECK(recognizer_ != nullptr);
recognizer_->Accept(pcm_data);
......
......@@ -26,8 +26,8 @@ from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -25,8 +25,8 @@ from pathlib import Path
import soundfile
from utils.utility import download
from utils.utility import unpack
from paddlespeech.dataset.download import download
from paddlespeech.dataset.download import unpack
DATA_HOME = os.path.expanduser('~/.cache/paddle/dataset/speech')
......
......@@ -166,7 +166,7 @@ variable, sudo might not allow it to propagate to the command that it invokes."
fi
# The install variants, each in a function to simplify error reporting.
# Each one invokes a subshell with a 'set -x' to to show system-modifying
# Each one invokes a subshell with a 'set -x' to show system-modifying
# commands it runs. The subshells simply limit the scope of this diagnostics
# and avoid creating noise (if we were using 'set +x', it would be printed).
Install_redhat () {
......
......@@ -12,105 +12,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import glob
import json
import os
import numpy as np
import paddle
def main(args):
paddle.set_device('cpu')
val_scores = []
beat_val_scores = None
selected_epochs = None
jsons = glob.glob(f'{args.ckpt_dir}/[!train]*.json')
jsons = sorted(jsons, key=os.path.getmtime, reverse=True)
for y in jsons:
with open(y, 'r') as f:
dic_json = json.load(f)
loss = dic_json['val_loss']
epoch = dic_json['epoch']
if epoch >= args.min_epoch and epoch <= args.max_epoch:
val_scores.append((epoch, loss))
val_scores = np.array(val_scores)
if args.val_best:
sort_idx = np.argsort(val_scores[:, 1])
sorted_val_scores = val_scores[sort_idx]
else:
sorted_val_scores = val_scores
beat_val_scores = sorted_val_scores[:args.num, 1]
selected_epochs = sorted_val_scores[:args.num, 0].astype(np.int64)
avg_val_score = np.mean(beat_val_scores)
print("selected val scores = " + str(beat_val_scores))
print("selected epochs = " + str(selected_epochs))
print("averaged val score = " + str(avg_val_score))
path_list = [
args.ckpt_dir + '/{}.pdparams'.format(int(epoch))
for epoch in sorted_val_scores[:args.num, 0]
]
print(path_list)
avg = None
num = args.num
assert num == len(path_list)
for path in path_list:
print(f'Processing {path}')
states = paddle.load(path)
if avg is None:
avg = states
else:
for k in avg.keys():
avg[k] += states[k]
# average
for k in avg.keys():
if avg[k] is not None:
avg[k] /= num
paddle.save(avg, args.dst_model)
print(f'Saving to {args.dst_model}')
meta_path = os.path.splitext(args.dst_model)[0] + '.avg.json'
with open(meta_path, 'w') as f:
data = json.dumps({
"mode": 'val_best' if args.val_best else 'latest',
"avg_ckpt": args.dst_model,
"val_loss_mean": avg_val_score,
"ckpts": path_list,
"epochs": selected_epochs.tolist(),
"val_losses": beat_val_scores.tolist(),
})
f.write(data + "\n")
from paddlespeech.dataset.s2t import avg_ckpts_main
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='average model')
parser.add_argument('--dst_model', required=True, help='averaged model')
parser.add_argument(
'--ckpt_dir', required=True, help='ckpt model dir for average')
parser.add_argument(
'--val_best', action="store_true", help='averaged model')
parser.add_argument(
'--num', default=5, type=int, help='nums for averaged model')
parser.add_argument(
'--min_epoch',
default=0,
type=int,
help='min epoch used for averaging model')
parser.add_argument(
'--max_epoch',
default=65536, # Big enough
type=int,
help='max epoch used for averaging model')
args = parser.parse_args()
print(args)
main(args)
avg_ckpts_main()
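The removed body above is plain element-wise averaging of checkpoint parameters; a standalone sketch of that technique, with hypothetical checkpoint paths:
import paddle

paths = ["exp/ckpt/28.pdparams", "exp/ckpt/29.pdparams", "exp/ckpt/30.pdparams"]
avg = None
for path in paths:
    states = paddle.load(path)
    if avg is None:
        avg = states
    else:
        for k in avg.keys():
            avg[k] += states[k]
for k in avg.keys():
    avg[k] /= len(paths)  # arithmetic mean over checkpoints
paddle.save(avg, "exp/ckpt/avg_3.pdparams")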
......@@ -15,134 +15,7 @@
"""Build vocabulary from manifest files.
Each item in vocabulary file is a character.
"""
import argparse
import functools
import os
import tempfile
from collections import Counter
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import BLANK
from paddlespeech.s2t.frontend.utility import SOS
from paddlespeech.s2t.frontend.utility import SPACE
from paddlespeech.s2t.frontend.utility import UNK
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('count_threshold', int, 0,
"Truncation threshold for char/word counts.Default 0, no truncate.")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath to write the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
add_arg('text_keys', str,
'text',
"keys of the text in manifest for building vocabulary. "
"You can provide multiple k.",
nargs='+')
# bpe
add_arg('spm_vocab_size', int, 0, "Vocab size for spm.")
add_arg('spm_mode', str, 'unigram', "spm model type, e.g. unigram, spm, char, word. only need when `unit_type` is spm")
add_arg('spm_model_prefix', str, "", "spm_model_%(spm_mode)_%(count_threshold), spm model prefix, only need when `unit_type` is spm")
add_arg('spm_character_coverage', float, 0.9995, "character coverage to determine the minimum symbols")
# yapf: disable
args = parser.parse_args()
def count_manifest(counter, text_feature, manifest_path):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json['text'], str):
line = text_feature.tokenize(line_json['text'], replace_space=False)
counter.update(line)
else:
assert isinstance(line_json['text'], list)
for text in line_json['text']:
line = text_feature.tokenize(text, replace_space=False)
counter.update(line)
def dump_text_manifest(fileobj, manifest_path, key='text'):
manifest_jsons = []
with jsonlines.open(manifest_path, 'r') as reader:
for json_data in reader:
manifest_jsons.append(json_data)
for line_json in manifest_jsons:
if isinstance(line_json[key], str):
fileobj.write(line_json[key] + "\n")
else:
assert isinstance(line_json[key], list)
for line in line_json[key]:
fileobj.write(line + "\n")
def main():
print_arguments(args, globals())
fout = open(args.vocab_path, 'w', encoding='utf-8')
fout.write(BLANK + "\n") # 0 will be used for "blank" in CTC
fout.write(UNK + '\n') # <unk> must be 1
if args.unit_type == 'spm':
# tools/spm_train --input=$wave_data/lang_char/input.txt
# --vocab_size=${nbpe} --model_type=${bpemode}
# --model_prefix=${bpemodel} --input_sentence_size=100000000
import sentencepiece as spm
fp = tempfile.NamedTemporaryFile(mode='w', delete=False)
for manifest_path in args.manifest_paths:
text_keys = [args.text_keys] if type(args.text_keys) is not list else args.text_keys
for text_key in text_keys:
dump_text_manifest(fp, manifest_path, key=text_key)
fp.close()
# train
spm.SentencePieceTrainer.Train(
input=fp.name,
vocab_size=args.spm_vocab_size,
model_type=args.spm_mode,
model_prefix=args.spm_model_prefix,
input_sentence_size=100000000,
character_coverage=args.spm_character_coverage)
os.unlink(fp.name)
# encode
text_feature = TextFeaturizer(args.unit_type, "", args.spm_model_prefix)
counter = Counter()
for manifest_path in args.manifest_paths:
count_manifest(counter, text_feature, manifest_path)
count_sorted = sorted(counter.items(), key=lambda x: x[1], reverse=True)
tokens = []
for token, count in count_sorted:
if count < args.count_threshold:
break
# replace space by `<space>`
token = SPACE if token == ' ' else token
tokens.append(token)
tokens = sorted(tokens)
for token in tokens:
fout.write(token + '\n')
fout.write(SOS + "\n") # <sos/eos>
fout.close()
from paddlespeech.dataset.s2t import build_vocab_main
if __name__ == '__main__':
main()
build_vocab_main()
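The relocated vocabulary builder boils down to token counting plus special symbols; a toy sketch of that core, where the literal symbol strings are assumptions standing in for the BLANK/UNK/SOS constants above:
from collections import Counter

counter = Counter()
for text in ["i t s", "i t"]:  # stands in for tokenized manifest text
    counter.update(text.split())
count_threshold = 0  # 0 means no truncation, as in the removed script
tokens = sorted(tok for tok, cnt in counter.items() if cnt >= count_threshold)
vocab = ["<blank>", "<unk>"] + tokens + ["<sos/eos>"]  # blank=0, unk=1, sos/eos last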
This diff is collapsed.
......@@ -13,75 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""Compute mean and std for feature normalizer, and save to file."""
import argparse
import functools
from paddlespeech.s2t.frontend.augmentor.augmentation import AugmentationPipeline
from paddlespeech.s2t.frontend.featurizer.audio_featurizer import AudioFeaturizer
from paddlespeech.s2t.frontend.normalizer import FeatureNormalizer
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('num_samples', int, 2000, "# of samples to for statistics.")
add_arg('spectrum_type', str,
'linear',
"Audio feature type. Options: linear, mfcc, fbank.",
choices=['linear', 'mfcc', 'fbank'])
add_arg('feat_dim', int, 13, "Audio feature dim.")
add_arg('delta_delta', bool, False, "Audio feature with delta delta.")
add_arg('stride_ms', int, 10, "stride length in ms.")
add_arg('window_ms', int, 20, "stride length in ms.")
add_arg('sample_rate', int, 16000, "target sample rate.")
add_arg('use_dB_normalization', bool, True, "do dB normalization.")
add_arg('target_dB', int, -20, "target dB.")
add_arg('manifest_path', str,
'data/librispeech/manifest.train',
"Filepath of manifest to compute normalizer's mean and stddev.")
add_arg('num_workers',
default=0,
type=int,
help='num of subprocess workers for processing')
add_arg('output_path', str,
'data/librispeech/mean_std.npz',
"Filepath of write mean and stddev to (.npz).")
# yapf: disable
args = parser.parse_args()
def main():
print_arguments(args, globals())
augmentation_pipeline = AugmentationPipeline('{}')
audio_featurizer = AudioFeaturizer(
spectrum_type=args.spectrum_type,
feat_dim=args.feat_dim,
delta_delta=args.delta_delta,
stride_ms=float(args.stride_ms),
window_ms=float(args.window_ms),
n_fft=None,
max_freq=None,
target_sample_rate=args.sample_rate,
use_dB_normalization=args.use_dB_normalization,
target_dB=args.target_dB,
dither=0.0)
def augment_and_featurize(audio_segment):
augmentation_pipeline.transform_audio(audio_segment)
return audio_featurizer.featurize(audio_segment)
normalizer = FeatureNormalizer(
mean_std_filepath=None,
manifest_path=args.manifest_path,
featurize_func=augment_and_featurize,
num_samples=args.num_samples,
num_workers=args.num_workers)
normalizer.write_to_file(args.output_path)
from paddlespeech.dataset.s2t import compute_mean_std_main
if __name__ == '__main__':
main()
compute_mean_std_main()
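What the relocated script computes is a per-dimension feature mean and standard deviation over sampled utterances; a NumPy-only toy sketch, with made-up shapes and values:
import numpy as np

feats = [np.random.randn(100, 13), np.random.randn(120, 13)]  # toy (T, D) features
stacked = np.concatenate(feats, axis=0)  # (sum of T, D)
mean = stacked.mean(axis=0)              # (D,)
std = stacked.std(axis=0)                # (D,)
np.savez("data/librispeech/mean_std.npz", mean=mean, std=std)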
......@@ -13,130 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
"""format manifest with more metadata."""
import argparse
import functools
import json
import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('cmvn_path', str,
'examples/librispeech/data/mean_std.json',
"Filepath of cmvn.")
add_arg('unit_type', str, "char", "Unit type, e.g. char, word, spm")
add_arg('vocab_path', str,
'examples/librispeech/data/vocab.txt',
"Filepath of the vocabulary.")
add_arg('manifest_paths', str,
None,
"Filepaths of manifests for building vocabulary. "
"You can provide multiple manifest files.",
nargs='+',
required=True)
# bpe
add_arg('spm_model_prefix', str, None,
"spm model prefix, spm_model_%(bpe_mode)_%(count_threshold), only need when `unit_type` is spm")
add_arg('output_path', str, None, "filepath of formated manifest.", required=True)
# yapf: disable
args = parser.parse_args()
def main():
print_arguments(args, globals())
fout = open(args.output_path, 'w', encoding='utf-8')
# get feat dim
filetype = args.cmvn_path.split(".")[-1]
mean, istd = load_cmvn(args.cmvn_path, filetype=filetype)
feat_dim = mean.shape[0] #(D)
print(f"Feature dim: {feat_dim}")
text_feature = TextFeaturizer(args.unit_type, args.vocab_path, args.spm_model_prefix)
vocab_size = text_feature.vocab_size
print(f"Vocab size: {vocab_size}")
# josnline like this
# {
# "input": [{"name": "input1", "shape": (100, 83), "feat": "xxx.ark:123"}],
# "output": [{"name":"target1", "shape": (40, 5002), "text": "a b c de"}],
# "utt2spk": "111-2222",
# "utt": "111-2222-333"
# }
count = 0
for manifest_path in args.manifest_paths:
with jsonlines.open(str(manifest_path), 'r') as reader:
manifest_jsons = list(reader)
for line_json in manifest_jsons:
output_json = {
"input": [],
"output": [],
'utt': line_json['utt'],
'utt2spk': line_json.get('utt2spk', 'global'),
}
# output
line = line_json['text']
if isinstance(line, str):
# only one target
tokens = text_feature.tokenize(line)
tokenids = text_feature.featurize(line)
output_json['output'].append({
'name': 'target1',
'shape': (len(tokenids), vocab_size),
'text': line,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
else:
# isinstance(line, list), multi target in one vocab
for i, item in enumerate(line, 1):
tokens = text_feature.tokenize(item)
tokenids = text_feature.featurize(item)
output_json['output'].append({
'name': f'target{i}',
'shape': (len(tokenids), vocab_size),
'text': item,
'token': ' '.join(tokens),
'tokenid': ' '.join(map(str, tokenids)),
})
# input
line = line_json['feat']
if isinstance(line, str):
# only one input
feat_shape = line_json['feat_shape']
assert isinstance(feat_shape, (list, tuple)), type(feat_shape)
filetype = feat_type(line)
if filetype == 'sound':
feat_shape.append(feat_dim)
else: # kaldi
raise NotImplementedError('no support kaldi feat now!')
output_json['input'].append({
"name": "input1",
"shape": feat_shape,
"feat": line,
"filetype": filetype,
})
else:
# isinstance(line, list), multi input
raise NotImplementedError("not support multi input now!")
fout.write(json.dumps(output_json) + '\n')
count += 1
print(f"{args.manifest_paths} Examples number: {count}")
fout.close()
from paddlespeech.dataset.s2t import format_data_main
if __name__ == '__main__':
main()
format_data_main()
......@@ -11,96 +11,7 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
from paddlespeech.dataset.s2t import format_rsl_main
import jsonlines
def trans_hyp(origin_hyp, trans_hyp=None, trans_hyp_sclite=None):
"""
Args:
origin_hyp: The input json file which contains the model output
trans_hyp: The output file for caculate CER/WER
trans_hyp_sclite: The output file for caculate CER/WER using sclite
"""
input_dict = {}
with open(origin_hyp, "r+", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["hyps"][0]
if trans_hyp is not None:
with open(trans_hyp, "w+", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
if trans_hyp_sclite is not None:
with open(trans_hyp_sclite, "w+") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
def trans_ref(origin_ref, trans_ref=None, trans_ref_sclite=None):
"""
Args:
origin_hyp: The input json file which contains the model output
trans_hyp: The output file for caculate CER/WER
trans_hyp_sclite: The output file for caculate CER/WER using sclite
"""
input_dict = {}
with open(origin_ref, "r", encoding="utf8") as f:
for item in jsonlines.Reader(f):
input_dict[item["utt"]] = item["text"]
if trans_ref is not None:
with open(trans_ref, "w", encoding="utf8") as f:
for key in input_dict.keys():
f.write(key + " " + input_dict[key] + "\n")
if trans_ref_sclite is not None:
with open(trans_ref_sclite, "w") as f:
for key in input_dict.keys():
line = input_dict[key] + "(" + key + ".wav" + ")" + "\n"
f.write(line)
if __name__ == "__main__":
parser = argparse.ArgumentParser(
prog='format hyp file for compute CER/WER', add_help=True)
parser.add_argument(
'--origin_hyp', type=str, default=None, help='origin hyp file')
parser.add_argument(
'--trans_hyp',
type=str,
default=None,
help='hyp file for caculating CER/WER')
parser.add_argument(
'--trans_hyp_sclite',
type=str,
default=None,
help='hyp file for caculating CER/WER by sclite')
parser.add_argument(
'--origin_ref', type=str, default=None, help='origin ref file')
parser.add_argument(
'--trans_ref',
type=str,
default=None,
help='ref file for caculating CER/WER')
parser.add_argument(
'--trans_ref_sclite',
type=str,
default=None,
help='ref file for caculating CER/WER by sclite')
parser_args = parser.parse_args()
if parser_args.origin_hyp is not None:
trans_hyp(
origin_hyp=parser_args.origin_hyp,
trans_hyp=parser_args.trans_hyp,
trans_hyp_sclite=parser_args.trans_hyp_sclite, )
if parser_args.origin_ref is not None:
trans_ref(
origin_ref=parser_args.origin_ref,
trans_ref=parser_args.trans_ref,
trans_ref_sclite=parser_args.trans_ref_sclite, )
if __name__ == '__main__':
format_rsl_main()
......@@ -22,8 +22,8 @@ import jsonlines
from paddlespeech.s2t.frontend.featurizer.text_featurizer import TextFeaturizer
from paddlespeech.s2t.frontend.utility import load_cmvn
from paddlespeech.s2t.io.utility import feat_type
from paddlespeech.s2t.utils.utility import add_arguments
from paddlespeech.s2t.utils.utility import print_arguments
from paddlespeech.utils.argparse import add_arguments
from paddlespeech.utils.argparse import print_arguments
parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
......
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.