提交 0f78d25f 编写于 作者: X xiongxinlei

add vector cli batch and pipeline test demo, test=doc

上级 305bacdc
......@@ -28,91 +28,6 @@ from paddlespeech.vector.training.seeding import seed_everything
logger = Log(__name__).getlog()
class VectorWrapper:
    """Extract an embedding from an audio file with a pretrained model.

    The wrapper loads the yaml config and the model parameters once at
    construction time; every call to ``extract_audio_embedding`` then turns
    a single audio file into a single embedding.
    """

    def __init__(self,
                 device,
                 config_path,
                 model_path,):
        """Configure the device, read the config and load the model.

        Args:
            device (str): device name handed to ``paddle.device.set_device``
            config_path (str): yaml config path
            model_path (str): directory that contains ``model.pdparams``
        """
        super(VectorWrapper, self).__init__()
        # stage 0: remember the construction arguments
        self.device = device
        self.config_path = config_path
        self.model_path = model_path

        # stage 1: set the run host device
        paddle.device.set_device(device)

        # stage 2: read the yaml config and set the seed factor
        self.read_yaml_config(self.config_path)
        seed_everything(self.config.seed)

        # stage 3: init the speaker verification model
        self.init_vector_model(self.config, self.model_path)

    def read_yaml_config(self, config_path):
        """Read the yaml config and store it, frozen, in ``self.config``.

        When ``config_path`` is empty or None no file is merged and the
        stored config stays empty.

        Args:
            config_path (str): yaml config path
        """
        config = CfgNode(new_allowed=True)

        if config_path:
            config.merge_from_file(config_path)

        config.freeze()
        self.config = config

    def init_vector_model(self, config, model_path):
        """Build the model from the config and load its parameters.

        The model is switched to eval mode and stored in ``self.model``.

        Args:
            config (CfgNode): yaml config
            model_path (str): pretrained model directory; the parameters
                are read from the file ``model.pdparams`` inside it
        """
        # get the backbone network instance
        ecapa_tdnn = EcapaTdnn(**config.model)

        # get the sid instance
        model = SpeakerIdetification(
            backbone=ecapa_tdnn, num_class=config.num_speakers)

        # read the model parameters into the sid model
        model_path = os.path.abspath(os.path.expanduser(model_path))
        state_dict = paddle.load(os.path.join(model_path, "model.pdparams"))
        model.set_state_dict(state_dict)
        model.eval()

        self.model = model

    def extract_audio_embedding(self, audio_path):
        """Extract the embedding of one audio file.

        Args:
            audio_path (str): path of the audio to extract the embedding from

        Returns:
            embedding (numpy.array): audio embedding
        """
        waveform, sr = load_audio(audio_path)
        # The features below are computed with the configured sample rate.
        # NOTE(review): assumes `sr` is the sample rate of the loaded audio;
        # a mismatch is only reported, the extraction still goes on.
        if sr != self.config.sr:
            logger.warning(
                f"audio sample rate {sr} differs from the config sample rate {self.config.sr}"
            )

        feat = melspectrogram(
            x=waveform,
            sr=self.config.sr,
            n_mels=self.config.n_mels,
            window_size=self.config.window_size,
            hop_length=self.config.hop_size)

        # convert the audio feat to batch shape, i.e. batch_size is one
        feat = paddle.to_tensor(feat).unsqueeze(0)

        # in inference period, the lengths is all one without padding
        lengths = paddle.ones([1])
        feat = feature_normalize(feat, mean_norm=True, std_norm=False)

        # inference only: no gradient bookkeeping is needed for the forward
        with paddle.no_grad():
            embedding = self.model.backbone(feat, lengths)

        # per the original note: (1, emb_size, 1) -> (emb_size)
        return embedding.squeeze().numpy()
def extract_audio_embedding(args, config):
# stage 0: set the training device, cpu or gpu
......@@ -168,7 +83,7 @@ def extract_audio_embedding(args, config):
# stage 5: do global norm with external mean and std
rtf = elapsed_time / audio_length
logger.info(f"{args.device} rft={rtf}")
paddle.save(embedding, "emb1")
return embedding
......@@ -177,7 +92,7 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser(__doc__)
parser.add_argument('--device',
choices=['cpu', 'gpu'],
default="gpu",
default="cpu",
help="Select which device to train model, defaults to gpu.")
parser.add_argument("--config",
default=None,
......@@ -202,10 +117,3 @@ if __name__ == "__main__":
print(config)
extract_audio_embedding(args, config)
# use the VectorWrapper to extract the audio embedding
vector_inst = VectorWrapper(device="gpu",
config_path=args.config,
model_path=args.load_checkpoint)
embedding = vector_inst.extract_audio_embedding(args.audio_path)
#!/bin/bash
# Demo of the `paddlespeech` command-line tasks; the sample audio files are
# downloaded on demand with wget (-c resumes a partially downloaded file).
# Abort the whole script as soon as one command fails.
set -e
# Audio classification: download two sample clips and classify cat.wav
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
paddlespeech cls --input ./cat.wav --topk 10
# Punctuation restoration: the input sentence carries no punctuation
paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
# Speech recognition: zh.wav without an explicit model, en.wav with an
# explicitly selected model and language
wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
paddlespeech asr --input ./zh.wav
paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
# Text to speech: the same sentences with different --am / --voc combinations;
# some calls additionally pass --spk_id, the English ones pass --lang en
paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
# Speech translation (only supported on Linux)
paddlespeech st --input ./en.wav
# Batch process: two lines are fed to tts through stdin instead of --input
# (each line presumably is "<id> <sentence>" - confirm against the CLI docs)
echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
# Shell pipeline: the asr output is piped into the punctuation task
paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
# Stats: run the stats command once per task (asr, tts, cls)
paddlespeech stats --task asr
paddlespeech stats --task tts
paddlespeech stats --task cls
# # Audio classification
# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/cat.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/dog.wav
# paddlespeech cls --input ./cat.wav --topk 10
# # Punctuation_restoration
# paddlespeech text --input 今天的天气真不错啊你下午有空吗我想约你一起去吃饭
# # Speech_recognition
# wget -c https://paddlespeech.bj.bcebos.com/PaddleAudio/zh.wav https://paddlespeech.bj.bcebos.com/PaddleAudio/en.wav
# paddlespeech asr --input ./zh.wav
# paddlespeech asr --model transformer_librispeech --lang en --input ./en.wav
# # Text To Speech
# paddlespeech tts --input "你好,欢迎使用百度飞桨深度学习框架!"
# paddlespeech tts --am speedyspeech_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
# paddlespeech tts --voc mb_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
# paddlespeech tts --voc style_melgan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
# paddlespeech tts --voc hifigan_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
# paddlespeech tts --am fastspeech2_aishell3 --voc pwgan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
# paddlespeech tts --am fastspeech2_aishell3 --voc hifigan_aishell3 --input "你好,欢迎使用百度飞桨深度学习框架!" --spk_id 0
# paddlespeech tts --am fastspeech2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
# paddlespeech tts --am fastspeech2_ljspeech --voc hifigan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
# paddlespeech tts --am fastspeech2_vctk --voc pwgan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
# paddlespeech tts --am fastspeech2_vctk --voc hifigan_vctk --input "Life was like a box of chocolates, you never know what you're gonna get." --lang en --spk_id 0
# paddlespeech tts --am tacotron2_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
# paddlespeech tts --am tacotron2_csmsc --voc wavernn_csmsc --input "你好,欢迎使用百度飞桨深度学习框架!"
# paddlespeech tts --am tacotron2_ljspeech --voc pwgan_ljspeech --lang en --input "Life was like a box of chocolates, you never know what you're gonna get."
# # Speech Translation (only support linux)
# paddlespeech st --input ./en.wav
# # batch process
# echo -e "1 欢迎光临。\n2 谢谢惠顾。" | paddlespeech tts
# # shell pipeline
# paddlespeech asr --input ./zh.wav | paddlespeech text --task punc
# # stats
# paddlespeech stats --task asr
# paddlespeech stats --task tts
# paddlespeech stats --task cls
# Speaker verification: run `paddlespeech vector --task spk` on one sample
# clip in three ways - direct file, job file, and stdin pipeline.
sample_wav=85236145389.wav
job_list=vec.job
wget -c "https://paddlespeech.bj.bcebos.com/vector/audio/${sample_wav}"
# 1) the audio file itself is given to --input
paddlespeech vector --task spk --input "${sample_wav}"
# 2) a job file holding one "demo <audio path>" line is given to --input
echo "demo ${sample_wav}" > "${job_list}"
paddlespeech vector --task spk --input "${job_list}"
# 3) the same job line is piped in through stdin
echo "demo ${sample_wav}" | paddlespeech vector --task spk
# remove the downloaded clip and the generated job file
rm "${sample_wav}"
rm "${job_list}"
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册