From 47110429f224b284cd4fed93b2cfdd4b71b9ddae Mon Sep 17 00:00:00 2001
From: KP <109694228@qq.com>
Date: Sat, 8 May 2021 14:26:09 +0800
Subject: [PATCH] Update audio tagging example (#5305)

* Update audio tagging example

* Update audio tagging example
---
 PaddleAudio/examples/audio_tagging/README.md  | 198 +++++++++---------
 .../examples/audio_tagging/audio_tag.py       |  83 +++-----
 .../examples/audio_tagging/parse_result.py    |  25 +--
 .../examples/sound_classification/README.md   |   1 +
 PaddleAudio/paddleaudio/datasets/__init__.py  |   2 +
 PaddleAudio/paddleaudio/datasets/tess.py      |  23 +-
 6 files changed, 163 insertions(+), 169 deletions(-)

diff --git a/PaddleAudio/examples/audio_tagging/README.md b/PaddleAudio/examples/audio_tagging/README.md
index 3d2fb6aa..243ebf8e 100644
--- a/PaddleAudio/examples/audio_tagging/README.md
+++ b/PaddleAudio/examples/audio_tagging/README.md
@@ -1,136 +1,128 @@
-# Audioset Tagging Example
+# Audio Tagging
 
-This example uses a PANNs pretrained model to tag input audio in real time, and finally outputs, as text, the top-k classes and their scores at each moment.
+Sound classification is a single-label classification task, but a single piece of audio can carry multiple labels. For example, a recording made in an ordinary office may contain people talking, keyboard strokes, mouse clicks, and other background sounds of the room. For general sound recognition and sound detection scenarios, predicting multiple labels for one clip is highly practical.
 
-For details of the PANNs pretrained models, please refer to the paper [PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf).
+At IEEE ICASSP 2017, Google released the large-scale audio dataset [Audioset](https://research.google.com/audioset/), which covers 632 audio classes and 2,084,320 human-labeled 10-second sound clips drawn from YouTube videos. The dataset now contains about 2.1 million annotated videos and 5,800 hours of audio, and the labeled sound samples span 527 classes.
 
-## Usage
+`PANNs` ([PANNs: Large-Scale Pretrained Audio Neural Networks for Audio Pattern Recognition](https://arxiv.org/pdf/1912.10211.pdf)) are sound classification/recognition models trained on Audioset. Since they are pretrained on a multi-label sound recognition task, they can be used for real-time audio tagging.
 
-```python
-python audio_tag.py \
-    --wav ./cat_meow.wav \
-    --sr 32000 \
-    --sample_duration 2 \
-    --hop_duration 0.3 \
-    --checkpoint ./assets/cnn14.pdparams \
-    --use_gpu True \
-    --output_dir ./output_dir
-```
+This example uses a `PANNs` pretrained model to tag input audio in real time against the Audioset label set, and finally outputs, as text, the top-k classes and their scores at each moment.
+
+
+## Model Introduction
+
+PaddleAudio provides the CNN14, CNN10 and CNN6 pretrained PANNs models to choose from:
+- CNN14: 12 convolutional layers and 2 fully connected layers, 79.6M parameters, embedding dimension 2048.
+- CNN10: 8 convolutional layers and 2 fully connected layers, 4.9M parameters, embedding dimension 512.
+- CNN6: 4 convolutional layers and 2 fully connected layers, 4.5M parameters, embedding dimension 512.
 
-Argument usage:
+
+## Quick Start
+
+### Model Prediction
+
+```shell
+export CUDA_VISIBLE_DEVICES=0
+python audio_tag.py --device gpu --wav ./cat.wav --sample_duration 2 --hop_duration 0.3 --output_dir ./output_dir
 ```
---wav                # audio file path
---sr                 # sample rate
---sample_duration    # length of each tagging segment, in seconds
---hop_duration       # hop between tagging segments, in seconds
---checkpoint         # pretrained model parameters
---use_gpu            # use GPU acceleration
---output_dir         # output path
+
+Configurable arguments:
+
+- `device`: Device to run prediction on, either cpu or gpu; defaults to gpu. When using a GPU, select the card with `CUDA_VISIBLE_DEVICES` as shown above.
+- `wav`: Audio file to predict on.
+- `sample_duration`: Length of audio, in seconds, the model predicts on at a time; defaults to 2s.
+- `hop_duration`: Interval, in seconds, between two consecutive prediction windows; defaults to 0.3s.
+- `output_dir`: Directory where prediction results are saved; defaults to `./output_dir`.
+
+The example code uses the `CNN14` pretrained model. To switch to another pretrained model, instantiate it as follows:
+```python
+from paddleaudio.models.panns import cnn14, cnn10, cnn6
+
+# CNN14
+model = cnn14(pretrained=True, extract_embedding=False)
+# CNN10
+model = cnn10(pretrained=True, extract_embedding=False)
+# CNN6
+model = cnn6(pretrained=True, extract_embedding=False)
 ```
 
 Execution result:
 ```
-[2021-04-06 21:10:36,438] [ INFO] - Loaded CNN14 pretrained parameters from: ./assets/cnn14.pdparams
-[2021-04-06 21:10:38,193] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_32000.npz
+[2021-04-30 19:15:41,025] [ INFO] - Saved tagging results to ./output_dir/audioset_tagging_sr_44100.npz
 ```
 
 After execution, the scores are saved in a `.npz` file under `output_dir`.
 
-## Output
-```python
-python parse_result.py \
-    --input_file ./output_dir/audioset_tagging_sr_32000.npz \
-    --topk 10 \
-    --smooth True \
-    --smooth_size 5 \
-    --output_dir ./output_dir
+### Generating the Tagging Label Text
+```shell
+python parse_result.py --tagging_file ./output_dir/audioset_tagging_sr_44100.npz --top_k 10 --smooth True --smooth_size 5 --label_file ./assets/audioset_labels.txt --output_dir ./output_dir
 ```
-Argument usage:
-```
---input_file     # tagging score file
---topk           # show top-k results
---smooth         # smooth scores across frames
---smooth_size    # smoothing window size
---output_dir     # output path
-```
+Configurable arguments:
+
+- `tagging_file`: File of model prediction results.
+- `top_k`: Number of highest-scoring labels to keep from the predictions; defaults to 10.
+- `smooth`: Posterior smoothing of the predictions; defaults to True, i.e. smoothing is applied.
+- `smooth_size`: Number of samples in the smoothing window; defaults to 5.
+- `label_file`: Text file of the Audioset class names that the predictions map to.
+- `output_dir`: Directory where the label text is saved; defaults to `./output_dir`.
 
 Execution result:
 ```
-[2021-04-06 21:22:00,696] [ INFO] - Posterior smoothing...
-[2021-04-06 21:22:00,699] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_32000.txt
+[2021-04-30 19:26:58,743] [ INFO] - Posterior smoothing...
+[2021-04-30 19:26:58,746] [ INFO] - Saved tagging labels to ./output_dir/audioset_tagging_sr_44100.txt
 ```
 
 After execution, the text result is saved in a `.txt` file under `output_dir`.
 
-## Labels
+## Tagging Label Text
 
 The final text output looks like the following.
-The top-k results of different tagging windows are separated by blank lines. In each result, the first line is the time information: the number is the starting sample index of that tagging segment; the next k lines are the corresponding labels and scores.
+The top-k results for each time window are separated by blank lines. In each result, the first line is the time information: the number marks the start of the tagging window, expressed as the ratio of the current time `t` to the total audio length `T`; the next k lines are the corresponding labels and scores.
 
 ```
-0
-Cat: 0.80844646692276
-Animal: 0.6848719716072083
-Meow: 0.6470851898193359
-Domestic animals, pets: 0.6392854452133179
-Inside, small room: 0.05361200496554375
-Purr: 0.02675800956785679
-Music: 0.021260583773255348
-Speech: 0.0209784135222435
-Caterwaul: 0.019929537549614906
-Outside, urban or manmade: 0.010916451923549175
-
-9600
-Cat: 0.7778594493865967
-Meow: 0.6465566158294678
-Animal: 0.6342337131500244
-Domestic animals, pets: 0.5945377349853516
-Inside, small room: 0.04747435823082924
-Purr: 0.027785276994109154
-Music: 0.022447215393185616
-Caterwaul: 0.020785318687558174
-Speech: 0.01982543244957924
-Vehicle: 0.014558425173163414
-
-19200
-Cat: 0.8243843913078308
-Animal: 0.6799540519714355
-Meow: 0.6794822812080383
-Domestic animals, pets: 0.6637188792228699
-Caterwaul: 0.09927166253328323
-Inside, small room: 0.0378643162548542
-Music: 0.02170632779598236
-Purr: 0.02035444974899292
-Speech: 0.02006830833852291
-Vehicle: 0.01234798226505518
-
-28800
-Cat: 0.8329735398292542
-Animal: 0.6937487125396729
-Meow: 0.6766577959060669
-Domestic animals, pets: 0.6669812798500061
-Caterwaul: 0.08647485077381134
-Inside, small room: 0.03593464195728302
-Music: 0.022975120693445206
-Speech: 0.01964726485311985
-Purr: 0.017558127641677856
-Vehicle: 0.010926523245871067
-
-38400
-Cat: 0.8097503781318665
-Animal: 0.6702587604522705
-Meow: 0.6487116813659668
-Domestic animals, pets: 0.6369225382804871
-Caterwaul: 0.07185821980237961
-Inside, small room: 0.039198972284793854
-Music: 0.02381189912557602
-Speech: 0.018534155562520027
-Purr: 0.0178740955889225
-Outside, urban or manmade: 0.011107126250863075
+0.0
+Cat: 0.9144676923751831
+Animal: 0.8855036497116089
+Domestic animals, pets: 0.804577112197876
+Meow: 0.7422927021980286
+Music: 0.19959309697151184
+Inside, small room: 0.12550437450408936
+Caterwaul: 0.021584441885352135
+Purr: 0.020247288048267365
+Speech: 0.018197158351540565
+Vehicle: 0.007446660194545984
+
+0.059197544398158296
+Cat: 0.9250872135162354
+Animal: 0.8957151174545288
+Domestic animals, pets: 0.8228275775909424
+Meow: 0.7650775909423828
+Music: 0.20210561156272888
+Inside, small room: 0.12290887534618378
+Caterwaul: 0.029371455311775208
+Purr: 0.018731823191046715
+Speech: 0.017130598425865173
+Vehicle: 0.007748497650027275
+
+0.11839508879631659
+Cat: 0.9336574673652649
+Animal: 0.9111202359199524
+Domestic animals, pets: 0.8349071145057678
+Meow: 0.7761964797973633
+Music: 0.20467285811901093
+Inside, small room: 0.10709915310144424
+Caterwaul: 0.05370649695396423
+Purr: 0.018830426037311554
+Speech: 0.017361722886562347
+Vehicle: 0.006929398979991674
 ...
 ...
 ```
+
+The following [Demo](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.mp4) shows an example of rendering the tagging labels onto a video, with multi-label predictions produced for the audio in real time.
+
+![](https://bj.bcebos.com/paddleaudio/media/audio_tagging_demo.gif)
diff --git a/PaddleAudio/examples/audio_tagging/audio_tag.py b/PaddleAudio/examples/audio_tagging/audio_tag.py
index 43764dbe..f1545399 100644
--- a/PaddleAudio/examples/audio_tagging/audio_tag.py
+++ b/PaddleAudio/examples/audio_tagging/audio_tag.py
@@ -17,67 +17,49 @@ import ast
 import os
 from typing import List
 
-import librosa
 import numpy as np
 import paddle
-
+from paddleaudio.backends import load as load_audio
 from paddleaudio.features import mel_spect
-from paddleaudio.models import CNN14
-from paddleaudio.utils.log import logger
+from paddleaudio.models.panns import cnn14
+from paddleaudio.utils import logger
 
+# yapf: disable
 parser = argparse.ArgumentParser(__doc__)
-# features
-parser.add_argument("--sr", type=int, default=32000, help="Sample rate of inference audio.")
-parser.add_argument('--window_size', type=int, default=1024)
-parser.add_argument('--hop_size', type=int, default=320)
-parser.add_argument('--mel_bins', type=int, default=64)
-parser.add_argument('--fmin', type=int, default=50)
-parser.add_argument('--fmax', type=int, default=14000)
-# waveform
-parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
-parser.add_argument('--sample_duration', type=float, default=1.0)  # 1s
-parser.add_argument('--hop_duration', type=float, default=0.3)  # 0.3s
-
-parser.add_argument("--output_dir", type=str, default='./output_dir')
-parser.add_argument("--use_gpu",
-                    type=ast.literal_eval,
-                    default=True,
-                    help="Whether use GPU for fine-tuning, input should be True or False")
-parser.add_argument("--checkpoint", type=str, default='./assets/cnn14.pdparams', help="Checkpoint of model.")
+parser.add_argument('--device', choices=['cpu', 'gpu'], default='gpu', help='Select which device to predict, defaults to gpu.')
+parser.add_argument('--wav', type=str, required=True, help='Audio file to infer.')
+parser.add_argument('--sample_duration', type=float, default=2.0, help='Duration (in seconds) of tagging samples to predict.')
+parser.add_argument('--hop_duration', type=float, default=0.3, help='Duration (in seconds) between two samples.')
+parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging result.')
 args = parser.parse_args()
+# yapf: enable
 
 
 def split(waveform: np.ndarray, win_size: int, hop_size: int):
     """
-    Split into N audios.
+    Split the input waveform into N segments. N is decided by win_size and hop_size.
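+
+    Args:
+        waveform: 1-D array of audio samples to split.
+        win_size: length of each segment, in samples.
+        hop_size: hop between the starts of adjacent segments, in samples.
+
+    Returns:
+        time: start position of each segment as a fraction of the whole waveform (0~1).
+        data: list of segments, each zero-padded to win_size if necessary.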
""" assert isinstance(waveform, np.ndarray) - ret = [] + time = [] + data = [] for i in range(0, len(waveform), hop_size): segment = waveform[i:i + win_size] if len(segment) < win_size: segment = np.pad(segment, (0, win_size - len(segment))) - ret.append(segment) - return ret + data.append(segment) + time.append(i / len(waveform)) + return time, data -def batchify(data: List[List[float]], batch_size: int): +def batchify(data: List[List[float]], sample_rate: int, batch_size: int, **kwargs): """ Extract features from waveforms and create batches. """ examples = [] for waveform in data: - feat = mel_spect( - waveform, - sample_rate=args.sr, - window_size=args.window_size, - hop_length=args.hop_size, - mel_bins=args.mel_bins, - fmin=args.fmin, - fmax=args.fmax, - ) - examples.append(np.expand_dims(feat.transpose(), 0)) # (mel_bins, time) -> (1, time, mel_bins) + feats = mel_spect(waveform, sample_rate=sample_rate, **kwargs).transpose() + examples.append(feats) # Seperates data into some batches. one_batch = [] @@ -90,15 +72,17 @@ def batchify(data: List[List[float]], batch_size: int): yield one_batch -def predict(model, data: List[List[float]], batch_size: int = 1, use_gpu: bool = False): - - paddle.set_device('gpu') if use_gpu else paddle.set_device('cpu') - - batches = batchify(data, batch_size) +def predict(model, data: List[List[float]], sample_rate: int, batch_size: int = 1): + """ + Use pretrained model to make predictions. + """ + batches = batchify(data, sample_rate, batch_size) results = None model.eval() for batch in batches: - feats = paddle.to_tensor(batch) + feats = paddle.to_tensor(batch).unsqueeze(1) \ + # (batch_size, num_frames, num_melbins) -> (batch_size, 1, num_frames, num_melbins) + audioset_scores = model(feats) if results is None: results = audioset_scores.numpy() @@ -109,14 +93,15 @@ def predict(model, data: List[List[float]], batch_size: int = 1, use_gpu: bool = if __name__ == '__main__': - model = CNN14(extract_embedding=False, checkpoint=args.checkpoint) - waveform = librosa.load(args.wav, sr=args.sr)[0] - data = split(waveform, int(args.sample_duration * args.sr), int(args.hop_duration * args.sr)) - results = predict(model, data, batch_size=8, use_gpu=args.use_gpu) + paddle.set_device(args.device) + model = cnn14(pretrained=True, extract_embedding=False) + waveform, sr = load_audio(args.wav, sr=None) + time, data = split(waveform, int(args.sample_duration * sr), int(args.hop_duration * sr)) + results = predict(model, data, sr, batch_size=8) if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) - time = np.arange(0, 1, int(args.hop_duration * args.sr) / len(waveform)) - output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{args.sr}.npz') + time = np.arange(0, 1, int(args.hop_duration * sr) / len(waveform)) + output_file = os.path.join(args.output_dir, f'audioset_tagging_sr_{sr}.npz') np.savez(output_file, time=time, scores=results) logger.info(f'Saved tagging results to {output_file}') diff --git a/PaddleAudio/examples/audio_tagging/parse_result.py b/PaddleAudio/examples/audio_tagging/parse_result.py index f0198ea5..d7cba09c 100644 --- a/PaddleAudio/examples/audio_tagging/parse_result.py +++ b/PaddleAudio/examples/audio_tagging/parse_result.py @@ -18,16 +18,18 @@ import os from typing import Dict, List import numpy as np +from paddleaudio.utils import logger -from paddleaudio.utils.log import logger - +# yapf: disable parser = argparse.ArgumentParser(__doc__) -parser.add_argument("--input_file", type=str, required=True) 
-parser.add_argument("--topk", type=int, default=10, help="Show top k results of audioset labels.") -parser.add_argument("--smooth", type=ast.literal_eval, default=True, help="Posterior smoothing.") -parser.add_argument("--smooth_size", type=int, default=5, help="Window size of smoothing.") -parser.add_argument("--output_dir", type=str, default='./output_dir') +parser.add_argument('--tagging_file', type=str, required=True, help='') +parser.add_argument('--top_k', type=int, default=10, help='Get top k predicted results of audioset labels.') +parser.add_argument('--smooth', type=ast.literal_eval, default=True, help='Set "True" to apply posterior smoothing.') +parser.add_argument('--smooth_size', type=int, default=5, help='Window size of posterior smoothing.') +parser.add_argument('--label_file', type=str, default='./assets/audioset_labels.txt', help='File of audioset labels.') +parser.add_argument('--output_dir', type=str, default='./output_dir', help='Directory to save tagging labels.') args = parser.parse_args() +# yapf: enable def smooth(results: np.ndarray, win_size: int): @@ -57,13 +59,12 @@ def generate_topk_label(k: int, label_map: Dict, result: np.ndarray): if __name__ == "__main__": - label_file = './assets/audioset_labels.txt' label_map = {} - with open(label_file, 'r') as f: + with open(args.label_file, 'r') as f: for i, l in enumerate(f.readlines()): label_map[i] = l.strip() - results = np.load(args.input_file, allow_pickle=True) + results = np.load(args.tagging_file, allow_pickle=True) times, scores = results['time'], results['scores'] if args.smooth: @@ -72,10 +73,10 @@ if __name__ == "__main__": if not os.path.exists(args.output_dir): os.makedirs(args.output_dir) - output_file = os.path.join(args.output_dir, os.path.basename(args.input_file).split('.')[0] + '.txt') + output_file = os.path.join(args.output_dir, os.path.basename(args.tagging_file).split('.')[0] + '.txt') with open(output_file, 'w') as f: for time, score in zip(times, scores): f.write(f'{time}\n') - f.write(generate_topk_label(args.topk, label_map, score) + '\n') + f.write(generate_topk_label(args.top_k, label_map, score) + '\n') logger.info(f'Saved tagging labels to {output_file}') diff --git a/PaddleAudio/examples/sound_classification/README.md b/PaddleAudio/examples/sound_classification/README.md index e5c393c4..62966efb 100644 --- a/PaddleAudio/examples/sound_classification/README.md +++ b/PaddleAudio/examples/sound_classification/README.md @@ -1,6 +1,7 @@ # 声音分类 声音分类和检测是声音算法的一个热门研究方向。 + 对于声音分类任务,传统机器学习的一个常用做法是首先人工提取音频的时域和频域的多种特征并做特征选择、组合、变换等,然后基于SVM或决策树进行分类。而端到端的深度学习则通常利用深度网络如RNN,CNN等直接对声间波形(waveform)或时频特征(time-frequency)进行特征学习(representation learning)和分类预测。 在IEEE ICASSP 2017 大会上,谷歌开放了一个大规模的音频数据集[Audioset](https://research.google.com/audioset/)。该数据集包含了 632 类的音频类别以及 2,084,320 条人工标记的每段 10 秒长度的声音剪辑片段(来源于YouTube视频)。目前该数据集已经有210万个已标注的视频数据,5800小时的音频数据,经过标记的声音样本的标签类别为527。 diff --git a/PaddleAudio/paddleaudio/datasets/__init__.py b/PaddleAudio/paddleaudio/datasets/__init__.py index 07cf2de3..6e3aabbb 100644 --- a/PaddleAudio/paddleaudio/datasets/__init__.py +++ b/PaddleAudio/paddleaudio/datasets/__init__.py @@ -16,6 +16,7 @@ from .dcase import UrbanAcousticScenes from .esc50 import ESC50 from .gtzan import GTZAN from .ravdess import RAVDESS +from .tess import TESS from .urban_sound import UrbanSound8K __all__ = [ @@ -24,4 +25,5 @@ __all__ = [ 'GTZAN', 'UrbanAcousticScenes', 'RAVDESS', + 'TESS', ] diff --git a/PaddleAudio/paddleaudio/datasets/tess.py b/PaddleAudio/paddleaudio/datasets/tess.py index cc66967c..3b9e6723 
--- a/PaddleAudio/paddleaudio/datasets/tess.py
+++ b/PaddleAudio/paddleaudio/datasets/tess.py
@@ -27,10 +27,23 @@ __all__ = ['TESS']
 
 class TESS(AudioClassificationDataset):
     """
-    TESS Dataset
+    TESS: a set of 200 target words was spoken in the carrier phrase
+    "Say the word _____" by two actresses (aged 26 and 64 years), and
+    recordings were made of the set portraying each of seven emotions (anger,
+    disgust, fear, happiness, pleasant surprise, sadness, and neutral).
+    There are 2800 stimuli in total.
+
+    Reference:
+        Toronto emotional speech set (TESS)
+        https://doi.org/10.5683/SP2/E8H2MF
     """
-    archieves = []
+    archieves = [
+        {
+            'url': 'https://bj.bcebos.com/paddleaudio/datasets/TESS_Toronto_emotional_speech_set.zip',
+            'md5': '1465311b24d1de704c4c63e4ccc470c7',
+        },
+    ]
     label_list = [
         'angry',
         'disgust',
@@ -41,7 +54,7 @@ class TESS(AudioClassificationDataset):
         'sad',
     ]
     meta_info = collections.namedtuple('META_INFO', ('speaker', 'word', 'emotion'))
-    audio_path = os.path.join(DATA_HOME, 'TESS Toronto emotional speech set data')
+    audio_path = 'TESS_Toronto_emotional_speech_set'
     sample_rate = 24414
     duration = 2
 
@@ -76,11 +89,11 @@ class TESS(AudioClassificationDataset):
         return ret
 
     def _get_data(self, mode, seed, n_folds, split) -> Tuple[List[str], List[int]]:
-        if not os.path.isdir(self.audio_path):
+        if not os.path.isdir(os.path.join(DATA_HOME, self.audio_path)):
            download_and_decompress(self.archieves, DATA_HOME)
 
         wav_files = []
-        for root, _, files in os.walk(self.audio_path):
+        for root, _, files in os.walk(os.path.join(DATA_HOME, self.audio_path)):
             for file in files:
                 if file.endswith('.wav'):
                     wav_files.append(os.path.join(root, file))
--
GitLab
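
The two scripts above cover the command-line workflow. As a quick reference, the sketch below strings together the same APIs that appear in this patch (`load_audio`, `mel_spect`, `cnn14`) to tag a single window of audio from Python. It is a minimal illustration, assuming those functions behave as they are used in `audio_tag.py`; it is not an additional entry point added by the patch.

```python
# Minimal sketch based on the calls used in audio_tag.py above; assumes paddleaudio
# is installed and the pretrained CNN14 weights can be downloaded.
import numpy as np
import paddle

from paddleaudio.backends import load as load_audio
from paddleaudio.features import mel_spect
from paddleaudio.models.panns import cnn14

paddle.set_device('cpu')  # or 'gpu'

model = cnn14(pretrained=True, extract_embedding=False)
model.eval()

waveform, sr = load_audio('./cat.wav', sr=None)      # keep the file's native sample rate
feat = mel_spect(waveform[:2 * sr], sample_rate=sr)  # mel spectrogram of the first 2 seconds
feat = paddle.to_tensor(feat.transpose()).unsqueeze(0).unsqueeze(1)  # (1, 1, num_frames, num_melbins)

with paddle.no_grad():
    scores = model(feat).numpy()[0]  # probabilities for the 527 Audioset classes

top = int(np.argmax(scores))
print(f'top-1 Audioset class index: {top}, score: {scores[top]:.4f}')
```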
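
The patch also registers the new `TESS` dataset in `paddleaudio.datasets`. A hypothetical usage sketch follows; the constructor keywords (`mode`, `n_folds`, `split`) are assumptions based on the other fold-based datasets in PaddleAudio (the diff only shows the `_get_data(self, mode, seed, n_folds, split)` signature), so check the class before relying on them.

```python
# Hypothetical usage of the TESS dataset added by this patch. The keyword
# arguments are assumed to mirror the other fold-based paddleaudio datasets;
# the first instantiation downloads and unpacks the archive listed in
# `archieves` into DATA_HOME.
from paddleaudio.datasets import TESS

train_ds = TESS(mode='train', n_folds=5, split=1)  # assumed: the other folds are used for training
dev_ds = TESS(mode='dev', n_folds=5, split=1)      # assumed: fold 1 held out for evaluation

print(len(train_ds), 'training clips;', len(TESS.label_list), 'emotion classes')
```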