# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Classify a single audio file with a CNN14-backed SoundClassifier.

The script loads model weights from ``--checkpoint``, extracts mel-spectrogram
features from ``--wav`` and prints the ``--top_k`` most probable ESC50 labels
together with their softmax probabilities.
"""
import argparse
import ast

import numpy as np
import paddle
import paddle.nn.functional as F
from model import SoundClassifier

from paddlespeech.cls.backends import load as load_audio
from paddlespeech.cls.datasets import ESC50
from paddlespeech.cls.features import LogMelSpectrogram
from paddlespeech.cls.features import melspectrogram
from paddlespeech.cls.models.panns import cnn14

# yapf: disable
# NOTE: the first positional parameter of ArgumentParser is `prog`, so the
# module docstring has to be passed explicitly as `description`.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--device', choices=['cpu', 'gpu'], default="gpu", help="Select which device to predict, defaults to gpu.")
parser.add_argument("--wav", type=str, required=True, help="Audio file to infer.")
parser.add_argument("--gpu_feat", type=ast.literal_eval, default=False, help="Use gpu to extract feature.")
parser.add_argument("--top_k", type=int, default=1, help="Show top k predicted results")
parser.add_argument("--checkpoint", type=str, required=True, help="Checkpoint of model.")
args = parser.parse_args()
# yapf: enable


def extract_features(file: str, gpu_feat: bool=False,
                     **kwargs) -> paddle.Tensor:
    """Load an audio file and turn it into a batched feature tensor.

    Args:
        file: Path of the audio file, read with ``load_audio(file, sr=None)``.
        gpu_feat: When true, compute the features with the
            ``LogMelSpectrogram`` layer on paddle tensors; otherwise use the
            ``melspectrogram`` function on the loaded waveform.
        **kwargs: Extra keyword arguments forwarded to the selected feature
            extractor. The ``LogMelSpectrogram`` branch already passes
            ``sr`` and ``hop_length``, so those must not be repeated here.

    Returns:
        A ``paddle.Tensor`` with a leading batch axis of size 1 and the last
        two axes of the extractor output swapped (presumably
        ``(1, frames, n_mels)`` -- TODO confirm against the extractors).
    """
    waveform, sr = load_audio(file, sr=None)

    if gpu_feat:
        feature_extractor = LogMelSpectrogram(sr=sr, hop_length=320, **kwargs)
        # Add the batch axis before extraction, then swap the last two axes.
        feat = feature_extractor(paddle.to_tensor(waveform).unsqueeze(0))
        feat = paddle.transpose(feat, [0, 2, 1])
    else:
        # Swap the two axes first, then add the batch axis and convert.
        feat = melspectrogram(waveform, sr, **kwargs).transpose()
        feat = np.expand_dims(feat, 0)
        feat = paddle.to_tensor(feat)

    return feat


if __name__ == '__main__':
    # Reject non-positive values early: 0 would print no prediction at all
    # and a negative value would slice the ranking from the wrong end.
    if args.top_k < 1:
        parser.error("--top_k must be a positive integer.")

    paddle.set_device(args.device)

    model = SoundClassifier(
        backbone=cnn14(pretrained=False, extract_embedding=True),
        num_class=len(ESC50.label_list))
    model.set_state_dict(paddle.load(args.checkpoint))
    model.eval()

    feat = extract_features(args.wav, args.gpu_feat)

    # Inference only: skip autograd bookkeeping for the forward pass.
    with paddle.no_grad():
        logits = model(feat)
        probs = F.softmax(logits, axis=1).numpy()

    # Negating the probabilities makes argsort rank them in descending order.
    sorted_indices = (-probs[0]).argsort()

    lines = [f'[{args.wav}]\n']
    lines.extend(f'{ESC50.label_list[idx]}: {probs[0][idx]}\n'
                 for idx in sorted_indices[:args.top_k])
    print(''.join(lines))