predict.py 2.5 KB
Newer Older
K
KP 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
K
KP 已提交
15
import os
K
KP 已提交
16 17 18

import paddle
import paddle.nn.functional as F
K
KP 已提交
19
import yaml
K
KP 已提交
20 21
from paddleaudio.backends import load as load_audio
from paddleaudio.features import LogMelSpectrogram
K
KP 已提交
22
from paddleaudio.utils import logger
小湉湉's avatar
小湉湉 已提交
23

K
KP 已提交
24
from paddlespeech.cls.models import SoundClassifier
小湉湉's avatar
小湉湉 已提交
25
from paddlespeech.utils.dynamic_import import dynamic_import
K
KP 已提交
26 27 28

# yapf: disable
# Command-line interface: the only input is the path to a YAML config file.
# The module docstring (if any) is passed as `description`; the first
# positional parameter of ArgumentParser is `prog`, so passing it
# positionally would replace the program name in usage output.
parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument("--cfg_path", type=str, required=True, help="Path to the YAML config file with 'model', 'data', 'feature' and 'predicting' sections.")
args = parser.parse_args()
# yapf: enable


K
KP 已提交
34 35 36 37 38 39
def extract_features(file: str, **feat_conf) -> paddle.Tensor:
    """Load an audio file and compute its log-mel spectrogram features.

    Args:
        file: Path to the audio file; ``~`` is expanded and the path is made
            absolute before loading.
        **feat_conf: Keyword arguments forwarded unchanged to
            ``LogMelSpectrogram``. Must contain ``'sr'``, which is also passed
            to the audio loader.

    Returns:
        The extractor output for a single-item batch, with its last two axes
        swapped (presumably giving a (batch, time, mel) layout; confirm
        against ``LogMelSpectrogram``).

    Raises:
        KeyError: If ``feat_conf`` has no ``'sr'`` entry.
    """
    audio_path = os.path.abspath(os.path.expanduser(file))
    # The second value returned by the loader is not needed here.
    samples, _ = load_audio(audio_path, sr=feat_conf['sr'])
    to_log_mel = LogMelSpectrogram(**feat_conf)
    # Add a leading axis so the extractor receives a batch of one.
    batch = paddle.to_tensor(samples).unsqueeze(0)
    return paddle.transpose(to_log_mel(batch), [0, 2, 1])


if __name__ == '__main__':
    # Script entry point: read the YAML config, restore the classifier from a
    # checkpoint, run it on one audio file and log the top-k predictions.

    args.cfg_path = os.path.abspath(os.path.expanduser(args.cfg_path))
    # Read the config as UTF-8 explicitly so parsing does not depend on the
    # platform's locale encoding.
    with open(args.cfg_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)

    model_conf = config['model']
    data_conf = config['data']
    feat_conf = config['feature']
    predicting_conf = config['predicting']

    # The dataset and backbone classes are named in the config and resolved
    # at runtime.
    ds_class = dynamic_import(data_conf['dataset'])
    backbone_class = dynamic_import(model_conf['backbone'])

    model = SoundClassifier(
        backbone=backbone_class(pretrained=False, extract_embedding=True),
        num_class=len(ds_class.label_list))
    # Expand '~' in the checkpoint path, as is already done for the config
    # path and the audio file.
    checkpoint_path = os.path.abspath(
        os.path.expanduser(predicting_conf['checkpoint']))
    model.set_state_dict(paddle.load(checkpoint_path))
    model.eval()

    # Pure inference: turn off gradient tracking for the forward pass.
    with paddle.no_grad():
        feat = extract_features(predicting_conf['audio_file'], **feat_conf)
        logits = model(feat)
        probs = F.softmax(logits, axis=1).numpy()

    # Only one file is processed, so only the first row of `probs` is used.
    scores = probs[0]
    # Negate so that argsort's ascending order yields highest score first.
    sorted_indices = (-scores).argsort()

    header = f"[{predicting_conf['audio_file']}]\n"
    body = ''.join(
        f'{ds_class.label_list[idx]}: {scores[idx]}\n'
        for idx in sorted_indices[:predicting_conf['top_k']])
    logger.info(header + body)