module.py

# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from typing import List
from typing import Union

import numpy as np
import paddle
import paddleaudio

from .ecapa_tdnn import Classifier
from .ecapa_tdnn import ECAPA_TDNN
from .feature import compute_log_fbank
from .feature import normalize
from paddlehub.module.module import moduleinfo
from paddlehub.utils.log import logger


@moduleinfo(
    name="ecapa_tdnn_common_language",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="audio/language_identification")
class LanguageIdentification(paddle.nn.Layer):
    """Language identification layer built on an ECAPA-TDNN classifier.

    On construction the layer reads the label list and the checkpoint from the
    ``assets`` sub-directory of ``self.directory`` and puts the classifier in
    evaluation mode.

    Attributes:
        label_list: One label per line of ``assets/label.txt`` (stripped).
        sr: Sample rate (16000) passed to ``paddleaudio.load``.
        model: ``Classifier`` wrapping an ``ECAPA_TDNN`` backbone, with the
            weights from ``assets/model.pdparams`` loaded.
    """

    def __init__(self):
        # NOTE(review): keep the explicit two-argument super(). The
        # @moduleinfo decorator may rebuild the class object, in which case
        # zero-argument super() would bind to the pre-decoration class and
        # fail -- confirm against paddlehub before "modernizing" this call.
        super(LanguageIdentification, self).__init__()

        # `self.directory` is not defined in this class; it is presumably
        # supplied by the PaddleHub module machinery -- verify if refactoring.
        ckpt_path = os.path.join(self.directory, 'assets', 'model.pdparams')
        label_path = os.path.join(self.directory, 'assets', 'label.txt')

        # Explicit encoding so label decoding does not depend on the
        # platform's default locale.
        with open(label_path, 'r', encoding='utf-8') as f:
            self.label_list = [line.strip() for line in f]

        self.sr = 16000
        model_conf = {
            'input_size': 80,
            'channels': [1024, 1024, 1024, 1024, 3072],
            'kernel_sizes': [5, 3, 3, 3, 1],
            'dilations': [1, 2, 3, 4, 1],
            'attention_channels': 128,
            'lin_neurons': 192
        }
        # num_class is fixed by the checkpoint being loaded below; it is
        # presumably expected to match the number of entries in label.txt.
        self.model = Classifier(
            backbone=ECAPA_TDNN(**model_conf),
            num_class=45,
        )
        self.model.set_state_dict(paddle.load(ckpt_path))
        self.model.eval()

    def load_audio(self, wav: str):
        """Load an audio file as a mono waveform at ``self.sr``.

        Args:
            wav: Path to the audio file; ``~`` is expanded and the path is
                made absolute before use.

        Returns:
            The waveform returned by ``paddleaudio.load`` (the sample rate it
            also returns is discarded).

        Raises:
            AssertionError: If ``wav`` does not point to an existing file.
        """
        wav = os.path.abspath(os.path.expanduser(wav))
        # Raised explicitly (instead of via an `assert` statement) so the
        # check still runs under `python -O`; the exception type and message
        # are kept for backward compatibility.
        if not os.path.isfile(wav):
            raise AssertionError('Please check wav file: {}'.format(wav))
        # NOTE(review): `normal=False` presumably disables amplitude
        # normalization in paddleaudio -- confirm against its documentation.
        waveform, _ = paddleaudio.load(wav, sr=self.sr, mono=True, normal=False)
        return waveform

    def language_identify(self, wav: str):
        """Predict the language of a single audio file.

        Args:
            wav: Path to the audio file.

        Returns:
            A ``(score, label)`` tuple: ``score`` is the largest value of the
            model output as a NumPy array, and ``label`` is the entry of
            ``self.label_list`` at the same index.

        Raises:
            AssertionError: If ``wav`` does not point to an existing file.
        """
        waveform = self.load_audio(wav)
        # Pure inference: skip autograd bookkeeping for the forward pass.
        with paddle.no_grad():
            logits = self(paddle.to_tensor(waveform)).reshape([-1])
        idx = paddle.argmax(logits)
        # Index the Python list with a plain int rather than relying on the
        # Tensor's implicit index conversion.
        return logits[idx].numpy(), self.label_list[int(idx)]

    def forward(self, x: paddle.Tensor) -> paddle.Tensor:
        """Compute classifier outputs for a batch of waveforms.

        Args:
            x: Waveform tensor of shape (B, T). A 1-D tensor is treated as a
                single waveform and gets a leading batch axis.

        Returns:
            The output of ``self.model`` on the normalized log-fbank features,
            with axis 1 squeezed.
        """
        if len(x.shape) == 1:
            x = x.unsqueeze(0)

        fbank = compute_log_fbank(x)  # x: waveform tensors with (B, T) shape
        norm_fbank = normalize(fbank)
        logits = self.model(norm_fbank).squeeze(1)

        return logits