# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
from typing import List
from typing import Union

import numpy as np
import paddle
import paddleaudio

from .ecapa_tdnn import ECAPA_TDNN
from .feature import compute_log_fbank
from .feature import normalize
from paddlehub.module.module import moduleinfo
from paddlehub.utils.log import logger


@moduleinfo(
    name="ecapa_tdnn_voxceleb",
    version="1.0.0",
    summary="",
    author="paddlepaddle",
    author_email="",
    type="audio/speaker_recognition")
class SpeakerRecognition(paddle.nn.Layer):
    def __init__(self, threshold=0.25):
        super(SpeakerRecognition, self).__init__()
        global_stats_path = os.path.join(self.directory, 'assets', 'global_embedding_stats.npy')
        ckpt_path = os.path.join(self.directory, 'assets', 'model.pdparams')

        self.sr = 16000
        self.threshold = threshold
        model_conf = {
            'input_size': 80,
            'channels': [1024, 1024, 1024, 1024, 3072],
            'kernel_sizes': [5, 3, 3, 3, 1],
            'dilations': [1, 2, 3, 4, 1],
            'attention_channels': 128,
            'lin_neurons': 192
        }
        self.model = ECAPA_TDNN(**model_conf)
        self.model.set_state_dict(paddle.load(ckpt_path))
        self.model.eval()

        # Global embedding statistics used to normalize the output embeddings.
        global_embedding_stats = np.load(global_stats_path, allow_pickle=True)
        self.global_emb_mean = paddle.to_tensor(global_embedding_stats.item().get('global_emb_mean'))
        self.global_emb_std = paddle.to_tensor(global_embedding_stats.item().get('global_emb_std'))

        self.similarity = paddle.nn.CosineSimilarity(axis=-1, eps=1e-6)

    def load_audio(self, wav):
        wav = os.path.abspath(os.path.expanduser(wav))
        assert os.path.isfile(wav), 'Please check wav file: {}'.format(wav)
        waveform, _ = paddleaudio.load(wav, sr=self.sr, mono=True, normal=False)
        return waveform

    def speaker_embedding(self, wav):
        waveform = self.load_audio(wav)
        embedding = self(paddle.to_tensor(waveform)).reshape([-1])
        return embedding.numpy()

    def speaker_verify(self, wav1, wav2):
        waveform1 = self.load_audio(wav1)
        embedding1 = self(paddle.to_tensor(waveform1)).reshape([-1])

        waveform2 = self.load_audio(wav2)
        embedding2 = self(paddle.to_tensor(waveform2)).reshape([-1])

        # Cosine similarity between the two speaker embeddings, compared
        # against the decision threshold.
        score = self.similarity(embedding1, embedding2).numpy()
        return score, score > self.threshold

    def forward(self, x):
        # x: waveform tensors with (B, T) shape; a single (T,) waveform is
        # promoted to a batch of one.
        if len(x.shape) == 1:
            x = x.unsqueeze(0)

        # Log-fbank features with mean normalization.
        fbank = compute_log_fbank(x)
        norm_fbank = normalize(fbank)

        # ECAPA-TDNN expects channel-first input; transpose back afterwards.
        embedding = self.model(norm_fbank.transpose([0, 2, 1])).transpose([0, 2, 1])

        # Normalize the embedding with the precomputed global statistics.
        norm_embedding = normalize(x=embedding, global_mean=self.global_emb_mean, global_std=self.global_emb_std)

        return norm_embedding
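
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the module). The wav paths are
# hypothetical placeholders, and loading through PaddleHub is assumed to follow
# the standard `hub.Module(name=...)` pattern, which forwards keyword arguments
# to this class's constructor.
#
#     import paddlehub as hub
#
#     model = hub.Module(name='ecapa_tdnn_voxceleb', threshold=0.25)
#
#     # 192-dimensional speaker embedding for a single 16 kHz mono utterance.
#     embedding = model.speaker_embedding(wav='path/to/speaker_a.wav')
#
#     # Cosine-similarity verification between two utterances; returns the
#     # similarity score and whether it exceeds the decision threshold.
#     score, same_speaker = model.speaker_verify(wav1='path/to/speaker_a.wav',
#                                                wav2='path/to/speaker_b.wav')
# ---------------------------------------------------------------------------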