scorer_deprecated.py 2.7 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Y
Yibing Liu 已提交
14 15
"""External Scorer for Beam Search Decoder."""
import os
16

Y
Yibing Liu 已提交
17 18 19 20
import kenlm
import numpy as np


Y
Yibing Liu 已提交
21
class Scorer(object):
    """External scorer to evaluate a prefix or whole sentence in
    beam search decoding, combining the score from an n-gram language
    model with a word count term.

    :param alpha: Parameter associated with language model. Don't use
                  language model when alpha = 0.
    :type alpha: float
    :param beta: Parameter associated with word count. Don't use word
                 count when beta = 0.
    :type beta: float
    :param model_path: Path to load language model.
    :type model_path: str
    :raises IOError: If ``model_path`` does not point to an existing file.
    """

    def __init__(self, alpha, beta, model_path):
        self._alpha = alpha
        self._beta = beta
        # Check the path up front so a missing model file is reported with
        # the offending path in the message.
        if not os.path.isfile(model_path):
            raise IOError("Invalid language model path: %s" % model_path)
        self._language_model = kenlm.LanguageModel(model_path)

    def _language_model_score(self, sentence):
        """Return the language model probability of the last word.

        :param sentence: The sentence (or prefix) to score.
        :type sentence: str
        :return: ``10 ** s`` where ``s`` is the first field of the last
                 entry produced by ``full_scores`` (the log10 conditional
                 probability of the last word).
        :rtype: float
        """
        # eos=False: the sentence may be an unfinished prefix, so no
        # end-of-sentence token is scored.
        # NOTE(review): a sentence for which full_scores yields nothing
        # (presumably an empty or whitespace-only one) makes the [-1]
        # lookup raise IndexError -- confirm callers never pass one.
        per_word_scores = list(
            self._language_model.full_scores(sentence, eos=False))
        log_cond_prob = per_word_scores[-1][0]
        return np.power(10, log_cond_prob)

    def _word_count(self, sentence):
        """Return the number of whitespace-separated words in ``sentence``.

        :param sentence: The sentence to count words in.
        :type sentence: str
        :return: Number of words.
        :rtype: int
        """
        # split() without an argument collapses runs of whitespace, so
        # repeated spaces do not produce empty "words" in the count.
        return len(sentence.split())

    def reset_params(self, alpha, beta):
        """Replace the language model weight and the word count weight.

        :param alpha: New parameter associated with language model.
        :type alpha: float
        :param beta: New parameter associated with word count.
        :type beta: float
        """
        self._alpha = alpha
        self._beta = beta

    def __call__(self, sentence, log=False):
        """Evaluation function, gathering all the different scores
        and return the final one.

        :param sentence: The input sentence for evaluation.
        :type sentence: str
        :param log: Whether return the score in log representation.
        :type log: bool
        :return: Evaluation score, in the decimal or log.
        :rtype: float
        """
        lm = self._language_model_score(sentence)
        word_cnt = self._word_count(sentence)
        if not log:
            # Decimal form: lm^alpha * word_cnt^beta.
            score = np.power(lm, self._alpha) * np.power(word_cnt, self._beta)
        else:
            # Natural-log form of the same product.
            score = self._alpha * np.log(lm) + self._beta * np.log(word_cnt)
        return score