#coding=utf-8
import numpy as np

__all__ = ["BeamDecoding"]


class BeamDecoding(object):
    """
    Decode outputs of the PaddlePaddle layers into readable answers.

    The GNR model performs a three-step beam search:
      1. beam1 selects the answer sentence inside each document,
      2. beam2 selects the answer's start position inside a selected
         sentence,
      3. beam3 selects the answer's end position in the suffix that begins
         at a selected start.
    """

    def __init__(self, documents, sentence_scores, selected_sentences,
                 start_scores, selected_starts, end_scores, selected_ends):
        """ The constructor.

        Arguments:
            - documents:          The one-hot input of the document words.
            - sentence_scores:    The score for each sentence in a document.
            - selected_sentences: The top k selected sentences. This is the
                                  output of the paddle.layer.kmax_seq_score
                                  layer in the model.
            - start_scores:       The score for each word in the selected
                                  sentence indicating whether it is the start
                                  of the answer.
            - selected_starts:    The top k selected start spans. This is the
                                  output of the paddle.layer.kmax_seq_score
                                  layer in the model.
            - end_scores:         The score for each word in the sub-sequence
                                  which runs from a selected start till the
                                  end of the selected sentence.
            - selected_ends:      The top k selected end spans. This is the
                                  output of the paddle.layer.kmax_seq_score
                                  layer in the model.
        """
        self.documents = documents

        self.sentence_scores = sentence_scores
        self.selected_sentences = selected_sentences

        self.start_scores = start_scores
        self.selected_starts = selected_starts

        self.end_scores = end_scores
        self.selected_ends = selected_ends

        # Sequence start-position information for the three search steps.
        # beam1 searches the sentence index.
        self.beam1_seq_start_positions = []
        # beam2 searches the start span of the answer.
        self.beam2_seq_start_positions = []
        # beam3 searches the end span of the answer.
        self.beam3_seq_start_positions = []

        # Prefix sums of candidate-answer counts, one entry per sample.
        self.ans_per_sample_in_a_batch = [0]
        # Every candidate answer found over the whole batch.
        self.all_searched_ans = []

        # The decoded answers for each sample, sorted by descending score.
        self.final_ans = [[] for i in range(len(documents))]

    def _build_beam1_seq_info(self):
        """
        The internal function to calculate the offset of each test sequence
        in a batch for the first beam in searching the answer sentence.
        """

        self.beam1_seq_start_positions.append([0])
        for one_doc in self.documents:
            for sentence in one_doc:
                self.beam1_seq_start_positions[-1].append(
                    self.beam1_seq_start_positions[-1][-1] + len(sentence))

            if len(self.beam1_seq_start_positions) != len(self.documents):
                self.beam1_seq_start_positions.append(
                    [self.beam1_seq_start_positions[-1][-1]])

    def _build_beam2_seq_info(self):
        """
        The internal function to calculate the offset of each test sequence
        in a batch for the second beam in searching the start spans.
        """

        seq_num, beam_size = self.selected_sentences.shape
        self.beam2_seq_start_positions.append([0])
        for i in range(seq_num):
            for j in range(beam_size):
                selected_id = int(self.selected_sentences[i][j])
                # -1 is the padding marker for unused beam slots.
                if selected_id == -1:
                    break
                seq_len = self.beam1_seq_start_positions[i][
                    selected_id + 1] - self.beam1_seq_start_positions[i][
                        selected_id]
                self.beam2_seq_start_positions[-1].append(
                    self.beam2_seq_start_positions[-1][-1] + seq_len)

            if len(self.beam2_seq_start_positions) != seq_num:
                self.beam2_seq_start_positions.append(
                    [self.beam2_seq_start_positions[-1][-1]])

    def _build_beam3_seq_info(self):
        """
        The internal function to calculate the offset of each test sequence
        in a batch for the third beam in searching the end spans.
        """

        seq_num_in_a_batch = len(self.documents)

        seq_id = 0
        sub_seq_id = 0
        sub_seq_count = len(self.beam2_seq_start_positions[seq_id]) - 1

        self.beam3_seq_start_positions.append([0])
        sub_seq_num, beam_size = self.selected_starts.shape
        for i in range(sub_seq_num):
            seq_len = self.beam2_seq_start_positions[seq_id][
                sub_seq_id + 1] - self.beam2_seq_start_positions[seq_id][
                    sub_seq_id]
            for j in range(beam_size):
                start_id = int(self.selected_starts[i][j])
                if start_id == -1:
                    break

                # The end search runs over the suffix that begins at the
                # selected start, hence "- start_id".
                self.beam3_seq_start_positions[-1].append(
                    self.beam3_seq_start_positions[-1][-1] + seq_len - start_id)

            sub_seq_id += 1
            if sub_seq_id == sub_seq_count:
                if len(self.beam3_seq_start_positions) != seq_num_in_a_batch:
                    self.beam3_seq_start_positions.append(
                        [self.beam3_seq_start_positions[-1][-1]])
                    sub_seq_id = 0
                    seq_id += 1
                    sub_seq_count = len(self.beam2_seq_start_positions[
                        seq_id]) - 1

        assert (
            self.beam3_seq_start_positions[-1][-1] == self.end_scores.shape[0])

    def _build_seq_info_for_each_beam(self):
        """
        The internal function to calculate the offset of each test sequence
        in a batch for beams expanded at all the three search steps.
        """

        self._build_beam1_seq_info()
        self._build_beam2_seq_info()
        self._build_beam3_seq_info()

    def _cal_ans_per_sample_in_a_batch(self):
        """
        The internal function to calculate how many candidate answers there
        are for each of the test sequences in a batch.
        """

        start_row = 0
        for seq in self.beam3_seq_start_positions:
            end_row = start_row + len(seq) - 1
            # Cast to a plain int: np.sum returns a numpy scalar, and these
            # values are later used as list-slice bounds.
            ans_count = int(
                np.sum(self.selected_ends[start_row:end_row, :] != -1.))

            self.ans_per_sample_in_a_batch.append(
                self.ans_per_sample_in_a_batch[-1] + ans_count)
            start_row = end_row

    def _get_valid_selected_ids(self, mat):
        """
        The internal function to post-process the output matrix of the
        paddle.layer.kmax_seq_score layer. This function strips the special
        delimiter -1 away and flattens the original two-dimensional output
        matrix into a python list of [selected_id, [row, column]] pairs.
        """

        flattened = []
        height, width = mat.shape
        for i in range(height):
            for j in range(width):
                if mat[i][j] == -1.:
                    break
                flattened.append([int(mat[i][j]), [i, j]])
        return flattened

    def decoding(self):
        """
        Decode forward results of the GNR network into readable answers.

        Returns:
            A list with one entry per sample in the batch; each entry is a
            list of {"score": ..., "label": [sentence_pos, start, end]}
            dicts sorted by descending score.
        """

        self._build_seq_info_for_each_beam()
        self._cal_ans_per_sample_in_a_batch()

        # Step 1: collect the selected end spans together with their scores.
        seq_id = 0
        sub_seq_id = 0
        sub_seq_count = len(self.beam3_seq_start_positions[seq_id]) - 1

        sub_seq_num, beam_size = self.selected_ends.shape
        for i in range(sub_seq_num):
            seq_offset_in_batch = self.beam3_seq_start_positions[seq_id][
                sub_seq_id]
            for j in range(beam_size):
                end_pos = int(self.selected_ends[i][j])
                if end_pos == -1:
                    break

                self.all_searched_ans.append({
                    "score": self.end_scores[seq_offset_in_batch + end_pos],
                    "sentence_pos": -1,
                    "start_span_pos": -1,
                    "end_span_pos": end_pos,
                    "parent_ids_in_prev_beam": i
                })

            sub_seq_id += 1
            if sub_seq_id == sub_seq_count:
                seq_id += 1
                if seq_id == len(self.beam3_seq_start_positions):
                    break

                sub_seq_id = 0
                sub_seq_count = len(self.beam3_seq_start_positions[seq_id]) - 1

        assert len(self.all_searched_ans) == self.ans_per_sample_in_a_batch[-1]

        # Step 2: trace each answer back to its start span and accumulate
        # the start score.
        seq_id = 0
        sub_seq_id = 0
        sub_seq_count = len(self.beam2_seq_start_positions[seq_id]) - 1
        last_row_id = None

        starts = self._get_valid_selected_ids(self.selected_starts)
        for ans in self.all_searched_ans:
            ans["start_span_pos"] = starts[ans["parent_ids_in_prev_beam"]][0]

            seq_offset_in_batch = (
                self.beam2_seq_start_positions[seq_id][sub_seq_id])
            ans["score"] += self.start_scores[(
                seq_offset_in_batch + ans["start_span_pos"])]
            ans["parent_ids_in_prev_beam"] = starts[ans[
                "parent_ids_in_prev_beam"]][1][0]

            # Compare against None explicitly: the previous row id can
            # legitimately be 0, which is falsy.
            if (last_row_id is not None and
                    last_row_id != ans["parent_ids_in_prev_beam"]):
                sub_seq_id += 1

            if sub_seq_id == sub_seq_count:
                seq_id += 1
                if seq_id == len(self.beam2_seq_start_positions):
                    break
                sub_seq_count = len(self.beam2_seq_start_positions[seq_id]) - 1
                sub_seq_id = 0
            last_row_id = ans["parent_ids_in_prev_beam"]

        # Step 3: trace each answer back to its sentence and accumulate the
        # sentence score.
        offset_info = [0]
        for sen in self.beam1_seq_start_positions[:-1]:
            offset_info.append(offset_info[-1] + len(sen) - 1)
        sen_ids = self._get_valid_selected_ids(self.selected_sentences)
        for ans in self.all_searched_ans:
            ans["sentence_pos"] = sen_ids[ans["parent_ids_in_prev_beam"]][0]
            # Integer division is required: on Python 3 "/" yields a float,
            # which cannot be used as a list index below. This assumes every
            # document contributes beam_size rows of selected starts.
            row_id = ans["parent_ids_in_prev_beam"] // beam_size
            # NOTE(review): offset_info[row_id - 1] for row_id >= 1 looks
            # like it may be off by one (offset_info[row_id] would be the
            # cumulative sentence count of the preceding documents) — kept
            # as-is; verify against multi-document batches.
            offset = offset_info[row_id - 1] if row_id else 0
            ans["score"] += self.sentence_scores[offset + ans["sentence_pos"]]

        # Step 4: sort the candidate answers of every sample by score.
        for i in range(len(self.ans_per_sample_in_a_batch) - 1):
            start_pos = self.ans_per_sample_in_a_batch[i]
            end_pos = self.ans_per_sample_in_a_batch[i + 1]

            for ans in sorted(
                    self.all_searched_ans[start_pos:end_pos],
                    key=lambda x: x["score"],
                    reverse=True):
                self.final_ans[i].append({
                    "score": ans["score"],
                    "label": [
                        ans["sentence_pos"], ans["start_span_pos"],
                        ans["end_span_pos"]
                    ]
                })

        return self.final_ans