# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" evaluation scripts for KBC and pathQuery tasks """

import json
import logging
import collections

import numpy as np

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s - %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO)
logger = logging.getLogger(__name__)


def kbc_batch_evaluation(eval_i, all_examples, batch_results, tt):
    """ compute raw and filtered entity ranks for a batch of KBC examples """
    r_hts_idx = collections.defaultdict(list)
    scores_head = collections.defaultdict(list)
    scores_tail = collections.defaultdict(list)
    batch_r_hts_cnt = 0
    b_size = len(batch_results)
    for j in range(b_size):
        result = batch_results[j]
        i = eval_i + j
        example = all_examples[i]
        assert len(example.token_ids) == 3, \
            "For kbc task each example consists of 3 tokens"
        h, r, t = example.token_ids

        _mask_type = example.mask_type
        if i % 2 == 0:
            r_hts_idx[r].append((h, t))
            batch_r_hts_cnt += 1
        if _mask_type == "MASK_HEAD":
            scores_head[(r, t)] = result
        elif _mask_type == "MASK_TAIL":
            scores_tail[(r, h)] = result
        else:
            raise ValueError("Unknown mask type in prediction example:%d" % i)

    rank = {}
    f_rank = {}
    for r, hts in r_hts_idx.items():
        r_rank = {'head': [], 'tail': []}
        r_f_rank = {'head': [], 'tail': []}
        for h, t in hts:
            # raw tail rank: position of the gold tail t in the sorted scores
            scores_t = scores_tail[(r, h)][:]
            sortidx_t = np.argsort(scores_t)[::-1]
            r_rank['tail'].append(np.where(sortidx_t == t)[0][0] + 1)

            # filtered tail rank: mask out other known true tails for (h, r)
            rm_idx = tt[r]['ts'][h]
            rm_idx = [i for i in rm_idx if i != t]
            for i in rm_idx:
                scores_t[i] = -np.inf
            sortidx_t = np.argsort(scores_t)[::-1]
            r_f_rank['tail'].append(np.where(sortidx_t == t)[0][0] + 1)

            # raw head rank
            scores_h = scores_head[(r, t)][:]
            sortidx_h = np.argsort(scores_h)[::-1]
            r_rank['head'].append(np.where(sortidx_h == h)[0][0] + 1)

            # filtered head rank: mask out other known true heads for (r, t)
            rm_idx = tt[r]['hs'][t]
            rm_idx = [i for i in rm_idx if i != h]
            for i in rm_idx:
                scores_h[i] = -np.inf
            sortidx_h = np.argsort(scores_h)[::-1]
            r_f_rank['head'].append(np.where(sortidx_h == h)[0][0] + 1)

        rank[r] = r_rank
        f_rank[r] = r_f_rank

    h_pos = [p for k in rank.keys() for p in rank[k]['head']]
    t_pos = [p for k in rank.keys() for p in rank[k]['tail']]
    f_h_pos = [p for k in f_rank.keys() for p in f_rank[k]['head']]
    f_t_pos = [p for k in f_rank.keys() for p in f_rank[k]['tail']]

    ranks = np.asarray(h_pos + t_pos)
    f_ranks = np.asarray(f_h_pos + f_t_pos)
    return ranks, f_ranks

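
# Illustrative note on raw vs. filtered ranks (the numbers are made up for
# this comment, not taken from any dataset): suppose the tail scores for a
# query (h, r, ?) over five candidate entities are [0.9, 0.8, 0.7, 0.6, 0.5]
# and the gold tail is entity 2, so its raw rank is 3. If entity 0 is another
# known true tail for (h, r) listed in tt[r]['ts'][h], its score is set to
# -inf before re-sorting, and the filtered rank of the gold tail becomes 2.
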

def pathquery_batch_evaluation(eval_i, all_examples, batch_results,
                               sen_negli_dict, trivial_sen_set):
    """ evaluate mean-quantile and rank metrics for a batch of pathquery examples """
    mqs = []
    ranks = []
    for j, result in enumerate(batch_results):
        i = eval_i + j
        example = all_examples[i]
        token_ids, mask_type = example
        assert mask_type in ["MASK_TAIL", "MASK_HEAD"], \
            "Unknown mask type in pathquery evaluation"
        label = token_ids[-1] if mask_type == "MASK_TAIL" else token_ids[0]

        sen = " ".join([str(x) for x in token_ids])
        if sen in trivial_sen_set:
            mq = rank = -1
        else:
            # candidate vocab set (negative samples + the gold entity)
            cand_set = sen_negli_dict[sen]
            assert label in set(
                cand_set), "predicted label must be in the candidate set"

            cand_idx = np.sort(np.array(cand_set))
            cand_ret = result[cand_idx]  # logits for the candidate words
            cand_ranks = np.argsort(cand_ret)[::-1]
            pred_y = cand_idx[cand_ranks]

            rank = (np.argwhere(pred_y == label).ravel().tolist())[0] + 1
            mq = (len(cand_set) - rank) / (len(cand_set) - 1.0)

        mqs.append(mq)
        ranks.append(rank)
    return mqs, ranks


def compute_kbc_metrics(rank_li, frank_li, output_evaluation_result_file):
    """ combine the kbc rank results from batches into the final metrics """
    rank_rets = np.array(rank_li).ravel()
    frank_rets = np.array(frank_li).ravel()

    # raw metrics
    mrr = np.mean(1.0 / rank_rets)
    hits1 = np.mean(rank_rets <= 1.0)
    hits3 = np.mean(rank_rets <= 3.0)
    hits10 = np.mean(rank_rets <= 10.0)

    # filtered metrics
    fmrr = np.mean(1.0 / frank_rets)
    fhits1 = np.mean(frank_rets <= 1.0)
    fhits3 = np.mean(frank_rets <= 3.0)
    fhits10 = np.mean(frank_rets <= 10.0)

    eval_result = {
        'mrr': mrr,
        'hits1': hits1,
        'hits3': hits3,
        'hits10': hits10,
        'fmrr': fmrr,
        'fhits1': fhits1,
        'fhits3': fhits3,
        'fhits10': fhits10
    }
    with open(output_evaluation_result_file, "w") as fw:
        fw.write(json.dumps(eval_result, indent=4) + "\n")
    return eval_result


def compute_pathquery_metrics(mq_li, rank_li, output_evaluation_result_file):
    """ combine the pathquery mean-quantile and rank results from batches into the final metrics """
    rank_rets = np.array(rank_li).ravel()
    _idx = np.where(rank_rets != -1)  # drop trivial queries (marked with -1)
    non_trivial_eval_rets = rank_rets[_idx]
    non_trivial_mq = np.array(mq_li).ravel()[_idx]
    non_trivial_cnt = non_trivial_eval_rets.size

    mq = np.mean(non_trivial_mq)
    mr = np.mean(non_trivial_eval_rets)
    mrr = np.mean(1.0 / non_trivial_eval_rets)
    fhits10 = np.mean(non_trivial_eval_rets <= 10.0)

    eval_result = {
        'fcnt': non_trivial_cnt,
        'mq': mq,
        'mr': mr,
        'fhits10': fhits10
    }
    with open(output_evaluation_result_file, "w") as fw:
        fw.write(json.dumps(eval_result, indent=4) + "\n")
    return eval_result
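

# Minimal usage sketch (illustrative only): the synthetic ranks, mean-quantile
# values and the /tmp output paths below are assumptions made for this example,
# not part of the original evaluation pipeline.
if __name__ == "__main__":
    demo_ranks = [np.array([1, 2, 15])]   # raw ranks collected from batches
    demo_franks = [np.array([1, 1, 9])]   # filtered ranks (never worse than raw)
    logger.info(compute_kbc_metrics(demo_ranks, demo_franks,
                                    "/tmp/kbc_eval_demo.json"))

    demo_mqs = [1.0, 0.5, -1]             # -1 marks a trivial query
    demo_pq_ranks = [1, 4, -1]
    logger.info(compute_pathquery_metrics(demo_mqs, demo_pq_ranks,
                                          "/tmp/pathquery_eval_demo.json"))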