# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
MPC Metrics
"""

import paddle.fluid.metrics
from paddle.fluid.metrics import MetricBase
import numpy as np
import scipy
# `import scipy` alone does not guarantee that the `stats` subpackage is
# loaded; import it explicitly so `scipy.stats.ks_2samp` always resolves.
import scipy.stats

__all__ = [
    'KSstatistic',
    'Auc',
]


def _is_numpy_(var):
    """Return True if `var` is a numpy array or a numpy scalar."""
    return isinstance(var, (np.ndarray, np.generic))


class KSstatistic(MetricBase):
    """
    The KSstatistic is for binary classification.
    Refer to https://en.wikipedia.org/wiki/Kolmogorov%E2%80%93Smirnov_test#Kolmogorov%E2%80%93Smirnov_statistic
    Please notice that the KS statistic is implemented with scipy.

    The `KSstatistic` metric keeps 2 local variables, `data1` and `data2`,
    which hold the class-1 predictions of the positive samples and of the
    negative samples respectively. They are accumulated across calls to
    `update` and are used to compute the KS statistic in `eval`.

    Args:
        name (str, optional): Metric name. For details, please refer to
            :ref:`api_guide_Name`. Default is None.

    Examples:
        .. code-block:: python

            import paddle_fl.mpc
            import numpy as np

            # suppose that batch_size is 128
            batch_num = 100
            batch_size = 128

            for batch_id in range(batch_num):

                class0_preds = np.random.random(size = (batch_size, 1))
                class1_preds = 1 - class0_preds

                preds = np.concatenate((class0_preds, class1_preds), axis=1)

                labels = np.random.randint(2, size = (batch_size, 1))

                # init the KSstatistic for each batch
                # to get global ks statistic, init ks before for-loop
                ks = paddle_fl.mpc.metrics.KSstatistic('ks')

                ks.update(preds = preds, labels = labels)

                # should be a score close to 0.1 as the preds are
                # randomly assigned
                print("ks statistic for iteration %d is %.2f" % (batch_id, ks.eval()))
    """

    def __init__(self, name=None):
        super(KSstatistic, self).__init__(name=name)
        # Class-1 scores of samples labelled positive / negative so far.
        self._data1 = []
        self._data2 = []

    def update(self, preds, labels):
        """
        Accumulate the predictions of one batch, split by label.

        The class-1 probability (`preds[:, 1]`) of every instance is appended
        to the positive-sample list when its label is truthy and to the
        negative-sample list otherwise.

        Args:
            preds (numpy.array): a numpy array in the shape of
                (batch_size, 2), preds[i][j] denotes the probability of
                classifying the instance i into the class j.
            labels (numpy.array): a numpy array in the shape of
                (batch_size, 1), labels[i] is either 0 or 1,
                representing the label of the instance i.

        Raises:
            ValueError: if `preds` or `labels` is not a numpy object.
        """
        if not _is_numpy_(labels):
            raise ValueError("The 'labels' must be a numpy ndarray.")
        if not _is_numpy_(preds):
            raise ValueError("The 'predictions' must be a numpy ndarray.")

        # One flag per instance; flattening accepts both (batch_size, 1)
        # and (batch_size,) label layouts.
        is_positive = labels.reshape(-1).astype(bool)
        class1_scores = preds[:, 1]

        self._data1.extend(class1_scores[is_positive].tolist())
        self._data2.extend(class1_scores[~is_positive].tolist())

    def eval(self):
        """
        Return the two-sample KS statistic of the accumulated predictions.

        The statistic is computed by `scipy.stats.ks_2samp` between the
        class-1 predictions of the positive samples and those of the
        negative samples collected by `update`.

        Return:
            float: the KS statistic.
        """
        return scipy.stats.ks_2samp(self._data1, self._data2).statistic


Auc = paddle.fluid.metrics.Auc