From 16062255a149469e6f2e87b1cb837d14a52e08a0 Mon Sep 17 00:00:00 2001
From: liuluobin
Date: Thu, 3 Sep 2020 11:47:54 +0800
Subject: [PATCH] remove parameter 'iid' in GridSearchCV. Increase support for parallel training. Modify ut case. Add attack config verfication.

---
 .../diff_privacy/evaluation/_check_config.py  | 220 ++++++++++++++++++
 .../diff_privacy/evaluation/attacker.py       |  43 ++--
 .../evaluation/membership_inference.py        |  40 ++--
 tests/ut/python/diff_privacy/test_attacker.py |   8 +-
 .../diff_privacy/test_membership_inference.py |   9 +-
 5 files changed, 277 insertions(+), 43 deletions(-)
 create mode 100644 mindarmour/diff_privacy/evaluation/_check_config.py

diff --git a/mindarmour/diff_privacy/evaluation/_check_config.py b/mindarmour/diff_privacy/evaluation/_check_config.py
new file mode 100644
index 0000000..d6e0a2b
--- /dev/null
+++ b/mindarmour/diff_privacy/evaluation/_check_config.py
@@ -0,0 +1,220 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Verify attack config
+"""
+
+from mindarmour.utils.logger import LogUtil
+
+LOGGER = LogUtil.get_instance()
+
+TAG = "check_params"
+
+
+def _is_positive_int(item):
+    """
+    Verify that the value is a positive integer.
+    """
+    if not isinstance(item, int) or item <= 0:
+        return False
+    return True
+
+
+def _is_non_negative_int(item):
+    """
+    Verify that the value is a non-negative integer.
+    """
+    if not isinstance(item, int) or item < 0:
+        return False
+    return True
+
+
+def _is_positive_float(item):
+    """
+    Verify that value is a positive number.
+    """
+    if not isinstance(item, (int, float)) or item <= 0:
+        return False
+    return True
+
+
+def _is_non_negative_float(item):
+    """
+    Verify that value is a non-negative number.
+    """
+    if not isinstance(item, (int, float)) or item < 0:
+        return False
+    return True
+
+
+def _is_positive_int_tuple(item):
+    """
+    Verify that the input parameter is a positive integer tuple.
+    """
+    if not isinstance(item, tuple):
+        return False
+    for i in item:
+        if not _is_positive_int(i):
+            return False
+    return True
+
+
+def _is_dict(item):
+    """
+    Check whether the type is dict.
+    """
+    return isinstance(item, dict)
+
+
+VALID_PARAMS_DICT = {
+    "knn": {
+        "n_neighbors": [_is_positive_int],
+        "weights": [{"uniform", "distance"}],
+        "algorithm": [{"auto", "ball_tree", "kd_tree", "brute"}],
+        "leaf_size": [_is_positive_int],
+        "p": [_is_positive_int],
+        "metric": None,
+        "metric_params": None,
+    },
+    "lr": {
+        "penalty": [{"l1", "l2", "elasticnet", "none"}],
+        "dual": [{True, False}],
+        "tol": [_is_positive_float],
+        "C": [_is_positive_float],
+        "fit_intercept": [{True, False}],
+        "intercept_scaling": [_is_positive_float],
+        "class_weight": [{"balanced", None}, _is_dict],
+        "random_state": None,
+        "solver": [{"newton-cg", "lbfgs", "liblinear", "sag", "saga"}]
+    },
+    "mlp": {
+        "hidden_layer_sizes": [_is_positive_int_tuple],
+        "activation": [{"identity", "logistic", "tanh", "relu"}],
+        "solver": [{"lbfgs", "sgd", "adam"}],
+        "alpha": [_is_positive_float],
+        "batch_size": [{"auto"}, _is_positive_int],
+        "learning_rate": [{"constant", "invscaling", "adaptive"}],
+        "learning_rate_init": [_is_positive_float],
+        "power_t": [_is_positive_float],
+        "max_iter": [_is_positive_int],
+        "shuffle": [{True, False}],
+        "random_state": None,
+        "tol": [_is_positive_float],
+        "verbose": [{True, False}],
+        "warm_start": [{True, False}],
+        "momentum": [_is_positive_float],
+        "nesterovs_momentum": [{True, False}],
+        "early_stopping": [{True, False}],
+        "validation_fraction": [_is_positive_float],
+        "beta_1": [_is_positive_float],
+        "beta_2": [_is_positive_float],
+        "epsilon": [_is_positive_float],
+        "n_iter_no_change": [_is_positive_int],
+        "max_fun": [_is_positive_int]
+    },
+    "rf": {
+        "n_estimators": [_is_positive_int],
+        "criterion": [{"gini", "entropy"}],
+        "max_depth": [_is_positive_int],
+        "min_samples_split": [_is_positive_float],
+        "min_samples_leaf": [_is_positive_float],
+        "min_weight_fraction_leaf": [_is_non_negative_float],
+        "max_features": [{"auto", "sqrt", "log2", None}, _is_positive_float],
+        "max_leaf_nodes": [_is_positive_int, {None}],
+        "min_impurity_decrease": [_is_non_negative_float],
+        "min_impurity_split": [{None}, _is_positive_float],
+        "bootstrap": [{True, False}],
+        "oob_score": [{True, False}],
+        "n_jobs": [_is_positive_int, {None}],
+        "random_state": None,
+        "verbose": [_is_non_negative_int],
+        "warm_start": [{True, False}],
+        "class_weight": None,
+        "ccp_alpha": [_is_non_negative_float],
+        "max_samples": [_is_positive_float]
+    }
+}
+
+
+
+def _check_config(config_list, check_params):
+    """
+    Verify that config_list is valid.
+    Check_params is the valid value range of the parameter.
+    """
+    if not isinstance(config_list, (list, tuple)):
+        msg = "Type of parameter 'config_list' must be list, but got {}.".format(type(config_list))
+        LOGGER.error(TAG, msg)
+        raise TypeError(msg)
+
+    for config in config_list:
+        if not isinstance(config, dict):
+            msg = "Type of each config in config_list must be dict, but got {}.".format(type(config))
+            LOGGER.error(TAG, msg)
+            raise TypeError(msg)
+
+        if set(config.keys()) != {"params", "method"}:
+            msg = "Keys of each config in config_list must be {}," \
+                  "but got {}.".format({'method', 'params'}, set(config.keys()))
+            LOGGER.error(TAG, msg)
+            raise KeyError(msg)
+
+        method = str.lower(config["method"])
+        params = config["params"]
+
+        if method not in check_params.keys():
+            msg = "Method {} is not supported.".format(method)
+            LOGGER.error(TAG, msg)
+            raise ValueError(msg)
+
+        if not params.keys() <= check_params[method].keys():
+            msg = "Params in method {} is not accepted, the parameters " \
+                  "that can be set are {}.".format(method, set(check_params[method].keys()))
+            LOGGER.error(TAG, msg)
+            raise KeyError(msg)
+
+        for param_key in params.keys():
+            param_value = params[param_key]
+            candidate_values = check_params[method][param_key]
+
+            if not isinstance(param_value, list):
+                msg = "The parameter '{}' in method '{}' setting must within the range of " \
+                      "changeable parameters.".format(param_key, method)
+                LOGGER.error(TAG, msg)
+                raise ValueError(msg)
+
+            if candidate_values is None:
+                continue
+
+            for item_value in param_value:
+                flag = False
+                for candidate_value in candidate_values:
+                    if isinstance(candidate_value, set):
+                        flag = item_value in candidate_value
+                    else:
+                        flag = candidate_value(item_value)
+                    if flag:
+                        break
+
+                if not flag:
+                    msg = "Setting of parameter {} in method {} is invalid".format(param_key, method)
+                    LOGGER.error(TAG, msg)
+                    raise ValueError(msg)
+
+
+def check_config_params(config_list):
+    """
+    External interfaces to verify attack config.
+    """
+    _check_config(config_list, VALID_PARAMS_DICT)
diff --git a/mindarmour/diff_privacy/evaluation/attacker.py b/mindarmour/diff_privacy/evaluation/attacker.py
index 91f604e..3b337e0 100755
--- a/mindarmour/diff_privacy/evaluation/attacker.py
+++ b/mindarmour/diff_privacy/evaluation/attacker.py
@@ -27,7 +27,7 @@ LOGGER = LogUtil.get_instance()
 TAG = "Attacker"
 
 
-def _attack_knn(features, labels, param_grid):
+def _attack_knn(features, labels, param_grid, n_jobs):
     """
     Train and return a KNN model.
 
@@ -35,20 +35,21 @@ def _attack_knn(features, labels, param_grid):
         features (numpy.ndarray): Loss and logits characteristics of each sample.
         labels (numpy.ndarray): Labels of each sample whether belongs to training set.
         param_grid (dict): Setting of GridSearchCV.
+        n_jobs (int): Number of jobs run in parallel. -1 means using all processors,
+            otherwise the value of n_jobs must be a positive integer.
 
     Returns:
         sklearn.model_selection.GridSearchCV, trained model.
     """
     knn_model = KNeighborsClassifier()
     knn_model = GridSearchCV(
-        knn_model, param_grid=param_grid, cv=3, n_jobs=1, iid=False,
-        verbose=0,
+        knn_model, param_grid=param_grid, cv=3, n_jobs=n_jobs, verbose=0,
     )
     knn_model.fit(X=features, y=labels)
     return knn_model
 
 
-def _attack_lr(features, labels, param_grid):
+def _attack_lr(features, labels, param_grid, n_jobs):
     """
     Train and return a LR model.
 
@@ -56,20 +57,21 @@ def _attack_lr(features, labels, param_grid):
         features (numpy.ndarray): Loss and logits characteristics of each sample.
         labels (numpy.ndarray): Labels of each sample whether belongs to training set.
         param_grid (dict): Setting of GridSearchCV.
+        n_jobs (int): Number of jobs run in parallel. -1 means using all processors,
+            otherwise the value of n_jobs must be a positive integer.
 
     Returns:
         sklearn.model_selection.GridSearchCV, trained model.
     """
-    lr_model = LogisticRegression(C=1.0, penalty="l2", max_iter=1000)
+    lr_model = LogisticRegression(C=1.0, penalty="l2", max_iter=300)
     lr_model = GridSearchCV(
-        lr_model, param_grid=param_grid, cv=3, n_jobs=1, iid=False,
-        verbose=0,
+        lr_model, param_grid=param_grid, cv=3, n_jobs=n_jobs, verbose=0,
     )
     lr_model.fit(X=features, y=labels)
     return lr_model
 
 
-def _attack_mlpc(features, labels, param_grid):
+def _attack_mlpc(features, labels, param_grid, n_jobs):
     """
     Train and return a MLPC model.
 
@@ -77,20 +79,21 @@ def _attack_mlpc(features, labels, param_grid):
         features (numpy.ndarray): Loss and logits characteristics of each sample.
         labels (numpy.ndarray): Labels of each sample whether belongs to training set.
         param_grid (dict): Setting of GridSearchCV.
+        n_jobs (int): Number of jobs run in parallel. -1 means using all processors,
+            otherwise the value of n_jobs must be a positive integer.
 
     Returns:
         sklearn.model_selection.GridSearchCV, trained model.
     """
     mlpc_model = MLPClassifier(random_state=1, max_iter=300)
     mlpc_model = GridSearchCV(
-        mlpc_model, param_grid=param_grid, cv=3, n_jobs=1, iid=False,
-        verbose=0,
+        mlpc_model, param_grid=param_grid, cv=3, n_jobs=n_jobs, verbose=0,
     )
     mlpc_model.fit(features, labels)
     return mlpc_model
 
 
-def _attack_rf(features, labels, random_grid):
+def _attack_rf(features, labels, random_grid, n_jobs):
     """
     Train and return a RF model.
 
@@ -98,20 +101,22 @@ def _attack_rf(features, labels, random_grid):
         features (numpy.ndarray): Loss and logits characteristics of each sample.
         labels (numpy.ndarray): Labels of each sample whether belongs to training set.
         random_grid (dict): Setting of RandomizedSearchCV.
+        n_jobs (int): Number of jobs run in parallel. -1 means using all processors,
+            otherwise the value of n_jobs must be a positive integer.
 
     Returns:
         sklearn.model_selection.RandomizedSearchCV, trained model.
     """
     rf_model = RandomForestClassifier(max_depth=2, random_state=0)
     rf_model = RandomizedSearchCV(
-        rf_model, param_distributions=random_grid, n_iter=7, cv=3, n_jobs=1,
-        iid=False, verbose=0,
+        rf_model, param_distributions=random_grid, n_iter=7, cv=3, n_jobs=n_jobs,
+        verbose=0,
     )
     rf_model.fit(features, labels)
     return rf_model
 
 
-def get_attack_model(features, labels, config):
+def get_attack_model(features, labels, config, n_jobs=-1):
     """
     Get trained attack model specify by config.
 
@@ -123,6 +128,8 @@ def get_attack_model(features, labels, config):
             params of each method must within the range of changeable parameters.
             Tips of params implement can be found in
             "https://scikit-learn.org/0.16/modules/generated/sklearn.grid_search.GridSearchCV.html".
+        n_jobs (int): Number of jobs run in parallel. -1 means using all processors,
+            otherwise the value of n_jobs must be a positive integer.
 
     Returns:
         sklearn.BaseEstimator, trained model specify by config["method"].
@@ -136,13 +143,13 @@ def get_attack_model(features, labels, config):
     method = str.lower(config["method"])
 
     if method == "knn":
-        return _attack_knn(features, labels, config["params"])
+        return _attack_knn(features, labels, config["params"], n_jobs)
     if method == "lr":
-        return _attack_lr(features, labels, config["params"])
+        return _attack_lr(features, labels, config["params"], n_jobs)
     if method == "mlp":
-        return _attack_mlpc(features, labels, config["params"])
+        return _attack_mlpc(features, labels, config["params"], n_jobs)
     if method == "rf":
-        return _attack_rf(features, labels, config["params"])
+        return _attack_rf(features, labels, config["params"], n_jobs)
 
     msg = "Method {} is not supported.".format(config["method"])
     LOGGER.error(TAG, msg)
diff --git a/mindarmour/diff_privacy/evaluation/membership_inference.py b/mindarmour/diff_privacy/evaluation/membership_inference.py
index a91c5fb..5de6d5b 100755
--- a/mindarmour/diff_privacy/evaluation/membership_inference.py
+++ b/mindarmour/diff_privacy/evaluation/membership_inference.py
@@ -15,14 +15,16 @@
 Membership Inference
 """
 
+from multiprocessing import cpu_count
 import numpy as np
 
 import mindspore as ms
 from mindspore.train import Model
 from mindspore.dataset.engine import Dataset
 from mindspore import Tensor
-from mindarmour.diff_privacy.evaluation.attacker import get_attack_model
 from mindarmour.utils.logger import LogUtil
+from .attacker import get_attack_model
+from ._check_config import check_config_params
 
 LOGGER = LogUtil.get_instance()
 TAG = "MembershipInference"
@@ -101,13 +103,15 @@ class MembershipInference:
 
     Args:
         model (Model): Target model.
+        n_jobs (int): Number of jobs run in parallel. -1 means using all processors,
+            otherwise the value of n_jobs must be a positive integer.
 
     Examples:
         >>> train_1, train_2 are non-overlapping datasets from training dataset of target model.
         >>> test_1, test_2 are non-overlapping datasets from test dataset of target model.
         >>> We use train_1, test_1 to train attack model, and use train_2, test_2 to evaluate attack model.
         >>> model = Model(network=net, loss_fn=loss, optimizer=opt, metrics={'acc', 'loss'})
-        >>> inference_model = MembershipInference(model)
+        >>> inference_model = MembershipInference(model, n_jobs=-1)
         >>> config = [{"method": "KNN", "params": {"n_neighbors": [3, 5, 7]}}]
         >>> inference_model.train(train_1, test_1, config)
         >>> metrics = ["precision", "recall", "accuracy"]
@@ -115,15 +119,26 @@ class MembershipInference:
 
     Raises:
         TypeError: If type of model is not mindspore.train.Model.
+        TypeError: If type of n_jobs is not int.
+        ValueError: The value of n_jobs is neither -1 nor a positive integer.
     """
 
-    def __init__(self, model):
+    def __init__(self, model, n_jobs=-1):
         if not isinstance(model, Model):
             msg = "Type of parameter 'model' must be Model, but got {}.".format(type(model))
             LOGGER.error(TAG, msg)
             raise TypeError(msg)
+        if not isinstance(n_jobs, int):
+            msg = "Type of parameter 'n_jobs' must be int, but got {}".format(type(n_jobs))
+            LOGGER.error(TAG, msg)
+            raise TypeError(msg)
+        if not (n_jobs == -1 or n_jobs > 0):
+            msg = "Value of n_jobs must be either -1 or positive integer, but got {}.".format(n_jobs)
+            LOGGER.error(TAG, msg)
+            raise ValueError(msg)
 
         self.model = model
+        self.n_jobs = min(n_jobs, cpu_count())
         self.method_list = ["knn", "lr", "mlp", "rf"]
         self.attack_list = []
 
@@ -162,24 +177,13 @@ class MembershipInference:
             LOGGER.error(TAG, msg)
             raise TypeError(msg)
 
-        for config in attack_config:
-            if not isinstance(config, dict):
-                msg = "Type of each config in 'attack_config' must be dict, but got {}.".format(type(config))
-                LOGGER.error(TAG, msg)
-                raise TypeError(msg)
-            if {"params", "method"} != set(config.keys()):
-                msg = "Each config in attack_config must have keys 'method' and 'params'," \
-                      "but your key value is {}.".format(set(config.keys()))
-                LOGGER.error(TAG, msg)
-                raise KeyError(msg)
-            if str.lower(config["method"]) not in self.method_list:
-                msg = "Method {} is not support.".format(config["method"])
-                LOGGER.error(TAG, msg)
-                raise ValueError(msg)
+        check_config_params(attack_config)  # Verify attack config.
 
         features, labels = self._transform(dataset_train, dataset_test)
+
         for config in attack_config:
-            self.attack_list.append(get_attack_model(features, labels, config))
+            self.attack_list.append(get_attack_model(features, labels, config, n_jobs=self.n_jobs))
+
 
     def eval(self, dataset_train, dataset_test, metrics):
         """
diff --git a/tests/ut/python/diff_privacy/test_attacker.py b/tests/ut/python/diff_privacy/test_attacker.py
index 9046e2b..2c00901 100644
--- a/tests/ut/python/diff_privacy/test_attacker.py
+++ b/tests/ut/python/diff_privacy/test_attacker.py
@@ -35,7 +35,7 @@ def test_get_knn_model():
             "n_neighbors": [3],
         }
     }
-    knn_attacker = get_attack_model(features, labels, config_knn)
+    knn_attacker = get_attack_model(features, labels, config_knn, -1)
     pred = knn_attacker.predict(features)
     assert pred is not None
 
@@ -54,7 +54,7 @@ def test_get_lr_model():
             "C": np.logspace(-4, 2, 10),
         }
     }
-    lr_attacker = get_attack_model(features, labels, config_lr)
+    lr_attacker = get_attack_model(features, labels, config_lr, -1)
     pred = lr_attacker.predict(features)
     assert pred is not None
 
@@ -75,7 +75,7 @@ def test_get_mlp_model():
             "alpha": [0.0001, 0.001, 0.01],
         }
     }
-    mlpc_attacker = get_attack_model(features, labels, config_mlpc)
+    mlpc_attacker = get_attack_model(features, labels, config_mlpc, -1)
     pred = mlpc_attacker.predict(features)
     assert pred is not None
 
@@ -98,6 +98,6 @@ def test_get_rf_model():
             "min_samples_leaf": [1, 2, 4],
         }
     }
-    rf_attacker = get_attack_model(features, labels, config_rf)
+    rf_attacker = get_attack_model(features, labels, config_rf, -1)
     pred = rf_attacker.predict(features)
     assert pred is not None
diff --git a/tests/ut/python/diff_privacy/test_membership_inference.py b/tests/ut/python/diff_privacy/test_membership_inference.py
index c11d45a..093ce96 100644
--- a/tests/ut/python/diff_privacy/test_membership_inference.py
+++ b/tests/ut/python/diff_privacy/test_membership_inference.py
@@ -24,6 +24,7 @@ import numpy as np
 import mindspore.dataset as ds
 from mindspore import nn
 from mindspore.train import Model
+import mindspore.context as context
 
 from mindarmour.diff_privacy.evaluation.membership_inference import MembershipInference
 
@@ -31,6 +32,8 @@ sys.path.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "../"))
 from defenses.mock_net import Net
 
 
+context.set_context(mode=context.GRAPH_MODE)
+
 def dataset_generator(batch_size, batches):
     """mock training data."""
     data = np.random.randn(batches*batch_size, 1, 32, 32).astype(
@@ -51,7 +54,7 @@ def test_get_membership_inference_object():
     loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
     opt = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
     model = Model(network=net, loss_fn=loss, optimizer=opt)
-    inference_model = MembershipInference(model)
+    inference_model = MembershipInference(model, -1)
     assert isinstance(inference_model, MembershipInference)
 
 
@@ -65,7 +68,7 @@ def test_membership_inference_object_train():
     loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
     opt = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
     model = Model(network=net, loss_fn=loss, optimizer=opt)
-    inference_model = MembershipInference(model)
+    inference_model = MembershipInference(model, -1)
     assert isinstance(inference_model, MembershipInference)
 
     config = [{
@@ -95,7 +98,7 @@ def test_membership_inference_eval():
     loss = nn.SoftmaxCrossEntropyWithLogits(is_grad=False, sparse=True)
     opt = nn.Momentum(params=net.trainable_params(), learning_rate=0.1, momentum=0.9)
     model = Model(network=net, loss_fn=loss, optimizer=opt)
-    inference_model = MembershipInference(model)
+    inference_model = MembershipInference(model, -1)
     assert isinstance(inference_model, MembershipInference)
 
     batch_size = 16
-- 
GitLab