diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index c6bafb64405cd65f60d66071dea31bc85061578c..12006eb5145964df755727d5592e82c078e62c33 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -936,6 +936,14 @@ paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', '
 paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
+paddle.fluid.optimizer.DpsgdOptimizer ('paddle.fluid.optimizer.DpsgdOptimizer', ('document', '71113c30b66c0f4035b10ebd8af8c5ad'))
+paddle.fluid.optimizer.DpsgdOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'clip', 'batch_size', 'sigma'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.DpsgdOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
+paddle.fluid.optimizer.DpsgdOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
+paddle.fluid.optimizer.DpsgdOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
+paddle.fluid.optimizer.DpsgdOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
+paddle.fluid.optimizer.DpsgdOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
+paddle.fluid.optimizer.DpsgdOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer ('paddle.fluid.optimizer.DecayedAdagradOptimizer', ('document', 'e76838a8586bf2e58e6b5cdd2f67f780'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.cc b/paddle/fluid/operators/optimizers/dpsgd_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..f263e67593bbd15f062648e5f09627d5fea64f0d
--- /dev/null
+++ b/paddle/fluid/operators/optimizers/dpsgd_op.cc
@@ -0,0 +1,107 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/optimizers/dpsgd_op.h"
+
+namespace paddle {
+namespace operators {
+
+using Tensor = framework::Tensor;
+class DpsgdOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
+                      "Input(Param) of DpsgdOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
+                      "Input(Grad) of DpsgdOp should not be null.");
+    PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
+                      "Input(LearningRate) of DpsgdOp should not be null.");
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Param").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
+    PADDLE_ENFORCE_EQ(
+        ctx->GetInputsVarType("Grad").front(),
+        framework::proto::VarType::LOD_TENSOR,
+        "The input var's type should be LoDTensor, but the received is %s",
+        ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
+
+    PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
+                      "Output(ParamOut) of DpsgdOp should not be null.");
+
+    auto lr_dims = ctx->GetInputDim("LearningRate");
+    PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
+                      "Learning rate should have 1 dimension");
+    auto param_dims = ctx->GetInputDim("Param");
+    PADDLE_ENFORCE_EQ(
+        param_dims, ctx->GetInputDim("Grad"),
+        "Param and Grad input of DpsgdOp should have same dimension");
+
+    ctx->SetOutputDim("ParamOut", param_dims);
+  }
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
+                                   ctx.GetPlace());
+  }
+};
+
+class DpsgdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Param", "(Tensor) Input parameter");
+    AddInput("Grad", "(Tensor) Input gradient");
+    AddInput("LearningRate", "(Tensor) Learning rate");
+
+    AddOutput("ParamOut", "(Tensor) Output parameter");
+
+    AddAttr<float>("clip",
+                   "(float, default 10.0) "
+                   "L2-norm threshold used to clip each gradient; gradients "
+                   "with a larger norm are scaled down to this threshold.")
+        .SetDefault(10.0f);
+    AddAttr<float>("batch_size",
+                   "(float, default 16.0) "
+                   "Number of examples contributing to each update; the "
+                   "Gaussian noise is divided by this value.")
+        .SetDefault(16.0f);
+    AddAttr<float>("sigma",
+                   "(float, default 1.0) "
+                   "Standard deviation of the Gaussian noise added to the "
+                   "clipped gradient.")
+        .SetDefault(1.0f);
+    AddComment(R"DOC(
+Dpsgd Optimizer.
+
+We implement the Dpsgd optimizer according to CCS16 paper -
+Deep Learning with Differential Privacy.
+
+Dpsgd update: clip each gradient to an L2 norm of at most `clip`, add
+Gaussian noise (standard deviation `sigma`) scaled by 1 / `batch_size`,
+then apply a plain SGD step.
+
+Reference: CCS16 - Deep Learning with Differential Privacy.
+[https://arxiv.org/abs/1607.00133]
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    dpsgd, ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, float>,
+    ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, double>);
diff --git a/paddle/fluid/operators/optimizers/dpsgd_op.h b/paddle/fluid/operators/optimizers/dpsgd_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..4eba7fed7e98cdb2065ed8245eca898388f23d0f
--- /dev/null
+++ b/paddle/fluid/operators/optimizers/dpsgd_op.h
@@ -0,0 +1,114 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <math.h>
+#include <time.h>
+#include <random>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class DpsgdOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext &ctx) const override {
+    const auto *param_var = ctx.InputVar("Param");
+    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
+                      "The Var(%s)'s type should be LoDTensor, "
+                      "but the received is %s",
+                      ctx.Inputs("Param").front(),
+                      framework::ToTypeName(param_var->Type()));
+
+    const auto *grad_var = ctx.InputVar("Grad");
+    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
+                      "The Var(%s)'s type should be LoDTensor, "
+                      "but the received is %s",
+                      ctx.Inputs("Grad").front(),
+                      framework::ToTypeName(grad_var->Type()));
+
+    const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+
+    const auto *param = ctx.Input<framework::Tensor>("Param");
+    const auto *grad = ctx.Input<framework::Tensor>("Grad");
+
+    auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
+
+    auto sz = param_out->numel();
+    PADDLE_ENFORCE_EQ(param->numel(), sz);
+    PADDLE_ENFORCE_EQ(grad->numel(), sz);
+
+    const T *lr = learning_rate->data<T>();
+    const T *param_data = param->data<T>();
+    const T *grad_data = grad->data<T>();
+
+    T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
+
+    T clip = static_cast<T>(ctx.Attr<float>("clip"));
+    T batch_size = static_cast<T>(ctx.Attr<float>("batch_size"));
+    T sigma = static_cast<T>(ctx.Attr<float>("sigma"));
+
+    // compute the global L2 norm of the gradient for clipping
+    float l2_norm = 0.0;
+    for (int64_t i = 0; i < grad->numel(); ++i) {
+      l2_norm = l2_norm + grad_data[i] * grad_data[i];
+    }
+    l2_norm = std::sqrt(l2_norm);
+
+    float scale = 1.0;
+    if (l2_norm > clip) {
+      scale = l2_norm / clip;
+    }
+
+    // generate gaussian noise.
+    // [https://en.wikipedia.org/wiki/Box-Muller_transform]
+    float V1, V2, S;
+    float X;
+    float mu = 0.0;
+    float U1, U2;
+    unsigned seed = (unsigned int)(time(NULL));
+    std::minstd_rand engine;
+    engine.seed(seed);
+    std::uniform_real_distribution<float> dist(0.0, 1.0);
+    do {
+      U1 = dist(engine);
+      U2 = dist(engine);
+      V1 = 2 * U1 - 1;
+      V2 = 2 * U2 - 1;
+      S = V1 * V1 + V2 * V2;
+    } while (S >= 1 || S == 0);
+
+    X = V1 * sqrt(-2 * log(S) / S);
+
+    float gaussian_noise = mu + X * sigma;
+
+    // update parameters
+    for (int64_t i = 0; i < grad->numel(); ++i) {
+      out_data[i] =
+          param_data[i] -
+          lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size);
+    }
+    // CCS16 - Deep Learning with Differential Privacy.
+    // [https://arxiv.org/abs/1607.00133]
+  }  // Compute
+};   // class
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
index 4c21427e6b91f164aab07947f787ded6ae2cca02..78697a2023b68bedecc39a713b72e217523a41f5 100644
--- a/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
+++ b/python/paddle/fluid/contrib/slim/graph/graph_wrapper.py
@@ -35,6 +35,7 @@ OPTIMIZER_OPS = [
     'adagrad',
     'adam',
     'adamax',
+    'dpsgd',
     'decayed_adagrad',
     'adadelta',
     'rmsprop',
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index e0d68eb2d43d69cf44cbdaf6a91b8965bfe336da..55306724287b91d486fc1a306464e1153bfd4306 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -39,13 +39,13 @@ from .wrapped_decorator import signature_safe_contextmanager
 from .. import compat as cpt
 
 __all__ = [
-    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
-    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
-    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'LarsMomentum',
-    'LarsMomentumOptimizer', 'DGCMomentumOptimizer', 'LambOptimizer',
-    'ExponentialMovingAverage', 'PipelineOptimizer', 'LookaheadOptimizer',
-    'RecomputeOptimizer'
+    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
+    'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer',
+    'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer',
+    'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta',
+    'ModelAverage', 'LarsMomentum', 'LarsMomentumOptimizer',
+    'DGCMomentumOptimizer', 'LambOptimizer', 'ExponentialMovingAverage',
+    'PipelineOptimizer', 'LookaheadOptimizer', 'RecomputeOptimizer'
 ]
 
 
@@ -1605,6 +1605,85 @@ class AdamaxOptimizer(Optimizer):
             stop_gradient=True)
 
 
+class DpsgdOptimizer(Optimizer):
+    """
+    We implement the Dpsgd optimizer according to CCS16 paper -
+    Deep Learning with Differential Privacy.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle.fluid as fluid
+            import numpy
+
+            # First create the Executor.
+            place = fluid.CPUPlace()  # fluid.CUDAPlace(0)
+            exe = fluid.Executor(place)
+
+            train_program = fluid.Program()
+            startup_program = fluid.Program()
+            with fluid.program_guard(train_program, startup_program):
+                data = fluid.layers.data(name='X', shape=[1], dtype='float32')
+                hidden = fluid.layers.fc(input=data, size=10)
+                loss = fluid.layers.mean(hidden)
+                optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
+                optimizer.minimize(loss)
+
+            # Run the startup program once and only once.
+            exe.run(startup_program)
+
+            x = numpy.random.random(size=(10, 1)).astype('float32')
+            outs = exe.run(program=train_program,
+                           feed={'X': x},
+                           fetch_list=[loss.name])
+
+    Args:
+        learning_rate (float|Variable): the learning rate used to update parameters. \
+            Can be a float value or a Variable with one float value as data element.
+        clip (float): L2-norm threshold used to clip each gradient.
+        batch_size (float): number of examples per update; the Gaussian noise is divided by this value.
+        sigma (float): standard deviation of the Gaussian noise added to the clipped gradient.
+
+    Notes:
+        Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
+    """
+
+    def __init__(self,
+                 learning_rate=0.001,
+                 clip=0.9,
+                 batch_size=0.999,
+                 sigma=1e-8):
+        assert learning_rate is not None
+        assert clip is not None
+        assert batch_size is not None
+        assert sigma is not None
+        super(DpsgdOptimizer, self).__init__(learning_rate=learning_rate)
+        self.type = "dpsgd"
+        self._clip = clip
+        self._batch_size = batch_size
+        self._sigma = sigma
+
+    def _append_optimize_op(self, block, param_and_grad):
+        assert isinstance(block, framework.Block)
+
+        # create the dpsgd optimize op
+        dpsgd_op = block.append_op(
+            type=self.type,
+            inputs={
+                "Param": param_and_grad[0],
+                "Grad": param_and_grad[1],
+                "LearningRate": self._create_param_lr(param_and_grad)
+            },
+            outputs={"ParamOut": param_and_grad[0]},
+            attrs={
+                "clip": self._clip,
+                "batch_size": self._batch_size,
+                "sigma": self._sigma
+            },
+            stop_gradient=True)
+
+        return dpsgd_op
+
+
 class DecayedAdagradOptimizer(Optimizer):
     """
     **Decayed Adagrad Optimizer**
@@ -2258,6 +2337,7 @@ Momentum = MomentumOptimizer
 Adagrad = AdagradOptimizer
 Adam = AdamOptimizer
 Adamax = AdamaxOptimizer
+Dpsgd = DpsgdOptimizer
 DecayedAdagrad = DecayedAdagradOptimizer
 Adadelta = AdadeltaOptimizer
 RMSProp = RMSPropOptimizer
diff --git a/python/paddle/fluid/tests/unittests/test_dpsgd_op.py b/python/paddle/fluid/tests/unittests/test_dpsgd_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..48bf786e139dd493fcb3ed6122b4d617a5f5bf2f
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_dpsgd_op.py
@@ -0,0 +1,73 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestDpsgdOp(OpTest):
+    def setUp(self):
+        '''Test Dpsgd Operator with supplied attributes
+        '''
+        self.op_type = "dpsgd"
+        param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+        grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
+
+        learning_rate = 0.001
+        clip = 10000.0
+        batch_size = 16.0
+        sigma = 0.0
+
+        self.inputs = {
+            'Param': param,
+            'Grad': grad,
+            'LearningRate': np.array([learning_rate]).astype("float32")
+        }
+
+        self.attrs = {'clip': clip, 'batch_size': batch_size, 'sigma': sigma}
+
+        param_out = dpsgd_step(self.inputs, self.attrs)
+
+        self.outputs = {'ParamOut': param_out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+def dpsgd_step(inputs, attributes):
+    '''
+    Simulate one step of the dpsgd optimizer
+    :param inputs: dict of inputs
+    :param attributes: dict of attributes
+    :return: updated parameter array
+    '''
+    param = inputs['Param']
+    grad = inputs['Grad']
+    lr = inputs['LearningRate']
+
+    # With sigma == 0 there is no noise, and clip is large enough that no
+    # clipping occurs, so the reference update reduces to plain SGD.
+    param_out = param - lr * grad
+
+    return param_out
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_optimizer.py b/python/paddle/fluid/tests/unittests/test_optimizer.py
index 9761698e991e05723afd75f2eb3c21ad7472750f..1c3fd17fd284852ba7aeedaa87ca07e74fdd23e5 100644
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -408,6 +408,47 @@ class TestAdamaxOptimizer(unittest.TestCase):
         self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
 
 
+class TestDpsgdOptimizer(unittest.TestCase):
+    def test_dpsgd_optimizer(self):
+        def check_dpsgd_optimizer(optimizer_attr):
+            init_program = framework.Program()
+            program = framework.Program()
+            block = program.global_block()
+            mul_x = block.create_parameter(
+                dtype="float32",
+                shape=[5, 10],
+                lod_level=0,
+                name="mul.x",
+                optimize_attr=optimizer_attr)
+            mul_y = block.create_var(
+                dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
+            mul_out = block.create_var(
+                dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
+            block.append_op(
+                type="mul",
+                inputs={"X": mul_x,
+                        "Y": mul_y},
+                outputs={"Out": mul_out},
+                attrs={"x_num_col_dims": 1})
+            mean_out = block.create_var(
+                dtype="float32", shape=[1], lod_level=0, name="mean.out")
+            block.append_op(
+                type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
+            dpsgd_optimizer = optimizer.DpsgdOptimizer(
+                learning_rate=0.01, clip=100.0, batch_size=16.0, sigma=0.0)
+            opts, _ = dpsgd_optimizer.minimize(mean_out, init_program)
+            return opts
+
+        opts = check_dpsgd_optimizer({
+            'learning_rate': 1.1,
+            'clip': 100.0,
+            'batch_size': 16.0,
+            'sigma': 4.0
+        })
+        self.assertEqual(len(opts), 2)
+        self.assertEqual([op.type for op in opts], ["scale", "dpsgd"])
+
+
 class TestDecayedAdagradOptimizer(unittest.TestCase):
     class MockDecayedAdagrad(optimizer.DecayedAdagradOptimizer):
         def get_accumulators(self):
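Reviewer note: the update computed by the dpsgd kernel can be summarized with a short NumPy sketch. This is only an illustration of the algorithm under the attributes above; the helper name dpsgd_update and its signature are invented for this note and are not part of the PR.

import numpy as np

def dpsgd_update(param, grad, lr, clip, batch_size, sigma, rng=None):
    # One DPSGD step (CCS16, Abadi et al.): clip the gradient by its global
    # L2 norm, add Gaussian noise with standard deviation sigma scaled by
    # 1 / batch_size, then apply a plain SGD step.
    rng = rng or np.random.default_rng()
    l2_norm = np.sqrt(np.sum(grad * grad))
    scale = l2_norm / clip if l2_norm > clip else 1.0
    noise = rng.normal(loc=0.0, scale=sigma)  # single scalar sample, as in the C++ kernel
    return param - lr * (grad / scale + noise / batch_size)

# With sigma=0 and a large clip the step reduces to vanilla SGD, which is
# exactly the reference that test_dpsgd_op.py checks against.
param = np.random.uniform(-1, 1, (3, 4)).astype("float32")
grad = np.random.uniform(-1, 1, (3, 4)).astype("float32")
out = dpsgd_update(param, grad, lr=0.001, clip=1e4, batch_size=16.0, sigma=0.0)
assert np.allclose(out, param - 0.001 * grad)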