Commit 766bd529 authored by J jhjiangcs, committed by zhang wenhui

add optimizer:dpsgd,test=develop (#19915)

Parent 37f76407
...@@ -936,6 +936,14 @@ paddle.fluid.optimizer.AdamaxOptimizer.backward (ArgSpec(args=['self', 'loss', '
paddle.fluid.optimizer.AdamaxOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.AdamaxOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
paddle.fluid.optimizer.AdamaxOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
paddle.fluid.optimizer.DpsgdOptimizer ('paddle.fluid.optimizer.DpsgdOptimizer', ('document', '71113c30b66c0f4035b10ebd8af8c5ad'))
paddle.fluid.optimizer.DpsgdOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'clip', 'batch_size', 'sigma'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.DpsgdOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
paddle.fluid.optimizer.DpsgdOptimizer.apply_optimize (ArgSpec(args=['self', 'loss', 'startup_program', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '5c46d1926a40f1f873ffe9f37ac89dae'))
paddle.fluid.optimizer.DpsgdOptimizer.backward (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'callbacks'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'ba3a113d0229ff7bc9d39bda0a6d947f'))
paddle.fluid.optimizer.DpsgdOptimizer.get_opti_var_name_list (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.DpsgdOptimizer.load (ArgSpec(args=['self', 'stat_dict'], varargs=None, keywords=None, defaults=None), ('document', '649a92cf7f1ea28666fd00c4ea01acde'))
paddle.fluid.optimizer.DpsgdOptimizer.minimize (ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set', 'grad_clip'], varargs=None, keywords=None, defaults=(None, None, None, None)), ('document', 'b15cffad0903fc81af77a0580ceb2a9b'))
paddle.fluid.optimizer.DecayedAdagradOptimizer ('paddle.fluid.optimizer.DecayedAdagradOptimizer', ('document', 'e76838a8586bf2e58e6b5cdd2f67f780'))
paddle.fluid.optimizer.DecayedAdagradOptimizer.__init__ (ArgSpec(args=['self', 'learning_rate', 'decay', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.95, 1e-06, None, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
paddle.fluid.optimizer.DecayedAdagradOptimizer.apply_gradients (ArgSpec(args=['self', 'params_grads'], varargs=None, keywords=None, defaults=None), ('document', '80ea99c9af7ef5fac7e57fb302103610'))
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/optimizers/dpsgd_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class DpsgdOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput("Param"), true,
"Input(Param) of DpsgdOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("Grad"), true,
"Input(Grad) of DpsgdOp should not be null.");
PADDLE_ENFORCE_EQ(ctx->HasInput("LearningRate"), true,
"Input(LearningRate) of DpsgdOp should not be null.");
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Param").front(),
framework::proto::VarType::LOD_TENSOR,
"The input var %s's type should be LoDTensor, but the received is %s",
ctx->Inputs("Param").front(), ctx->GetInputsVarType("Param").front());
PADDLE_ENFORCE_EQ(
ctx->GetInputsVarType("Grad").front(),
framework::proto::VarType::LOD_TENSOR,
"The input var %s's type should be LoDTensor, but the received is %s",
ctx->Inputs("Grad").front(), ctx->GetInputsVarType("Grad").front());
PADDLE_ENFORCE_EQ(ctx->HasOutput("ParamOut"), true,
"Output(ParamOut) of DpsgdOp should not be null.");
auto lr_dims = ctx->GetInputDim("LearningRate");
PADDLE_ENFORCE_EQ(framework::product(lr_dims), 1,
"Learning rate should have 1 element");
auto param_dims = ctx->GetInputDim("Param");
PADDLE_ENFORCE_EQ(
param_dims, ctx->GetInputDim("Grad"),
"Param and Grad input of DpsgdOp should have same dimension");
ctx->SetOutputDim("ParamOut", param_dims);
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
return framework::OpKernelType(ctx.Input<Tensor>("Param")->type(),
ctx.GetPlace());
}
};
class DpsgdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("Param", "(Tensor) Input parameter");
AddInput("Grad", "(Tensor) Input gradient");
AddInput("LearningRate", "(Tensor) Learning rate");
AddOutput("ParamOut", "(Tensor) Output parameter");
AddAttr<float>("clip",
"(float, default 0.9) "
"Exponential decay rate for the "
"1st moment estimates.")
.SetDefault(10.0f);
AddAttr<float>("batch_size",
"(float, default 0.999) "
"exponential decay rate for the weighted "
"infinity norm estimates.")
.SetDefault(16.0f);
AddAttr<float>("sigma",
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0f);
AddComment(R"DOC(
Dpsgd Optimizer.
We implement the Dpsgd optimizer according to CCS16 paper -
Deep Learning with Differential Privacy.
Dpsgd updates:
CCS16 - Deep Learning with Differential Privacy.
[https://arxiv.org/abs/1607.00133]
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_WITHOUT_GRADIENT(dpsgd, ops::DpsgdOp, ops::DpsgdOpMaker);
REGISTER_OP_CPU_KERNEL(
dpsgd, ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::DpsgdOpKernel<paddle::platform::CPUDeviceContext, double>);
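For reference, a minimal NumPy sketch (not part of the patch) of the update this operator performs, assuming the attribute semantics documented above: clip is an L2-norm threshold on the gradient, sigma is the standard deviation of the added Gaussian noise, and batch_size divides the noise term; a single scalar noise sample is shared by all elements, matching the CPU kernel below.

import numpy as np

def dpsgd_update(param, grad, lr, clip, batch_size, sigma, rng=np.random):
    # Clip the gradient by its global L2 norm.
    l2_norm = np.sqrt(np.sum(grad * grad))
    scale = l2_norm / clip if l2_norm > clip else 1.0
    # One scalar Gaussian sample with standard deviation sigma.
    noise = rng.normal(loc=0.0, scale=sigma)
    # SGD step on the clipped, noise-perturbed gradient.
    return param - lr * (grad / scale + noise / batch_size)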
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <math.h>
#include <stdlib.h>
#include <ctime>
#include <iostream>
#include <random>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class DpsgdOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
const auto *param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Param").front(),
framework::ToTypeName(param_var->Type()));
const auto *grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.Inputs("Grad").front(),
framework::ToTypeName(grad_var->Type()));
const auto *learning_rate = ctx.Input<framework::Tensor>("LearningRate");
const auto *param = ctx.Input<framework::Tensor>("Param");
const auto *grad = ctx.Input<framework::Tensor>("Grad");
auto *param_out = ctx.Output<framework::Tensor>("ParamOut");
auto sz = param_out->numel();
PADDLE_ENFORCE_EQ(param->numel(), sz);
PADDLE_ENFORCE_EQ(grad->numel(), sz);
const T *lr = learning_rate->data<T>();
const T *param_data = param->data<T>();
const T *grad_data = grad->data<T>();
T *out_data = param_out->mutable_data<T>(ctx.GetPlace());
T clip = static_cast<T>(ctx.Attr<float>("clip"));
T batch_size = static_cast<T>(ctx.Attr<float>("batch_size"));
T sigma = static_cast<T>(ctx.Attr<float>("sigma"));
// compute clipping
float l2_norm = 0.0;
for (int64_t i = 0; i < grad->numel(); ++i) {
l2_norm = l2_norm + grad_data[i] * grad_data[i];
}
l2_norm = std::sqrt(l2_norm);
float scale = 1.0;
if (l2_norm > clip) {
scale = l2_norm / clip;
}
// generate gaussian noise.
// [https://en.wikipedia.org/wiki/Box-Muller_transform]
float V1, V2, S;
float X;
float mu = 0.0;
float U1, U2;
unsigned seed = (unsigned int)(time(NULL));
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<T> dist(0.0, 1.0);
do {
U1 = dist(engine);
U2 = dist(engine);
V1 = 2 * U1 - 1;
V2 = 2 * U2 - 1;
S = V1 * V1 + V2 * V2;
} while (S >= 1 || S == 0);
X = V1 * sqrt(-2 * log(S) / S);
float gaussian_noise = mu + X * sigma;
// update parameters
for (int64_t i = 0; i < grad->numel(); ++i) {
out_data[i] =
param_data[i] -
lr[0] * (grad_data[i] / scale + gaussian_noise / batch_size);
}
// CCS16 - Deep Learning with Differential Privacy.
// [https://arxiv.org/abs/1607.00133]
} // Compute
}; // class
} // namespace operators
} // namespace paddle
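The kernel above draws its Gaussian sample with the polar (Marsaglia) variant of the Box-Muller transform. A standalone sketch of that sampling step, written as a hypothetical helper for readers unfamiliar with the rejection loop:

import math
import random

def standard_normal(rng=random):
    # Rejection-sample a point inside the unit circle, then transform it
    # into one standard-normal variate (polar Box-Muller / Marsaglia method).
    while True:
        v1 = 2.0 * rng.random() - 1.0
        v2 = 2.0 * rng.random() - 1.0
        s = v1 * v1 + v2 * v2
        if 0.0 < s < 1.0:
            return v1 * math.sqrt(-2.0 * math.log(s) / s)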
...@@ -35,6 +35,7 @@ OPTIMIZER_OPS = [
'adagrad',
'adam',
'adamax',
'dpsgd',
'decayed_adagrad',
'adadelta',
'rmsprop',
......
...@@ -39,13 +39,13 @@ from .wrapped_decorator import signature_safe_contextmanager
from .. import compat as cpt
__all__ = [
'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'Dpsgd', 'DecayedAdagrad',
'Ftrl', 'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer',
'AdamOptimizer', 'AdamaxOptimizer', 'DpsgdOptimizer',
'DecayedAdagradOptimizer', 'RMSPropOptimizer', 'FtrlOptimizer', 'Adadelta',
'ModelAverage', 'LarsMomentum', 'LarsMomentumOptimizer',
'DGCMomentumOptimizer', 'LambOptimizer', 'ExponentialMovingAverage',
'PipelineOptimizer', 'LookaheadOptimizer', 'RecomputeOptimizer'
]
...@@ -1605,6 +1605,85 @@ class AdamaxOptimizer(Optimizer):
stop_gradient=True)
class DpsgdOptimizer(Optimizer):
"""
We implement the Dpsgd optimizer according to the CCS16 paper -
Deep Learning with Differential Privacy (https://arxiv.org/abs/1607.00133).
Examples:
.. code-block:: python
import paddle.fluid as fluid
import numpy
# First create the Executor.
place = fluid.CPUPlace() # fluid.CUDAPlace(0)
exe = fluid.Executor(place)
train_program = fluid.Program()
startup_program = fluid.Program()
with fluid.program_guard(train_program, startup_program):
data = fluid.layers.data(name='X', shape=[1], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10)
loss = fluid.layers.mean(hidden)
optimizer = fluid.optimizer.Dpsgd(learning_rate=0.01, clip=10.0, batch_size=16.0, sigma=1.0)
optimizer.minimize(loss)
# Run the startup program once and only once.
exe.run(startup_program)
x = numpy.random.random(size=(10, 1)).astype('float32')
outs = exe.run(program=train_program,
feed={'X': x},
fetch_list=[loss.name])
Args:
learning_rate (float|Variable): the learning rate used to update parameters. \
Can be a float value or a Variable with one float value as data element.
clip (float): the L2-norm threshold used to clip each gradient.
batch_size (float): the batch size used to scale the injected Gaussian noise.
sigma (float): the standard deviation of the injected Gaussian noise.
Notes:
Currently, DpsgdOptimizer doesn't support sparse parameter optimization.
"""
def __init__(self,
learning_rate=0.001,
clip=0.9,
batch_size=0.999,
sigma=1e-8):
assert learning_rate is not None
assert clip is not None
assert batch_size is not None
assert sigma is not None
super(DpsgdOptimizer, self).__init__(learning_rate=learning_rate)
self.type = "dpsgd"
self._clip = clip
self._batch_size = batch_size
self._sigma = sigma
def _append_optimize_op(self, block, param_and_grad):
assert isinstance(block, framework.Block)
# create the dpsgd optimize op
dpsgd_op = block.append_op(
type=self.type,
inputs={
"Param": param_and_grad[0],
"Grad": param_and_grad[1],
"LearningRate": self._create_param_lr(param_and_grad)
},
outputs={"ParamOut": param_and_grad[0]},
attrs={
"clip": self._clip,
"batch_size": self._batch_size,
"sigma": self._sigma
},
stop_gradient=True)
return dpsgd_op
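# Editorial sketch (an assumption that mirrors the CPU kernel in
# paddle/fluid/operators/optimizers/dpsgd_op.h): the appended "dpsgd" op
# updates each parameter in place as
#
#     scale     = max(1, ||grad||_2 / clip)
#     param_out = param - lr * (grad / scale + noise / batch_size)
#
# where noise is a single Gaussian sample with standard deviation sigma,
# shared by all elements of the parameter.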
class DecayedAdagradOptimizer(Optimizer):
"""
**Decayed Adagrad Optimizer**
...@@ -2258,6 +2337,7 @@ Momentum = MomentumOptimizer
Adagrad = AdagradOptimizer
Adam = AdamOptimizer
Adamax = AdamaxOptimizer
Dpsgd = DpsgdOptimizer
DecayedAdagrad = DecayedAdagradOptimizer
Adadelta = AdadeltaOptimizer
RMSProp = RMSPropOptimizer
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
class TestDpsgdOp(OpTest):
def setUp(self):
'''Test Dpsgd Operator with supplied attributes
'''
self.op_type = "dpsgd"
param = np.random.uniform(-1, 1, (102, 105)).astype("float32")
grad = np.random.uniform(-1, 1, (102, 105)).astype("float32")
learning_rate = 0.001
clip = 10000.0
batch_size = 16.0
sigma = 0.0
self.inputs = {
'Param': param,
'Grad': grad,
'LearningRate': np.array([learning_rate]).astype("float32")
}
self.attrs = {'clip': clip, 'batch_size': batch_size, 'sigma': sigma}
param_out = dpsgd_step(self.inputs, self.attrs)
self.outputs = {'ParamOut': param_out}
def test_check_output(self):
self.check_output()
def dpsgd_step(inputs, attributes):
'''
Simulate one step of the dpsgd optimizer
:param inputs: dict of inputs
:param attributes: dict of attributes
:return: the updated parameter
'''
param = inputs['Param']
grad = inputs['Grad']
lr = inputs['LearningRate']
clip = attributes['clip']
batch_size = attributes['batch_size']
sigma = attributes['sigma']
# clip, batch_size and sigma are not used: the test sets a very large clip
# and sigma == 0, so the update reduces to plain SGD (see the sketch below).
param_out = param - lr * grad
return param_out
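# A hypothetical reference step mirroring the C++ kernel, with a
# caller-supplied scalar Gaussian noise sample (editorial sketch, not part
# of the original test):
def dpsgd_step_full(inputs, attributes, noise=0.0):
    param = inputs['Param']
    grad = inputs['Grad']
    lr = inputs['LearningRate']
    clip = attributes['clip']
    batch_size = attributes['batch_size']
    l2_norm = np.sqrt(np.sum(grad * grad))
    scale = l2_norm / clip if l2_norm > clip else 1.0
    return param - lr * (grad / scale + noise / batch_size)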
if __name__ == "__main__":
unittest.main()
...@@ -408,6 +408,47 @@ class TestAdamaxOptimizer(unittest.TestCase):
self.assertAlmostEqual(init_ops[0].attr('value'), learning_rate)
class TestDpsgdOptimizer(unittest.TestCase):
def test_dpsgd_optimizer(self):
def check_dpsgd_optimizer(optimizer_attr):
init_program = framework.Program()
program = framework.Program()
block = program.global_block()
mul_x = block.create_parameter(
dtype="float32",
shape=[5, 10],
lod_level=0,
name="mul.x",
optimize_attr=optimizer_attr)
mul_y = block.create_var(
dtype="float32", shape=[10, 8], lod_level=0, name="mul.y")
mul_out = block.create_var(
dtype="float32", shape=[5, 8], lod_level=0, name="mul.out")
block.append_op(
type="mul",
inputs={"X": mul_x,
"Y": mul_y},
outputs={"Out": mul_out},
attrs={"x_num_col_dims": 1})
mean_out = block.create_var(
dtype="float32", shape=[1], lod_level=0, name="mean.out")
block.append_op(
type="mean", inputs={"X": mul_out}, outputs={"Out": mean_out})
dpsgd_optimizer = optimizer.DpsgdOptimizer(
learning_rate=0.01, clip=100.0, batch_size=16.0, sigma=0.0)
opts, _ = dpsgd_optimizer.minimize(mean_out, init_program)
return opts
opts = check_dpsgd_optimizer({
'learning_rate': 1.1,
'clip': 100.0,
'batch_size': 16.0,
'sigma': 4.0
})
self.assertEqual(len(opts), 2)
self.assertEqual([op.type for op in opts], ["scale", "dpsgd"])
class TestDecayedAdagradOptimizer(unittest.TestCase):
class MockDecayedAdagrad(optimizer.DecayedAdagradOptimizer):
def get_accumulators(self):
......