Unverified · Commit e2d56561 · authored by Qiao Longfei, committed by GitHub

Merge pull request #14889 from jacquesqiao/optimize-adam

adam optimizer support lazy mode
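
For context, a minimal usage sketch of the new flag. The toy network below is a hypothetical placeholder; only the `lazy_mode` argument is introduced by this PR:

```python
import paddle.fluid as fluid

# Hypothetical toy network: a sparse embedding lookup, the case lazy mode targets.
word = fluid.layers.data(name='word', shape=[1], dtype='int64')
emb = fluid.layers.embedding(input=word, size=[100000, 64], is_sparse=True)
loss = fluid.layers.reduce_mean(emb)

# With lazy_mode=True, Adam updates only the embedding rows that received
# gradients in the current mini-batch (see the semantics note in the
# optimizer.py docstring below).
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001, lazy_mode=True)
optimizer.minimize(loss)
```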
paddle/fluid/API.spec
@@ -376,7 +376,7 @@
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle/fluid/operators/adam_op.cc
@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
              "(float, default 1.0e-8) "
              "Constant for numerical stability")
         .SetDefault(1.0e-8f);
+    AddAttr<bool>(
+        "lazy_mode",
+        "(bool, default false) "
+        "only update the parameter that has gradient in sparse update")
+        .SetDefault(false);
     AddComment(R"DOC(
 Adam Optimizer.
paddle/fluid/operators/adam_op.h
@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
   const int64_t* rows_;
   int64_t row_numel_;
   int64_t row_count_;
+  bool lazy_mode_;

   SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                     const T* beta2_pow, const T* mom1, T* mom1_out,
                     const T* mom2, T* mom2_out, const T* lr, const T* grad,
                     const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel, int64_t row_count)
+                    int64_t row_numel, int64_t row_count, bool lazy_mode)
       : beta1_(beta1),
         beta2_(beta2),
         epsilon_(epsilon),
@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
         param_out_(param_out),
         rows_(rows),
         row_numel_(row_numel),
-        row_count_(row_count) {}
+        row_count_(row_count),
+        lazy_mode_(lazy_mode) {}

-  inline HOSTDEVICE void operator()(size_t i) const {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
-    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+  inline HOSTDEVICE void adam_update(size_t i, T g) const {
     // The following code is the same as dense
     T mom1 = moment1_[i];
     T mom2 = moment2_[i];
@@ -225,6 +223,17 @@ struct SparseAdamFunctor {
     moment2_out_[i] = mom2;
     param_out_[i] = p;
   }
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
+    if (lazy_mode_ && row_idx < 0) {
+      return;
+    } else {
+      T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+      adam_update(i, g);
+    }
+  }
 };

 template <typename DeviceContext, typename T>
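
To make the per-element dispatch above concrete, here is a NumPy sketch of what `operator()` does for one flat element index `i`, assuming `rows` is sorted (mirroring `math::BinarySearch`); `adam_update` stands in for the shared dense update:

```python
import numpy as np

def sparse_adam_element(i, rows, grad, row_numel, lazy_mode, adam_update):
    """Sketch of SparseAdamFunctor::operator() for one flat index i."""
    row_id = i // row_numel
    # Binary-search the sparse gradient's sorted row list; "not found"
    # corresponds to row_idx < 0 in the C++ code.
    pos = int(np.searchsorted(rows, row_id))
    found = pos < len(rows) and rows[pos] == row_id
    if lazy_mode and not found:
        return  # lazy mode: leave rows that received no gradient untouched
    # Dense semantics: a missing row still decays its moments with g = 0.
    g = grad[pos * row_numel + i % row_numel] if found else 0.0
    adam_update(i, g)
```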
@@ -240,6 +249,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
     using paddle::framework::LoDTensor;
     using paddle::operators::detail::Ref;

+    bool lazy_mode = ctx.Attr<bool>("lazy_mode");
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
@@ -351,11 +361,23 @@ class AdamOpKernel : public framework::OpKernel<T> {
           mom2_out.template mutable_data<T>(ctx.GetPlace()),
           lr.template data<T>(), grad_data, param.template data<T>(),
           param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-          grad_merge.rows().size());
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(ctx.device_context()),
-          param.numel());
-      for_range(functor);
+          grad_merge.rows().size(), lazy_mode);
+      VLOG(3) << "lazy_mode :" << lazy_mode;
+      if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
+        size_t row_count = grad_merge.rows().size();
+        std::vector<int64_t> cpu_rows(grad_merge.rows());
+        for (size_t row_index = 0; row_index < row_count; ++row_index) {
+          for (size_t offset = 0; offset < row_numel; ++offset) {
+            size_t i = cpu_rows[row_index] * row_numel + offset;
+            functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+          }
+        }
+      } else {
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            param.numel());
+        for_range(functor);
+      }
     } else {
       PADDLE_THROW("Variable type not supported by adam_op");
     }
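
A Python sketch of the intent of the CPU fast path above: in lazy mode the kernel walks only the `row_count * row_numel` elements that actually received gradients, instead of dispatching the functor across all `param.numel()` elements, which is where the speedup for large, sparsely updated tables comes from. `adam_update` again stands in for the functor's shared update:

```python
def lazy_cpu_update(cpu_rows, grad_data, row_numel, adam_update):
    # Visit only the parameter rows present in the merged sparse gradient.
    for row_index, row_id in enumerate(cpu_rows):
        for offset in range(row_numel):
            i = row_id * row_numel + offset  # flat index into the parameter
            adam_update(i, grad_data[row_index * row_numel + offset])
```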
python/paddle/fluid/optimizer.py
@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
         beta1 (float): The exponential decay rate for the 1st moment estimates.
         beta2 (float): The exponential decay rate for the 2nd moment estimates.
         epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as
-            fluid.regularizer.L2DecayRegularizer.
+        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
+        lazy_mode (bool, default False): The official Adam algorithm has two
+            moving-average accumulators, and both accumulators are updated at
+            every step. Every element of the two moving averages is updated in
+            both dense mode and sparse mode. If the parameter is very large,
+            the update may be very slow. Lazy mode updates only the elements
+            that have a gradient in the current mini-batch, so it is much
+            faster. However, this mode has different semantics from the
+            original Adam algorithm and may lead to different results.

     Examples:
         .. code-block:: python
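
The semantic difference described in that docstring, as a small NumPy comparison for the first-moment entry of a row that received no gradient this step (values are illustrative):

```python
import numpy as np

beta1 = 0.9
mom1 = np.array([0.5])  # first moment of a row with no gradient this step

# Default (dense) semantics: the row still sees g = 0, so the moment decays.
dense_mom1 = beta1 * mom1 + (1 - beta1) * 0.0  # -> array([0.45])

# Lazy semantics: the row is skipped entirely, so the moment is unchanged.
lazy_mom1 = mom1                               # -> array([0.5])
```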
@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
                  beta2=0.999,
                  epsilon=1e-8,
                  regularization=None,
-                 name=None):
+                 name=None,
+                 lazy_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
+        self._lazy_mode = lazy_mode

     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
             attrs={
                 "beta1": self._beta1,
                 "beta2": self._beta2,
-                "epsilon": self._epsilon
+                "epsilon": self._epsilon,
+                "lazy_mode": self._lazy_mode
             })

         return adam_op
python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
     return param_out, moment1_out, moment2_out


-def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
+                     lazy_mode):
     '''
     Simulate one step of the adam optimizer
     :param inputs: dict of inputs
@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
     moment2_out = np.zeros(shape=[height, row_numel])
     param_out = np.zeros(shape=[height, row_numel])

-    for idx, row_id in enumerate(rows):
+    def update_row(row_id, update_value):
         moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * np_grad[idx]
+                                                         ) * update_value
         moment2_out[row_id] = beta2 * moment2[row_id] + (
-            1 - beta2) * np.square(np_grad[idx])
+            1 - beta2) * np.square(update_value)
         lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
         param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
             np.sqrt(moment2_out[row_id]) + epsilon))
+
+    if lazy_mode:
+        for idx, row_id in enumerate(rows):
+            update_row(row_id, np_grad[idx])
+    else:
+        for row_id in range(param_out.shape[0]):
+            update_value = np.zeros(np_grad[0].shape).astype("float32")
+            if row_id in rows:
+                update_value = np_grad[rows.index(row_id)]
+            update_row(row_id, update_value)
+
     return param_out, moment1_out, moment2_out


 class TestSparseAdamOp(unittest.TestCase):
-    def setup(self, scope, place):
+    def setup(self, scope, place, lazy_mode):
         beta1 = 0.78
         beta2 = 0.836
         epsilon = 1e-4
@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
             'Beta2Pow': np.array([beta2**10]).astype("float32"),
             "LearningRate": np.full((1), 2.0).astype("float32")
         }
+        self.init_output = np.full((height, row_numel), 0.0).astype("float32")

         self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
         grad_selected_rows = scope.var('Grad').get_selected_rows()
@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
         self.sparse_inputs = ["Grad"]

-        param_out, mom1, mom2 = adam_step_sparse(
-            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
+                                                 height, rows, row_numel,
+                                                 np_array, lazy_mode)
         self.outputs = {
             "ParamOut": param_out,
             "Moment1Out": mom1,
             "Moment2Out": mom2
         }

-    def check_with_place(self, place):
+    def check_with_place(self, place, lazy_mode):
         scope = core.Scope()
-        self.setup(scope, place)
+        self.setup(scope, place, lazy_mode)

         op_args = dict()
+        op_args['lazy_mode'] = lazy_mode
         for key, np_array in self.dense_inputs.items():
             var = scope.var(key).get_tensor()
             var.set(np_array, place)
@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
             op_args[s] = s
         for s in self.outputs:
             var = scope.var(s).get_tensor()
-            var.set(self.outputs[s], place)
+            var.set(self.init_output, place)
             op_args[s] = s
         for k in self.attrs:
             op_args[k] = self.attrs[k]
@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
             actual = np.array(out_var)
             actual = actual.reshape([actual.size])
             np_array = np_array.reshape([np_array.size])
-            for idx, row_id in enumerate(self.rows):
-                j = 0
-                while j < self.row_numel:
-                    pos = row_id * self.row_numel + j
-                    self.assertLess((actual[pos] - np_array[pos]) / actual[pos],
-                                    0.00001)
-                    j += 1

-    def test_sparse_sgd(self):
+            for i in range(np_array.size):
+                self.assertLess((actual[i] - np_array[i]), 0.00001)
+
+    def test_sparse_adam(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
-            self.check_with_place(place)
+            for lazy_mode in (True, False):
+                self.check_with_place(place, lazy_mode)


 if __name__ == "__main__":