Commit e0df9f23 authored by Qiao Longfei

merge lazy mode

......@@ -367,7 +367,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
......
......@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddAttr<bool>(
"lazy_mode",
"(bool, default false) "
"only update the parameter that has gradient in sparse update")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
......
......@@ -178,12 +178,13 @@ struct SparseAdamFunctor {
const int64_t* rows_;
int64_t row_numel_;
int64_t row_count_;
bool lazy_mode_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows,
int64_t row_numel, int64_t row_count)
int64_t row_numel, int64_t row_count, bool lazy_mode)
: beta1_(beta1),
beta2_(beta2),
epsilon_(epsilon),
......@@ -199,13 +200,10 @@ struct SparseAdamFunctor {
param_out_(param_out),
rows_(rows),
row_numel_(row_numel),
row_count_(row_count) {}
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
row_count_(row_count),
lazy_mode_(lazy_mode) {}
inline HOSTDEVICE void adam_update(size_t i, T g) const {
// The following code is the same as dense
T mom1 = moment1_[i];
T mom2 = moment2_[i];
......@@ -226,6 +224,17 @@ struct SparseAdamFunctor {
moment2_out_[i] = mom2;
param_out_[i] = p;
}
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
if (lazy_mode_ && row_idx < 0) {
return;
} else {
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
adam_update(i, g);
}
}
};
template <typename DeviceContext, typename T>
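The branch added to `operator()` above decides which parameter rows get an Adam step. Here is a rough Python sketch of that row-selection logic (all names below are invented for illustration; this is not the kernel itself):

```python
def sparse_rows_to_update(param_height, grad_rows, lazy_mode):
    """Return the parameter row ids that receive an Adam update.

    grad_rows holds the row ids that actually have a gradient in this
    mini-batch (the SelectedRows index). In lazy mode only those rows are
    touched; otherwise every row is updated, using a zero gradient for
    rows not in grad_rows (mirroring `row_idx >= 0 ? ... : 0` above).
    """
    if lazy_mode:
        return sorted(set(grad_rows))
    return list(range(param_height))


# Example: a 6-row embedding where only rows 1 and 4 appear in the batch.
print(sparse_rows_to_update(6, [1, 4], lazy_mode=True))   # [1, 4]
print(sparse_rows_to_update(6, [1, 4], lazy_mode=False))  # [0, 1, 2, 3, 4, 5]
```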
......@@ -241,6 +250,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref;
bool lazy_mode = ctx.Attr<bool>("lazy_mode");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
......@@ -352,17 +362,27 @@ class AdamOpKernel : public framework::OpKernel<T> {
mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size());
int inner_op_parallelism = FLAGS_inner_op_parallelism;
if (inner_op_parallelism > 1 &&
grad_merge.rows().size(), lazy_mode);
VLOG(3) << "lazy_mode :" << lazy_mode;
if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
size_t row_count = grad_merge.rows().size();
std::vector<int64_t> cpu_rows(grad_merge.rows());
for (size_t row_index = 0; row_index < row_count; ++row_index) {
for (size_t offset = 0; offset < row_numel; ++offset) {
size_t i = cpu_rows[row_index] * row_numel + offset;
functor.adam_update(i, grad_data[row_index * row_numel + offset]);
}
}
} else if (FLAGS_inner_op_parallelism > 1 &&
FLAGS_min_param_size_to_use_multithread > 0 &&
param.numel() > FLAGS_min_param_size_to_use_multithread) {
VLOG(3) << "use multi thread, inner_op_parallelism="
<< inner_op_parallelism << " min_param_size_to_use_multithread="
<< FLAGS_inner_op_parallelism
<< " min_param_size_to_use_multithread="
<< FLAGS_min_param_size_to_use_multithread;
std::vector<std::future<void>> fs;
int64_t block_size = param.numel() / inner_op_parallelism;
for (int i = 0; i < inner_op_parallelism; ++i) {
int64_t block_size = param.numel() / FLAGS_inner_op_parallelism;
for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
int64_t start = i * block_size;
int64_t end = (i + 1) * block_size;
if (end > param.numel()) {
......
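The `else if` branch above splits the dense update across worker threads. A small sketch of the block partitioning it performs; the tail handling is elided in the hunk, so assigning the remainder to the last block here is an assumption for illustration:

```python
def partition_param(numel, num_threads):
    """Split [0, numel) into num_threads contiguous [start, end) blocks,
    one per worker thread. The visible code computes start/end from
    block_size and clamps end to numel; giving any leftover elements to
    the last block is assumed here."""
    block_size = numel // num_threads
    blocks = []
    for i in range(num_threads):
        start = i * block_size
        end = numel if i == num_threads - 1 else min((i + 1) * block_size, numel)
        blocks.append((start, end))
    return blocks


# Example: 10 elements over 3 threads -> [(0, 3), (3, 6), (6, 10)]
print(partition_param(10, 3))
```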
......@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
beta1 (float): The exponential decay rate for the 1st moment estimates.
beta2 (float): The exponential decay rate for the 2nd moment estimates.
epsilon (float): a small float value for numerical stability.
regularization: A Regularizer, such as
fluid.regularizer.L2DecayRegularizer.
regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
name: An optional name prefix.
lazy_mode (bool, default False): The official Adam algorithm keeps two moving-average
accumulators, and both are updated at every step. In both dense mode and sparse mode,
every element of the two moving averages is updated. If the parameter is very large,
the update may be very slow. Lazy mode only updates the elements that have gradients
in the current mini-batch, so it is much faster. However, this mode has different
semantics from the original Adam algorithm and may lead to different results.
Examples:
.. code-block:: python
......@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
beta2=0.999,
epsilon=1e-8,
regularization=None,
name=None):
name=None,
lazy_mode=False):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
......@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
self._lazy_mode = lazy_mode
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
attrs={
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon
"epsilon": self._epsilon,
"lazy_mode": self._lazy_mode
})
return adam_op
......
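For context, a minimal usage sketch of the new flag. The toy network and variable names below are placeholders, not part of this commit:

```python
import paddle.fluid as fluid

# Hypothetical training snippet: a sparse embedding model where only a few
# rows receive gradients per mini-batch, which is the case lazy_mode targets.
ids = fluid.layers.data(name='ids', shape=[1], dtype='int64')
label = fluid.layers.data(name='label', shape=[1], dtype='float32')
emb = fluid.layers.embedding(input=ids, size=[100000, 64], is_sparse=True)
fc = fluid.layers.fc(input=emb, size=1)
avg_cost = fluid.layers.mean(fluid.layers.square_error_cost(fc, label))

optimizer = fluid.optimizer.AdamOptimizer(
    learning_rate=0.001,
    beta1=0.9,
    beta2=0.999,
    epsilon=1e-8,
    lazy_mode=True)  # skip rows that have no gradient in the mini-batch
optimizer.minimize(avg_cost)
```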
......@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
lazy_mode):
'''
Simulate one step of the adam optimizer
:param inputs: dict of inputs
......@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
moment2_out = np.zeros(shape=[height, row_numel])
param_out = np.zeros(shape=[height, row_numel])
for idx, row_id in enumerate(rows):
def update_row(row_id, update_value):
moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
) * np_grad[idx]
) * update_value
moment2_out[row_id] = beta2 * moment2[row_id] + (
1 - beta2) * np.square(np_grad[idx])
1 - beta2) * np.square(update_value)
lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
np.sqrt(moment2_out[row_id]) + epsilon))
if lazy_mode:
for idx, row_id in enumerate(rows):
update_row(row_id, np_grad[idx])
else:
for row_id in range(param_out.shape[0]):
update_value = np.zeros(np_grad[0].shape).astype("float32")
if row_id in rows:
update_value = np_grad[rows.index(row_id)]
update_row(row_id, update_value)
return param_out, moment1_out, moment2_out
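For reference, the per-row step that `update_row` implements is the standard bias-corrected Adam update, with g_t the row's gradient (zero for rows without one when lazy_mode is off):

```latex
m_t      = \beta_1 m_{t-1} + (1 - \beta_1)\, g_t
v_t      = \beta_2 v_{t-1} + (1 - \beta_2)\, g_t^2
lr_t     = lr \cdot \sqrt{1 - \beta_2^{\,t}} / (1 - \beta_1^{\,t})
\theta_t = \theta_{t-1} - lr_t \cdot m_t / (\sqrt{v_t} + \epsilon)
```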
class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place):
def setup(self, scope, place, lazy_mode):
beta1 = 0.78
beta2 = 0.836
epsilon = 1e-4
......@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
'Beta2Pow': np.array([beta2**10]).astype("float32"),
"LearningRate": np.full((1), 2.0).astype("float32")
}
self.init_output = np.full((height, row_numel), 0.0).astype("float32")
self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
grad_selected_rows = scope.var('Grad').get_selected_rows()
......@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse(
self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
height, rows, row_numel,
np_array, lazy_mode)
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2
}
def check_with_place(self, place):
def check_with_place(self, place, lazy_mode):
scope = core.Scope()
self.setup(scope, place)
self.setup(scope, place, lazy_mode)
op_args = dict()
op_args['lazy_mode'] = lazy_mode
for key, np_array in self.dense_inputs.items():
var = scope.var(key).get_tensor()
var.set(np_array, place)
......@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
op_args[s] = s
for s in self.outputs:
var = scope.var(s).get_tensor()
var.set(self.outputs[s], place)
var.set(self.init_output, place)
op_args[s] = s
for k in self.attrs:
op_args[k] = self.attrs[k]
......@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
actual = np.array(out_var)
actual = actual.reshape([actual.size])
np_array = np_array.reshape([np_array.size])
for idx, row_id in enumerate(self.rows):
j = 0
while j < self.row_numel:
pos = row_id * self.row_numel + j
self.assertLess((actual[pos] - np_array[pos]) / actual[pos],
0.00001)
j += 1
def test_sparse_sgd(self):
for i in range(np_array.size):
self.assertLess((actual[i] - np_array[i]), 0.00001)
def test_sparse_adam(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
for lazy_mode in (True, False):
self.check_with_place(place, lazy_mode)
if __name__ == "__main__":
......