diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index d1171c4d48d0184f0cbdabd5d8dd0ea98a6818be..07c6b964aa94b3cb11e9a26f2ca1d9ab75af6abe 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -367,7 +367,7 @@ paddle.fluid.optimizer.MomentumOptimizer.__init__ ArgSpec(args=['self', 'learnin
 paddle.fluid.optimizer.MomentumOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(1e-06, None, None))
 paddle.fluid.optimizer.AdagradOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
-paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
+paddle.fluid.optimizer.AdamOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name', 'lazy_mode'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None, False))
 paddle.fluid.optimizer.AdamOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.__init__ ArgSpec(args=['self', 'learning_rate', 'beta1', 'beta2', 'epsilon', 'regularization', 'name'], varargs=None, keywords=None, defaults=(0.001, 0.9, 0.999, 1e-08, None, None))
 paddle.fluid.optimizer.AdamaxOptimizer.minimize ArgSpec(args=['self', 'loss', 'startup_program', 'parameter_list', 'no_grad_set'], varargs=None, keywords=None, defaults=(None, None, None))
diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 5eae503461f5743a1a3b89e2df98846859bfbfdb..e9c395a9314180960da2b9b0f996fce5d62b14ba 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -109,6 +109,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                    "(float, default 1.0e-8) "
                    "Constant for numerical stability")
         .SetDefault(1.0e-8f);
+    AddAttr<bool>(
+        "lazy_mode",
+        "(bool, default false) "
+        "only update the rows of the parameter that have a gradient in the sparse update")
+        .SetDefault(false);
     AddComment(R"DOC(
 Adam Optimizer.
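What the new attribute changes, shown as a minimal Python sketch rather than the operator itself (the function and variable names here are illustrative, not Paddle APIs): with lazy_mode=False a sparse update still writes every row of the parameter, using g = 0 for rows absent from the gradient, while lazy_mode=True touches only the rows present in the sparse gradient.

# Hedged sketch of which parameter rows one sparse Adam step updates.
def rows_updated(param_rows, grad_rows, lazy_mode):
    if lazy_mode:
        return sorted(set(grad_rows))  # only rows that received a gradient
    return list(range(param_rows))     # every row; g = 0 where no gradient

print(rows_updated(6, [1, 4], lazy_mode=True))   # [1, 4]
print(rows_updated(6, [1, 4], lazy_mode=False))  # [0, 1, 2, 3, 4, 5]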
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 5ba5639fd51a639e878e650585b8e7e29317b3c1..c019f19cf08f8893888722cfaf101fa6ffada61b 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -178,12 +178,13 @@ struct SparseAdamFunctor {
   const int64_t* rows_;
   int64_t row_numel_;
   int64_t row_count_;
+  bool lazy_mode_;
 
   SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                     const T* beta2_pow, const T* mom1, T* mom1_out,
                     const T* mom2, T* mom2_out, const T* lr, const T* grad,
                     const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel, int64_t row_count)
+                    int64_t row_numel, int64_t row_count, bool lazy_mode)
       : beta1_(beta1),
         beta2_(beta2),
         epsilon_(epsilon),
@@ -199,13 +200,10 @@ struct SparseAdamFunctor {
         param_out_(param_out),
         rows_(rows),
         row_numel_(row_numel),
-        row_count_(row_count) {}
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
-    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+        row_count_(row_count),
+        lazy_mode_(lazy_mode) {}
+  inline HOSTDEVICE void adam_update(size_t i, T g) const {
     // The following code is the same as dense
     T mom1 = moment1_[i];
     T mom2 = moment2_[i];
     T lr = *lr_;
@@ -226,6 +224,17 @@ struct SparseAdamFunctor {
     moment2_out_[i] = mom2;
     param_out_[i] = p;
   }
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
+    if (lazy_mode_ && row_idx < 0) {
+      return;
+    } else {
+      T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+      adam_update(i, g);
+    }
+  }
 };
 
 template <typename DeviceContext, typename T>
@@ -241,6 +250,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
     using paddle::framework::LoDTensor;
     using paddle::operators::detail::Ref;
 
+    bool lazy_mode = ctx.Attr<bool>("lazy_mode");
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
@@ -352,17 +362,27 @@ class AdamOpKernel : public framework::OpKernel<T> {
             mom2_out.template mutable_data<T>(ctx.GetPlace()),
             lr.template data<T>(), grad_data, param.template data<T>(),
             param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-            grad_merge.rows().size());
-        int inner_op_parallelism = FLAGS_inner_op_parallelism;
-        if (inner_op_parallelism > 1 &&
-            FLAGS_min_param_size_to_use_multithread > 0 &&
-            param.numel() > FLAGS_min_param_size_to_use_multithread) {
+            grad_merge.rows().size(), lazy_mode);
+        VLOG(3) << "lazy_mode :" << lazy_mode;
+        if (lazy_mode && platform::is_cpu_place(ctx.GetPlace())) {
+          size_t row_count = grad_merge.rows().size();
+          std::vector<int64_t> cpu_rows(grad_merge.rows());
+          for (size_t row_index = 0; row_index < row_count; ++row_index) {
+            for (size_t offset = 0; offset < row_numel; ++offset) {
+              size_t i = cpu_rows[row_index] * row_numel + offset;
+              functor.adam_update(i, grad_data[row_index * row_numel + offset]);
+            }
+          }
+        } else if (FLAGS_inner_op_parallelism > 1 &&
+                   FLAGS_min_param_size_to_use_multithread > 0 &&
+                   param.numel() > FLAGS_min_param_size_to_use_multithread) {
           VLOG(3) << "use multi thread, inner_op_parallelism="
-                  << inner_op_parallelism << " min_param_size_to_use_multithread="
+                  << FLAGS_inner_op_parallelism
+                  << " min_param_size_to_use_multithread="
                   << FLAGS_min_param_size_to_use_multithread;
           std::vector<std::future<void>> fs;
-          int64_t block_size = param.numel() / inner_op_parallelism;
-          for (int i = 0; i < inner_op_parallelism; ++i) {
+          int64_t block_size = param.numel() / FLAGS_inner_op_parallelism;
+          for (int i = 0; i < FLAGS_inner_op_parallelism; ++i) {
             int64_t start = i * block_size;
             int64_t end = (i + 1) * block_size;
             if (end > param.numel()) {
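The CPU fast path above avoids the per-element binary search entirely: when lazy_mode is set and the op runs on a CPU place, the kernel walks only the merged gradient rows and calls adam_update on those offsets. A hedged Python mirror of that double loop (names follow the kernel; adam_update stands in for the functor method):

def lazy_cpu_update(rows, grad, row_numel, adam_update):
    # rows: row ids of the merged sparse gradient (cpu_rows in the kernel)
    # grad: merged gradient values, one sequence of row_numel floats per row
    for row_index, row_id in enumerate(rows):
        for offset in range(row_numel):
            i = row_id * row_numel + offset          # flat index into param
            adam_update(i, grad[row_index][offset])  # same math as dense Adam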
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index da92826d410505c9a80820f655162dd22e6b5966..59c22d4e498814d468c78b10265b7afe35461dfb 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -641,9 +641,14 @@ class AdamOptimizer(Optimizer):
         beta1 (float): The exponential decay rate for the 1st moment estimates.
         beta2 (float): The exponential decay rate for the 2nd moment estimates.
         epsilon (float): a small float value for numerical stability.
-        regularization: A Regularizer, such as
-            fluid.regularizer.L2DecayRegularizer.
+        regularization: A Regularizer, such as fluid.regularizer.L2DecayRegularizer.
         name: A optional name prefix.
+        lazy_mode(bool, default False): The official Adam algorithm keeps two moving-average
+            accumulators, and every element of both accumulators is updated at every step,
+            in both dense mode and sparse mode. If the parameter is very large, this update
+            can be very slow. Lazy mode updates only the elements that have a gradient in
+            the current mini-batch, which is much faster. Note that lazy mode differs in
+            semantics from the original Adam algorithm and may lead to different results.
 
     Examples:
         .. code-block:: python
@@ -663,7 +668,8 @@ class AdamOptimizer(Optimizer):
                  beta2=0.999,
                  epsilon=1e-8,
                  regularization=None,
-                 name=None):
+                 name=None,
+                 lazy_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -676,6 +682,7 @@ class AdamOptimizer(Optimizer):
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
+        self._lazy_mode = lazy_mode
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -729,7 +736,8 @@ class AdamOptimizer(Optimizer):
             attrs={
                 "beta1": self._beta1,
                 "beta2": self._beta2,
-                "epsilon": self._epsilon
+                "epsilon": self._epsilon,
+                "lazy_mode": self._lazy_mode
             })
 
         return adam_op
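A hedged usage sketch for the new argument (the surrounding network and loss definition are omitted as hypothetical; only the optimizer construction reflects the API added here):

import paddle.fluid as fluid

optimizer = fluid.optimizer.AdamOptimizer(
    learning_rate=0.001,
    lazy_mode=True)  # update only rows that receive a sparse gradient
# optimizer.minimize(loss)  # `loss` would come from a real network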
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 5318d2f9766ce671925be614feef57d679270b19..ff7fc5100ebaf12655d5963c600bbd5058720349 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
     return param_out, moment1_out, moment2_out
 
 
-def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
+                     lazy_mode):
     '''
     Simulate one step of the adam optimizer
     :param inputs: dict of inputs
@@ -218,19 +219,30 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
     moment2_out = np.zeros(shape=[height, row_numel])
     param_out = np.zeros(shape=[height, row_numel])
 
-    for idx, row_id in enumerate(rows):
+    def update_row(row_id, update_value):
         moment1_out[row_id] = beta1 * moment1[row_id] + (1 - beta1
-                                                         ) * np_grad[idx]
+                                                         ) * update_value
         moment2_out[row_id] = beta2 * moment2[row_id] + (
-            1 - beta2) * np.square(np_grad[idx])
+            1 - beta2) * np.square(update_value)
         lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
         param_out[row_id] = param[row_id] - lr_t * (moment1_out[row_id] / (
             np.sqrt(moment2_out[row_id]) + epsilon))
+
+    if lazy_mode:
+        for idx, row_id in enumerate(rows):
+            update_row(row_id, np_grad[idx])
+    else:
+        for row_id in range(param_out.shape[0]):
+            update_value = np.zeros(np_grad[0].shape).astype("float32")
+            if row_id in rows:
+                update_value = np_grad[rows.index(row_id)]
+            update_row(row_id, update_value)
+
     return param_out, moment1_out, moment2_out
 
 
 class TestSparseAdamOp(unittest.TestCase):
-    def setup(self, scope, place):
+    def setup(self, scope, place, lazy_mode):
         beta1 = 0.78
         beta2 = 0.836
         epsilon = 1e-4
@@ -248,6 +260,7 @@ class TestSparseAdamOp(unittest.TestCase):
             'Beta2Pow': np.array([beta2**10]).astype("float32"),
             "LearningRate": np.full((1), 2.0).astype("float32")
         }
+        self.init_output = np.full((height, row_numel), 0.0).astype("float32")
         self.attrs = {'epsilon': epsilon, 'beta1': beta1, 'beta2': beta2}
 
         grad_selected_rows = scope.var('Grad').get_selected_rows()
@@ -262,19 +275,21 @@ class TestSparseAdamOp(unittest.TestCase):
 
         self.sparse_inputs = ["Grad"]
 
-        param_out, mom1, mom2 = adam_step_sparse(
-            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
+                                                 height, rows, row_numel,
+                                                 np_array, lazy_mode)
         self.outputs = {
             "ParamOut": param_out,
             "Moment1Out": mom1,
             "Moment2Out": mom2
         }
 
-    def check_with_place(self, place):
+    def check_with_place(self, place, lazy_mode):
         scope = core.Scope()
-        self.setup(scope, place)
+        self.setup(scope, place, lazy_mode)
 
         op_args = dict()
+        op_args['lazy_mode'] = lazy_mode
         for key, np_array in self.dense_inputs.items():
             var = scope.var(key).get_tensor()
             var.set(np_array, place)
@@ -283,7 +298,7 @@ class TestSparseAdamOp(unittest.TestCase):
             op_args[s] = s
         for s in self.outputs:
             var = scope.var(s).get_tensor()
-            var.set(self.outputs[s], place)
+            var.set(self.init_output, place)
             op_args[s] = s
         for k in self.attrs:
             op_args[k] = self.attrs[k]
@@ -297,20 +312,17 @@ class TestSparseAdamOp(unittest.TestCase):
             actual = np.array(out_var)
             actual = actual.reshape([actual.size])
             np_array = np_array.reshape([np_array.size])
-            for idx, row_id in enumerate(self.rows):
-                j = 0
-                while j < self.row_numel:
-                    pos = row_id * self.row_numel + j
-                    self.assertLess((actual[pos] - np_array[pos]) / actual[pos],
-                                    0.00001)
-                    j += 1
-
-    def test_sparse_sgd(self):
+
+            for i in range(np_array.size):
+                self.assertLess((actual[i] - np_array[i]), 0.00001)
+
+    def test_sparse_adam(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
-            self.check_with_place(place)
+            for lazy_mode in (True, False):
+                self.check_with_place(place, lazy_mode)
 
 
 if __name__ == "__main__":
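For reference, the per-row step that update_row applies in the test above is the standard Adam update; a standalone, self-contained version of the same formulas (operating on one row vector, names chosen to match the test):

import numpy as np

def adam_row_step(p, m1, m2, g, lr, beta1, beta2, beta1_pow, beta2_pow,
                  epsilon):
    # Identical math to update_row: moment updates, bias-corrected lr, step.
    m1 = beta1 * m1 + (1 - beta1) * g
    m2 = beta2 * m2 + (1 - beta2) * np.square(g)
    lr_t = lr * np.sqrt(1 - beta2_pow) / (1 - beta1_pow)
    p = p - lr_t * (m1 / (np.sqrt(m2) + epsilon))
    return p, m1, m2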