diff --git a/paddle/fluid/operators/optimizers/adam_op.cc b/paddle/fluid/operators/optimizers/adam_op.cc
index 5710cda39acce53e35dfceec675fcd4979a84e31..b2c2e5c325451303286acb989d24d261eed7dc4b 100644
--- a/paddle/fluid/operators/optimizers/adam_op.cc
+++ b/paddle/fluid/operators/optimizers/adam_op.cc
@@ -110,6 +110,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
                    "(float, default 1.0e-8) "
                    "Constant for numerical stability")
         .SetDefault(1.0e-8f);
+    AddAttr<bool>(
+        "sparse_mode",
+        "(bool, default false) "
+        "only update the parameter that has gradient in sparse update")
+        .SetDefault(false);
     AddComment(R"DOC(
 Adam Optimizer.
 
diff --git a/paddle/fluid/operators/optimizers/adam_op.h b/paddle/fluid/operators/optimizers/adam_op.h
index 3455d1ee54e8e6e498d0b0e6932ec099af9c0b30..ca5454ef040ba2515c00309d2c5df8c48e2b6e6d 100644
--- a/paddle/fluid/operators/optimizers/adam_op.h
+++ b/paddle/fluid/operators/optimizers/adam_op.h
@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
   const int64_t* rows_;
   int64_t row_numel_;
   int64_t row_count_;
+  bool sparse_mode_;
 
   SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
                     const T* beta2_pow, const T* mom1, T* mom1_out,
                     const T* mom2, T* mom2_out, const T* lr, const T* grad,
                     const T* param, T* param_out, const int64_t* rows,
-                    int64_t row_numel, int64_t row_count)
+                    int64_t row_numel, int64_t row_count, bool sparse_mode)
       : beta1_(beta1),
         beta2_(beta2),
         epsilon_(epsilon),
@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
         param_out_(param_out),
         rows_(rows),
         row_numel_(row_numel),
-        row_count_(row_count) {}
-
-  inline HOSTDEVICE void operator()(size_t i) const {
-    auto row_idx =
-        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
-    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+        row_count_(row_count),
+        sparse_mode_(sparse_mode) {}
+  inline HOSTDEVICE void sparse_update(size_t i, T g) const {
     // The following code is the same as dense
     T mom1 = moment1_[i];
     T mom2 = moment2_[i];
@@ -225,6 +223,13 @@ struct SparseAdamFunctor {
     moment2_out_[i] = mom2;
     param_out_[i] = p;
   }
+
+  inline HOSTDEVICE void operator()(size_t i) const {
+    auto row_idx =
+        math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
+    T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
+    sparse_update(i, g);
+  }
 };
 
 template <typename DeviceContext, typename T>
@@ -240,6 +245,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
     using paddle::framework::LoDTensor;
     using paddle::operators::detail::Ref;
 
+    bool sparse_mode = ctx.Attr<bool>("sparse_mode");
     T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
     T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
     T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
@@ -351,11 +357,22 @@ class AdamOpKernel : public framework::OpKernel<T> {
           mom2_out.template mutable_data<T>(ctx.GetPlace()),
           lr.template data<T>(), grad_data, param.template data<T>(),
           param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
-          grad_merge.rows().size());
-      platform::ForRange<DeviceContext> for_range(
-          static_cast<const DeviceContext&>(ctx.device_context()),
-          param.numel());
-      for_range(functor);
+          grad_merge.rows().size(), sparse_mode);
+      if (sparse_mode) {
+        size_t row_count = grad_merge.rows().size();
+        for (size_t row_index = 0; row_index < row_count; ++row_index) {
+          for (size_t offset = 0; offset < row_numel; ++offset) {
+            size_t i = rows[row_index] * row_numel + offset;
+            T g = grad_data[row_index * row_numel + offset];
+            functor.sparse_update(i, g);
+          }
+        }
+      } else {
+        platform::ForRange<DeviceContext> for_range(
+            static_cast<const DeviceContext&>(ctx.device_context()),
+            param.numel());
+        for_range(functor);
+      }
     } else {
       PADDLE_THROW("Variable type not supported by adam_op");
     }
diff --git a/python/paddle/fluid/optimizer.py b/python/paddle/fluid/optimizer.py
index da92826d410505c9a80820f655162dd22e6b5966..9c7482bc40da7c55b8e5be3753e283c8618d055a 100644
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -663,7 +663,8 @@ class AdamOptimizer(Optimizer):
                  beta2=0.999,
                  epsilon=1e-8,
                  regularization=None,
-                 name=None):
+                 name=None,
+                 sparse_mode=False):
         assert learning_rate is not None
         assert beta1 is not None
         assert beta2 is not None
@@ -676,6 +677,7 @@ class AdamOptimizer(Optimizer):
         self._beta1 = beta1
         self._beta2 = beta2
         self._epsilon = epsilon
+        self._sparse_mode = sparse_mode
 
     def _create_accumulators(self, block, parameters):
         assert isinstance(block, framework.Block)
@@ -729,7 +731,8 @@ class AdamOptimizer(Optimizer):
             attrs={
                 "beta1": self._beta1,
                 "beta2": self._beta2,
-                "epsilon": self._epsilon
+                "epsilon": self._epsilon,
+                "sparse_mode": self._sparse_mode
             })
 
         return adam_op
diff --git a/python/paddle/fluid/tests/unittests/test_adam_op.py b/python/paddle/fluid/tests/unittests/test_adam_op.py
index 5318d2f9766ce671925be614feef57d679270b19..da91875a145666127f7516cc2328404aa55b0285 100644
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
     return param_out, moment1_out, moment2_out
 
 
-def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
+def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
+                     sparse_mode):
     '''
     Simulate one step of the adam optimizer
    :param inputs: dict of inputs
@@ -230,7 +231,7 @@
 
 
 class TestSparseAdamOp(unittest.TestCase):
-    def setup(self, scope, place):
+    def setup(self, scope, place, sparse_mode):
         beta1 = 0.78
         beta2 = 0.836
         epsilon = 1e-4
@@ -262,19 +263,21 @@ class TestSparseAdamOp(unittest.TestCase):
 
         self.sparse_inputs = ["Grad"]
 
-        param_out, mom1, mom2 = adam_step_sparse(
-            self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
+        param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
+                                                 height, rows, row_numel,
+                                                 np_array, sparse_mode)
         self.outputs = {
             "ParamOut": param_out,
             "Moment1Out": mom1,
             "Moment2Out": mom2
         }
 
-    def check_with_place(self, place):
+    def check_with_place(self, place, sparse_mode):
         scope = core.Scope()
-        self.setup(scope, place)
+        self.setup(scope, place, sparse_mode)
 
         op_args = dict()
+        op_args['sparse_mode'] = sparse_mode
         for key, np_array in self.dense_inputs.items():
             var = scope.var(key).get_tensor()
             var.set(np_array, place)
@@ -305,12 +308,13 @@ class TestSparseAdamOp(unittest.TestCase):
                                 0.00001)
                 j += 1
 
-    def test_sparse_sgd(self):
+    def test_sparse_adam(self):
         places = [core.CPUPlace()]
         if core.is_compiled_with_cuda():
             places.append(core.CUDAPlace(0))
         for place in places:
-            self.check_with_place(place)
+            for sparse_mode in (True, False):
+                self.check_with_place(place, sparse_mode)
 
 
 if __name__ == "__main__":
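
Usage note: below is a minimal sketch of how the new flag would be switched on from the Python API, assuming the AdamOptimizer signature added in this patch. The tiny embedding network, layer sizes, and variable names are illustrative only and are not part of the patch; an embedding layer with is_sparse=True is used because it produces a SelectedRows gradient, which is the case sparse_mode targets.

import numpy as np
import paddle.fluid as fluid

# Toy network whose embedding table receives a SelectedRows (sparse) gradient.
ids = fluid.layers.data(name='ids', shape=[1], dtype='int64')
emb = fluid.layers.embedding(input=ids, size=[100, 8], is_sparse=True)
loss = fluid.layers.reduce_mean(emb)

# sparse_mode=True: only the parameter rows that actually received a gradient
# in this step are updated by adam_op.
optimizer = fluid.optimizer.AdamOptimizer(learning_rate=0.001, sparse_mode=True)
optimizer.minimize(loss)

place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

feed_ids = np.array([[3], [7], [7]], dtype='int64')
loss_val, = exe.run(feed={'ids': feed_ids}, fetch_list=[loss])
print(loss_val)

With sparse_mode=True the kernel walks only the merged rows of the SelectedRows gradient (the if (sparse_mode) loop in adam_op.h) instead of launching ForRange over param.numel(), so rows that received no gradient keep their previous parameter values and moment estimates. This deviates slightly from the dense Adam update, where untouched rows are still updated with a zero gradient and their moments decay, which is why the attribute defaults to false.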