提交 fc6ec6bd 编写于 作者: Q Qiao Longfei

Add sparse mode to the Adam optimizer

上级 66b6e473
...@@ -110,6 +110,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker { ...@@ -110,6 +110,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, default 1.0e-8) " "(float, default 1.0e-8) "
"Constant for numerical stability") "Constant for numerical stability")
.SetDefault(1.0e-8f); .SetDefault(1.0e-8f);
AddAttr<bool>(
"sparse_mode",
"(bool, default false) "
"only update the parameter that has gradient in sparse update")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Adam Optimizer. Adam Optimizer.
......
...@@ -177,12 +177,13 @@ struct SparseAdamFunctor { ...@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
const int64_t* rows_; const int64_t* rows_;
int64_t row_numel_; int64_t row_numel_;
int64_t row_count_; int64_t row_count_;
bool sparse_mode_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow, SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out, const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad, const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows, const T* param, T* param_out, const int64_t* rows,
int64_t row_numel, int64_t row_count) int64_t row_numel, int64_t row_count, bool sparse_mode)
: beta1_(beta1), : beta1_(beta1),
beta2_(beta2), beta2_(beta2),
epsilon_(epsilon), epsilon_(epsilon),
...@@ -198,13 +199,10 @@ struct SparseAdamFunctor { ...@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
param_out_(param_out), param_out_(param_out),
rows_(rows), rows_(rows),
row_numel_(row_numel), row_numel_(row_numel),
row_count_(row_count) {} row_count_(row_count),
sparse_mode_(sparse_mode) {}
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
inline HOSTDEVICE void sparse_update(size_t i, T g) const {
// The following code is the same as dense // The following code is the same as dense
T mom1 = moment1_[i]; T mom1 = moment1_[i];
T mom2 = moment2_[i]; T mom2 = moment2_[i];
...@@ -225,6 +223,13 @@ struct SparseAdamFunctor { ...@@ -225,6 +223,13 @@ struct SparseAdamFunctor {
moment2_out_[i] = mom2; moment2_out_[i] = mom2;
param_out_[i] = p; param_out_[i] = p;
} }
// Per-element entry point: `i` is a flat element index into the
// parameter/moment buffers. The row that contains `i` is looked up in rows_;
// if it is present, the matching gradient element is used, otherwise the
// gradient is taken as 0. The shared update step is then applied to `i`.
inline HOSTDEVICE void operator()(size_t i) const {
  const auto target_row = i / row_numel_;
  const auto col = i % row_numel_;
  // A negative search result is treated as "row not present in rows_".
  const auto found =
      math::BinarySearch<int64_t>(rows_, row_count_, target_row);
  const T g = found >= 0 ? grad_[found * row_numel_ + col] : 0;
  sparse_update(i, g);
}
}; };
template <typename DeviceContext, typename T> template <typename DeviceContext, typename T>
...@@ -240,6 +245,7 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -240,6 +245,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
using paddle::framework::LoDTensor; using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref; using paddle::operators::detail::Ref;
bool sparse_mode = ctx.Attr<bool>("sparse_mode");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1")); T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2")); T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon")); T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
...@@ -351,11 +357,22 @@ class AdamOpKernel : public framework::OpKernel<T> { ...@@ -351,11 +357,22 @@ class AdamOpKernel : public framework::OpKernel<T> {
mom2_out.template mutable_data<T>(ctx.GetPlace()), mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(), lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel, param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size()); grad_merge.rows().size(), sparse_mode);
platform::ForRange<DeviceContext> for_range( if (sparse_mode) {
static_cast<const DeviceContext&>(ctx.device_context()), size_t row_count = grad_merge.rows().size();
param.numel()); for (size_t row_index = 0; row_index < row_count; ++row_index) {
for_range(functor); for (size_t offset = 0; offset < row_numel; ++offset) {
size_t i = rows[row_index] * row_numel + offset;
T g = grad_data[row_index * row_numel + offset];
functor.sparse_update(i, g);
}
}
} else {
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
}
} else { } else {
PADDLE_THROW("Variable type not supported by adam_op"); PADDLE_THROW("Variable type not supported by adam_op");
} }
......
...@@ -663,7 +663,8 @@ class AdamOptimizer(Optimizer): ...@@ -663,7 +663,8 @@ class AdamOptimizer(Optimizer):
beta2=0.999, beta2=0.999,
epsilon=1e-8, epsilon=1e-8,
regularization=None, regularization=None,
name=None): name=None,
sparse_mode=False):
assert learning_rate is not None assert learning_rate is not None
assert beta1 is not None assert beta1 is not None
assert beta2 is not None assert beta2 is not None
...@@ -676,6 +677,7 @@ class AdamOptimizer(Optimizer): ...@@ -676,6 +677,7 @@ class AdamOptimizer(Optimizer):
self._beta1 = beta1 self._beta1 = beta1
self._beta2 = beta2 self._beta2 = beta2
self._epsilon = epsilon self._epsilon = epsilon
self._sparse_mode = sparse_mode
def _create_accumulators(self, block, parameters): def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block) assert isinstance(block, framework.Block)
...@@ -729,7 +731,8 @@ class AdamOptimizer(Optimizer): ...@@ -729,7 +731,8 @@ class AdamOptimizer(Optimizer):
attrs={ attrs={
"beta1": self._beta1, "beta1": self._beta1,
"beta2": self._beta2, "beta2": self._beta2,
"epsilon": self._epsilon "epsilon": self._epsilon,
"sparse_mode": self._sparse_mode
}) })
return adam_op return adam_op
......
...@@ -194,7 +194,8 @@ def adam_step(inputs, attributes): ...@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
return param_out, moment1_out, moment2_out return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
sparse_mode):
''' '''
Simulate one step of the adam optimizer Simulate one step of the adam optimizer
:param inputs: dict of inputs :param inputs: dict of inputs
...@@ -230,7 +231,7 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad): ...@@ -230,7 +231,7 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
class TestSparseAdamOp(unittest.TestCase): class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place): def setup(self, scope, place, sparse_mode):
beta1 = 0.78 beta1 = 0.78
beta2 = 0.836 beta2 = 0.836
epsilon = 1e-4 epsilon = 1e-4
...@@ -262,19 +263,21 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -262,19 +263,21 @@ class TestSparseAdamOp(unittest.TestCase):
self.sparse_inputs = ["Grad"] self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse( param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
self.dense_inputs, self.attrs, height, rows, row_numel, np_array) height, rows, row_numel,
np_array, sparse_mode)
self.outputs = { self.outputs = {
"ParamOut": param_out, "ParamOut": param_out,
"Moment1Out": mom1, "Moment1Out": mom1,
"Moment2Out": mom2 "Moment2Out": mom2
} }
def check_with_place(self, place): def check_with_place(self, place, sparse_mode):
scope = core.Scope() scope = core.Scope()
self.setup(scope, place) self.setup(scope, place, sparse_mode)
op_args = dict() op_args = dict()
op_args['sparse_mode'] = sparse_mode
for key, np_array in self.dense_inputs.items(): for key, np_array in self.dense_inputs.items():
var = scope.var(key).get_tensor() var = scope.var(key).get_tensor()
var.set(np_array, place) var.set(np_array, place)
...@@ -305,12 +308,13 @@ class TestSparseAdamOp(unittest.TestCase): ...@@ -305,12 +308,13 @@ class TestSparseAdamOp(unittest.TestCase):
0.00001) 0.00001)
j += 1 j += 1
def test_sparse_sgd(self): def test_sparse_adam(self):
places = [core.CPUPlace()] places = [core.CPUPlace()]
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0)) places.append(core.CUDAPlace(0))
for place in places: for place in places:
self.check_with_place(place) for sparse_mode in (True, False):
self.check_with_place(place, sparse_mode)
if __name__ == "__main__": if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册