提交 fc6ec6bd 编写于 作者: Q Qiao Longfei

Add sparse-mode support to the Adam optimizer

上级 66b6e473
......@@ -110,6 +110,11 @@ class AdamOpMaker : public framework::OpProtoAndCheckerMaker {
"(float, default 1.0e-8) "
"Constant for numerical stability")
.SetDefault(1.0e-8f);
AddAttr<bool>(
"sparse_mode",
"(bool, default false) "
"only update the parameter that has gradient in sparse update")
.SetDefault(false);
AddComment(R"DOC(
Adam Optimizer.
......
......@@ -177,12 +177,13 @@ struct SparseAdamFunctor {
const int64_t* rows_;
int64_t row_numel_;
int64_t row_count_;
bool sparse_mode_;
SparseAdamFunctor(T beta1, T beta2, T epsilon, const T* beta1_pow,
const T* beta2_pow, const T* mom1, T* mom1_out,
const T* mom2, T* mom2_out, const T* lr, const T* grad,
const T* param, T* param_out, const int64_t* rows,
int64_t row_numel, int64_t row_count)
int64_t row_numel, int64_t row_count, bool sparse_mode)
: beta1_(beta1),
beta2_(beta2),
epsilon_(epsilon),
......@@ -198,13 +199,10 @@ struct SparseAdamFunctor {
param_out_(param_out),
rows_(rows),
row_numel_(row_numel),
row_count_(row_count) {}
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
row_count_(row_count),
sparse_mode_(sparse_mode) {}
inline HOSTDEVICE void sparse_update(size_t i, T g) const {
// The following code is the same as dense
T mom1 = moment1_[i];
T mom2 = moment2_[i];
......@@ -225,6 +223,13 @@ struct SparseAdamFunctor {
moment2_out_[i] = mom2;
param_out_[i] = p;
}
// Dense-iteration entry point: invoked once per element index i of the
// full parameter tensor (ForRange over param.numel()). Binary-searches
// rows_ (row ids of the merged sparse gradient; BinarySearch implies
// they are kept sorted) for the row containing i. Rows absent from the
// sparse gradient get g = 0, so sparse_update still applies the Adam
// moment decay / parameter update to untouched rows.
// NOTE(review): sparse_mode_ is not consulted here — when sparse_mode
// is enabled the kernel bypasses this operator and calls
// sparse_update() directly for only the rows that have gradients.
inline HOSTDEVICE void operator()(size_t i) const {
auto row_idx =
math::BinarySearch<int64_t>(rows_, row_count_, i / row_numel_);
T g = row_idx >= 0 ? grad_[row_idx * row_numel_ + i % row_numel_] : 0;
sparse_update(i, g);
}
};
template <typename DeviceContext, typename T>
......@@ -240,6 +245,7 @@ class AdamOpKernel : public framework::OpKernel<T> {
using paddle::framework::LoDTensor;
using paddle::operators::detail::Ref;
bool sparse_mode = ctx.Attr<bool>("sparse_mode");
T beta1 = static_cast<T>(ctx.Attr<float>("beta1"));
T beta2 = static_cast<T>(ctx.Attr<float>("beta2"));
T epsilon = static_cast<T>(ctx.Attr<float>("epsilon"));
......@@ -351,11 +357,22 @@ class AdamOpKernel : public framework::OpKernel<T> {
mom2_out.template mutable_data<T>(ctx.GetPlace()),
lr.template data<T>(), grad_data, param.template data<T>(),
param_out.template mutable_data<T>(ctx.GetPlace()), rows, row_numel,
grad_merge.rows().size());
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
grad_merge.rows().size(), sparse_mode);
if (sparse_mode) {
size_t row_count = grad_merge.rows().size();
for (size_t row_index = 0; row_index < row_count; ++row_index) {
for (size_t offset = 0; offset < row_numel; ++offset) {
size_t i = rows[row_index] * row_numel + offset;
T g = grad_data[row_index * row_numel + offset];
functor.sparse_update(i, g);
}
}
} else {
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
param.numel());
for_range(functor);
}
} else {
PADDLE_THROW("Variable type not supported by adam_op");
}
......
......@@ -663,7 +663,8 @@ class AdamOptimizer(Optimizer):
beta2=0.999,
epsilon=1e-8,
regularization=None,
name=None):
name=None,
sparse_mode=False):
assert learning_rate is not None
assert beta1 is not None
assert beta2 is not None
......@@ -676,6 +677,7 @@ class AdamOptimizer(Optimizer):
self._beta1 = beta1
self._beta2 = beta2
self._epsilon = epsilon
self._sparse_mode = sparse_mode
def _create_accumulators(self, block, parameters):
assert isinstance(block, framework.Block)
......@@ -729,7 +731,8 @@ class AdamOptimizer(Optimizer):
attrs={
"beta1": self._beta1,
"beta2": self._beta2,
"epsilon": self._epsilon
"epsilon": self._epsilon,
"sparse_mode": self._sparse_mode
})
return adam_op
......
......@@ -194,7 +194,8 @@ def adam_step(inputs, attributes):
return param_out, moment1_out, moment2_out
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad,
sparse_mode):
'''
Simulate one step of the adam optimizer
:param inputs: dict of inputs
......@@ -230,7 +231,7 @@ def adam_step_sparse(inputs, attributes, height, rows, row_numel, np_grad):
class TestSparseAdamOp(unittest.TestCase):
def setup(self, scope, place):
def setup(self, scope, place, sparse_mode):
beta1 = 0.78
beta2 = 0.836
epsilon = 1e-4
......@@ -262,19 +263,21 @@ class TestSparseAdamOp(unittest.TestCase):
self.sparse_inputs = ["Grad"]
param_out, mom1, mom2 = adam_step_sparse(
self.dense_inputs, self.attrs, height, rows, row_numel, np_array)
param_out, mom1, mom2 = adam_step_sparse(self.dense_inputs, self.attrs,
height, rows, row_numel,
np_array, sparse_mode)
self.outputs = {
"ParamOut": param_out,
"Moment1Out": mom1,
"Moment2Out": mom2
}
def check_with_place(self, place):
def check_with_place(self, place, sparse_mode):
scope = core.Scope()
self.setup(scope, place)
self.setup(scope, place, sparse_mode)
op_args = dict()
op_args['sparse_mode'] = sparse_mode
for key, np_array in self.dense_inputs.items():
var = scope.var(key).get_tensor()
var.set(np_array, place)
......@@ -305,12 +308,13 @@ class TestSparseAdamOp(unittest.TestCase):
0.00001)
j += 1
def test_sparse_adam(self):
    """Exercise the sparse Adam op on every available device.

    Runs check_with_place for each place (CPU, plus CUDA when the
    build supports it) and for both values of sparse_mode, so the
    sparse-only update path and the dense fallback are both covered.
    """
    places = [core.CPUPlace()]
    if core.is_compiled_with_cuda():
        places.append(core.CUDAPlace(0))
    for place in places:
        # Cover both the row-wise sparse update and the dense path.
        for sparse_mode in (True, False):
            self.check_with_place(place, sparse_mode)
if __name__ == "__main__":
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册