Unverified commit a6beb96d, authored by leesusu, committed by GitHub

FTRL with sparse update, test=develop (#22092)

Parent 6aae034f
......@@ -32,20 +32,6 @@ class FTRLOp : public framework::OperatorWithKernel {
OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "FTRL");
OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate",
"FTRL");
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Param").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Param").front(),
-            ctx->GetInputsVarType("Param").front()));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Grad").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Grad").front(),
-            ctx->GetInputsVarType("Grad").front()));
OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "FTRL");
OP_INOUT_CHECK(ctx->HasOutput("SquaredAccumOut"), "Output",
......
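The checks removed above pinned both Param and Grad to LoDTensor during shape inference; with this commit Grad may also arrive as a SelectedRows, so the type check moves into the kernel in ftrl_op.h, which dispatches on the gradient's variable type. The sketch below is illustrative only (not part of the commit) and shows the two gradient forms using the same scope/variable API as the new unit test at the end of this diff; the variable names are arbitrary.

```python
# Illustrative sketch (not part of this commit): the two gradient variable
# types the ftrl op now accepts. API calls mirror the unit test below.
import numpy as np
import paddle.fluid.core as core

scope = core.Scope()
place = core.CPUPlace()

# Dense gradient: an ordinary LoDTensor.
dense_grad = scope.var('Grad').get_tensor()
dense_grad.set(np.random.random((10, 12)).astype("float32"), place)

# Sparse gradient: a SelectedRows that stores values only for rows 0, 4 and 7
# of a height-10 parameter.
sparse_grad = scope.var('SparseGrad').get_selected_rows()
sparse_grad.set_height(10)
sparse_grad.set_rows([0, 4, 7])
sparse_grad.get_tensor().set(
    np.random.random((3, 12)).astype("float32"), place)
```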
......@@ -15,6 +15,8 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/for_range.h"
namespace paddle {
namespace operators {
......@@ -24,24 +26,97 @@ template <typename T, int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T>
class SparseFTRLFunctor {
private:
const T* g_;
const T* p_;
const T* s_acc_;
const T* lr_;
const T l1_;
const T l2_;
const T lr_power_;
const int64_t* rows_;
const int64_t row_numel_;
T* p_out_;
T* s_acc_out_;
T* l_acc_out_;
public:
SparseFTRLFunctor(const T* g, const T* p, const T* s_acc, const T* lr,
const T l1, const T l2, const T lr_power,
const int64_t* rows, int64_t row_numel, T* p_out,
T* s_acc_out, T* l_acc_out)
: g_(g),
p_(p),
s_acc_(s_acc),
lr_(lr),
l1_(l1),
l2_(l2),
lr_power_(lr_power),
rows_(rows),
row_numel_(row_numel),
p_out_(p_out),
s_acc_out_(s_acc_out),
l_acc_out_(l_acc_out) {}
inline HOSTDEVICE void operator()(size_t i) {
auto j = rows_[i / row_numel_] * row_numel_ + i % row_numel_;
const T g = g_[i];
const T p = p_[j];
const T s_acc = s_acc_[j];
const T lr = lr_[0];
auto new_acc = s_acc + g * g;
if (lr_power_ == static_cast<T>(-0.5)) {
l_acc_out_[j] += g - (std::sqrt(new_acc) - std::sqrt(s_acc)) / lr * p;
} else {
l_acc_out_[j] +=
g -
(std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) / lr *
p;
}
auto l_acc = l_acc_out_[j];
if (std::fabs(l_acc) > l1_) {
auto x = -l_acc;
if (l_acc >= static_cast<T>(0)) {
x += l1_;
} else {
x -= l1_;
}
auto y = static_cast<T>(2) * l2_;
if (lr_power_ == static_cast<T>(-0.5)) {
y += std::sqrt(new_acc) / lr;
} else {
y += std::pow(new_acc, -lr_power_) / lr;
}
auto pre_shrink = x / y;
p_out_[j] = pre_shrink;
} else {
p_out_[j] = static_cast<T>(0);
}
s_acc_out_[j] += g * g;
}
};
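For readability, the per-element update the functor performs can be restated as plain scalar code. The sketch below is only a restatement of the arithmetic above (the standalone function and its name are illustrative, not part of the operator); like the kernel, it treats the returned accumulators as the new LinearAccumOut and SquaredAccumOut values.

```python
# Scalar restatement (sketch only) of the update SparseFTRLFunctor applies to
# a single parameter entry p with gradient g, squared accumulator n and
# linear accumulator z; lr is the learning rate.
import math

def ftrl_update_one(p, g, n, z, lr, l1, l2, lr_power):
    new_n = n + g * g
    if lr_power == -0.5:
        sigma = (math.sqrt(new_n) - math.sqrt(n)) / lr
        y = math.sqrt(new_n) / lr + 2.0 * l2
    else:
        sigma = (new_n ** -lr_power - n ** -lr_power) / lr
        y = new_n ** -lr_power / lr + 2.0 * l2
    z = z + g - sigma * p                      # new LinearAccumulator value
    if abs(z) > l1:
        p = (math.copysign(l1, z) - z) / y     # shrink towards zero
    else:
        p = 0.0
    n = n + g * g                              # new SquaredAccumulator value
    return p, n, z                             # ParamOut, SquaredAccumOut, LinearAccumOut
```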
template <typename DeviceContext, typename T>
class FTRLOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto* param_var = ctx.InputVar("Param");
PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Param").front(),
framework::ToTypeName(param_var->Type())));
const auto* grad_var = ctx.InputVar("Grad");
PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
platform::errors::InvalidArgument(
"The Var(%s)'s type should be LoDTensor, "
"but the received is %s",
ctx.InputNames("Grad").front(),
framework::ToTypeName(grad_var->Type())));
auto* lr_in = ctx.Input<Tensor>("LearningRate");
auto* param_in = ctx.Input<Tensor>("Param");
auto* sq_accum_in = ctx.Input<Tensor>("SquaredAccumulator");
auto* lin_accum_in = ctx.Input<Tensor>("LinearAccumulator");
auto* param_out = ctx.Output<Tensor>("ParamOut");
auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
......@@ -51,59 +126,88 @@ class FTRLOpKernel : public framework::OpKernel<T> {
sq_accum_out->mutable_data<T>(ctx.GetPlace());
lin_accum_out->mutable_data<T>(ctx.GetPlace());
-    auto grad = ctx.Input<Tensor>("Grad");
-    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
-    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
auto l1 = static_cast<T>(ctx.Attr<float>("l1")) + static_cast<T>(1e-10);
auto l2 = static_cast<T>(ctx.Attr<float>("l2")) + static_cast<T>(1e-10);
auto lr_power = static_cast<T>(ctx.Attr<float>("lr_power"));
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto sq_accum =
-        EigenVector<T>::Flatten(*ctx.Input<Tensor>("SquaredAccumulator"));
-    auto lin_accum =
-        EigenVector<T>::Flatten(*ctx.Input<Tensor>("LinearAccumulator"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
-    auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-    auto new_accum = sq_accum + g * g;
-    // Special case for lr_power = -0.5
-    if (lr_power == static_cast<T>(-0.5)) {
-      l_acc_out.device(place) =
-          lin_accum + g -
-          ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p;
if (grad_var->IsType<framework::LoDTensor>()) {
auto grad = ctx.Input<Tensor>("Grad");
auto g = EigenVector<T>::Flatten(*grad);
auto p = EigenVector<T>::Flatten(*param_in);
auto sq_accum = EigenVector<T>::Flatten(*sq_accum_in);
auto lin_accum = EigenVector<T>::Flatten(*lin_accum_in);
auto lr = EigenVector<T>::Flatten(*lr_in);
auto p_out = EigenVector<T>::Flatten(*param_out);
auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
auto& place =
*ctx.template device_context<DeviceContext>().eigen_device();
Eigen::DSizes<int, 1> grad_dsize(grad->numel());
auto new_accum = sq_accum + g * g;
// Special case for lr_power = -0.5
if (lr_power == static_cast<T>(-0.5)) {
l_acc_out.device(place) =
lin_accum + g -
((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) *
p;
} else {
l_acc_out.device(place) =
lin_accum + g -
((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
lr.broadcast(grad_dsize)) *
p;
}
auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
if (lr_power == static_cast<T>(-0.5)) {
auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
l_acc_out.constant(static_cast<T>(2) * l2);
auto pre_shrink = x / y;
p_out.device(place) =
(l_acc_out.abs() > l_acc_out.constant(l1))
.select(pre_shrink, p.constant(static_cast<T>(0)));
} else {
auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
l_acc_out.constant(static_cast<T>(2) * l2);
auto pre_shrink = x / y;
p_out.device(place) =
(l_acc_out.abs() > l_acc_out.constant(l1))
.select(pre_shrink, p.constant(static_cast<T>(0)));
}
s_acc_out.device(place) = sq_accum + g * g;
} else if (grad_var->IsType<framework::SelectedRows>()) {
auto grad = ctx.Input<framework::SelectedRows>("Grad");
framework::SelectedRows tmp_merged_grad;
framework::SelectedRows* merged_grad = &tmp_merged_grad;
math::scatter::MergeAdd<DeviceContext, T> merge_func;
merge_func(ctx.template device_context<DeviceContext>(), *grad,
merged_grad);
const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
auto row_numel = static_cast<int64_t>(merged_grad->value().dims()[1]);
auto row_height = static_cast<int64_t>(merged_grad->rows().size());
platform::ForRange<DeviceContext> for_range(
static_cast<const DeviceContext&>(ctx.device_context()),
row_numel * row_height);
SparseFTRLFunctor<T> functor(
merged_grad->value().data<T>(), param_in->data<T>(),
sq_accum_in->data<T>(), lr_in->data<T>(), l1, l2, lr_power, rows,
row_numel, param_out->mutable_data<T>(ctx.GetPlace()),
sq_accum_out->mutable_data<T>(ctx.GetPlace()),
lin_accum_out->mutable_data<T>(ctx.GetPlace()));
for_range(functor);
} else {
-      l_acc_out.device(place) =
-          lin_accum + g -
-          ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
-           lr.broadcast(grad_dsize)) *
-              p;
PADDLE_THROW(platform::errors::InvalidArgument(
"Unsupported Variable Type of Grad"));
}
-    auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
-    if (lr_power == static_cast<T>(-0.5)) {
-      auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
-               l_acc_out.constant(static_cast<T>(2) * l2);
-      auto pre_shrink = x / y;
-      p_out.device(place) =
-          (l_acc_out.abs() > l_acc_out.constant(l1))
-              .select(pre_shrink, p.constant(static_cast<T>(0)));
-    } else {
-      auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
-               l_acc_out.constant(static_cast<T>(2) * l2);
-      auto pre_shrink = x / y;
-      p_out.device(place) =
-          (l_acc_out.abs() > l_acc_out.constant(l1))
-              .select(pre_shrink, p.constant(static_cast<T>(0)));
-    }
-    s_acc_out.device(place) = sq_accum + g * g;
}
};
......
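In the SelectedRows branch above, MergeAdd first folds duplicate row ids of the sparse gradient into a single row each, and ForRange then runs the functor once per merged gradient element. The key detail is how the flat loop index is mapped back to an offset inside the dense parameter; the snippet below is an illustrative sketch of that arithmetic only (row ids and row width borrowed from the unit test that follows), not Paddle code.

```python
# Sketch of the index mapping used by SparseFTRLFunctor: a flat index i over
# the merged gradient buffer (row-major) becomes an offset j in the full
# parameter tensor via j = rows[i / row_numel] * row_numel + i % row_numel.
rows = [0, 4, 7]    # unique row ids after MergeAdd
row_numel = 12      # elements per row

def grad_to_param_offset(i):
    row_in_grad = i // row_numel      # which selected row the element belongs to
    col = i % row_numel               # column inside that row
    return rows[row_in_grad] * row_numel + col

# The first element of the second gradient row updates parameter row 4:
assert grad_to_param_offset(row_numel) == 4 * row_numel
```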
......@@ -16,16 +16,62 @@ from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.op import Operator
from op_test import OpTest
def ftrl_step(param, grad, rows, sq_accum, lin_accum, lr, l1, l2, lr_power):
l1 += 1e-10
l2 += 1e-10
param_hit = param[rows]
sq_accum_hit = sq_accum[rows]
lin_accum_hit = lin_accum[rows]
new_accum = sq_accum_hit + grad * grad
if lr_power == -0.5:
lin_accum_updated = lin_accum_hit + grad - (
(np.sqrt(new_accum) - np.sqrt(sq_accum_hit)) / lr) * param_hit
else:
lin_accum_updated = lin_accum_hit + grad - (
(np.power(new_accum, -lr_power) - np.power(sq_accum_hit, -lr_power)
) / lr) * param_hit
x = l1 * np.sign(lin_accum_updated) - lin_accum_updated
if lr_power == -0.5:
y = (np.sqrt(new_accum) / lr) + (2 * l2)
pre_shrink = x / y
param_updated = np.where(
np.abs(lin_accum_updated) > l1, pre_shrink, 0.0)
else:
y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
pre_shrink = x / y
param_updated = np.where(
np.abs(lin_accum_updated) > l1, pre_shrink, 0.0)
sq_accum_updated = sq_accum_hit + grad * grad
param_out = param.copy()
sq_accum_out = sq_accum.copy()
lin_accum_out = lin_accum.copy()
for i in range(len(rows)):
param_out[rows[i]] = param_updated[i]
sq_accum_out[rows[i]] = sq_accum_updated[i]
lin_accum_out[rows[i]] = lin_accum_updated[i]
return param_out, sq_accum_out, lin_accum_out
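As a quick illustration of the reference helper above (not part of the test file): applying one step where the gradient only covers rows 1 and 3 of a small parameter leaves every other row of the returned arrays equal to its input. The shapes and hyper-parameters here are arbitrary.

```python
# Illustrative use of ftrl_step (assumes the helper defined above is in scope).
import numpy as np

param = np.zeros((5, 3), dtype="float32")
sq_accum = np.full((5, 3), 0.1, dtype="float32")
lin_accum = np.full((5, 3), 0.1, dtype="float32")
rows = [1, 3]                                   # rows touched by the gradient
grad = np.random.random((len(rows), 3)).astype("float32")
lr = np.array([0.01], dtype="float32")

p_out, sq_out, lin_out = ftrl_step(param, grad, rows, sq_accum, lin_accum,
                                   lr, 0.1, 0.2, -0.5)

# Rows not listed in `rows` are left untouched.
assert np.allclose(p_out[[0, 2, 4]], param[[0, 2, 4]])
assert np.allclose(sq_out[[0, 2, 4]], sq_accum[[0, 2, 4]])
```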
class TestFTRLOp(OpTest):
def setUp(self):
self.op_type = "ftrl"
-        w = np.random.random((102, 105)).astype("float32")
-        g = np.random.random((102, 105)).astype("float32")
-        sq_accum = np.full((102, 105), 0.1).astype("float32")
-        linear_accum = np.full((102, 105), 0.1).astype("float32")
rows = 102
w = np.random.random((rows, 105)).astype("float32")
g = np.random.random((rows, 105)).astype("float32")
sq_accum = np.full((rows, 105), 0.1).astype("float32")
linear_accum = np.full((rows, 105), 0.1).astype("float32")
lr = np.array([0.01]).astype("float32")
l1 = 0.1
l2 = 0.2
......@@ -44,35 +90,115 @@ class TestFTRLOp(OpTest):
'lr_power': lr_power,
'learning_rate': lr
}
-        new_accum = sq_accum + g * g
-        if lr_power == -0.5:
-            linear_out = linear_accum + g - (
-                (np.sqrt(new_accum) - np.sqrt(sq_accum)) / lr) * w
-        else:
-            linear_out = linear_accum + g - ((np.power(
-                new_accum, -lr_power) - np.power(sq_accum, -lr_power)) / lr) * w
-        x = (l1 * np.sign(linear_out) - linear_out)
-        if lr_power == -0.5:
-            y = (np.sqrt(new_accum) / lr) + (2 * l2)
-            pre_shrink = x / y
-            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
-        else:
-            y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
-            pre_shrink = x / y
-            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
-        sq_accum_out = sq_accum + g * g
param_out, sq_accum_out, lin_accum_out = ftrl_step(
w, g, range(rows), sq_accum, linear_accum, lr, l1, l2, lr_power)
self.outputs = {
'ParamOut': param_out,
'SquaredAccumOut': sq_accum_out,
-            'LinearAccumOut': linear_out
'LinearAccumOut': lin_accum_out
}
def test_check_output(self):
self.check_output()
class TestSparseFTRLOp(unittest.TestCase):
def setUp(self):
self.lr_power = -0.5
def check_with_place(self, place):
self.init_kernel()
scope = core.Scope()
height = 10
rows = [0, 4, 7]
row_numel = 12
l1 = 0.1
l2 = 0.2
lr_power = self.lr_power
# create and initialize Param Variable
param = scope.var('Param').get_tensor()
param_array = np.random.random((height, row_numel)).astype("float32")
param.set(param_array, place)
# create and initialize Grad Variable
grad = scope.var('Grad').get_selected_rows()
grad.set_height(height)
grad.set_rows(rows)
grad_array = np.random.random((len(rows), row_numel)).astype("float32")
grad_tensor = grad.get_tensor()
grad_tensor.set(grad_array, place)
# create and initialize SquaredAccumulator Variable
sq_accum = scope.var('SquaredAccumulator').get_tensor()
sq_accum_array = np.full((height, row_numel), 0.1).astype("float32")
sq_accum.set(sq_accum_array, place)
# create and initialize LinearAccumulator Variable
lin_accum = scope.var('LinearAccumulator').get_tensor()
lin_accum_array = np.full((height, row_numel), 0.1).astype("float32")
lin_accum.set(lin_accum_array, place)
        # create and initialize LearningRate Variable
lr = scope.var('LearningRate').get_tensor()
lr_array = np.array([0.01]).astype("float32")
lr.set(lr_array, place)
# calculate ground-truth answer
param_out, sq_accum_out, lin_accum_out = ftrl_step(
param_array, grad_array, rows, sq_accum_array, lin_accum_array, lr,
l1, l2, lr_power)
# create and run operator
op = Operator(
"ftrl",
Param='Param',
Grad='Grad',
ParamOut='Param',
SquaredAccumulator='SquaredAccumulator',
SquaredAccumOut='SquaredAccumulator',
LinearAccumulator='LinearAccumulator',
LinearAccumOut='LinearAccumulator',
LearningRate='LearningRate',
l1=l1,
l2=l2,
lr_power=lr_power)
op.run(scope, place)
# get and compare param result
param_array = np.array(param)
sq_accum_array = np.array(sq_accum)
lin_accum_array = np.array(lin_accum)
for i in range(height):
for j in range(row_numel):
self.assertAlmostEqual(
param_out[i][j], param_array[i][j], places=4)
self.assertAlmostEqual(
sq_accum_out[i][j], sq_accum_array[i][j], places=4)
self.assertAlmostEqual(
lin_accum_out[i][j], lin_accum_array[i][j], places=4)
def init_kernel(self):
pass
def test_sparse_ftrl(self):
places = [core.CPUPlace()]
if core.is_compiled_with_cuda():
places.append(core.CUDAPlace(0))
for place in places:
self.check_with_place(place)
class TestSparseFTRLOp2(TestSparseFTRLOp):
def init_kernel(self):
self.lr_power = -0.6
if __name__ == "__main__":
unittest.main()