From a6beb96dd0235c236336f2db31df875b33db6635 Mon Sep 17 00:00:00 2001
From: leesusu
Date: Wed, 3 Jun 2020 23:38:41 +0800
Subject: [PATCH] FTRL with sparse update, test=develop (#22092)

---
 paddle/fluid/operators/optimizers/ftrl_op.cc |  14 --
 paddle/fluid/operators/optimizers/ftrl_op.h  | 228 +++++++++++++-----
 .../fluid/tests/unittests/test_ftrl_op.py    | 174 +++++++++++--
 3 files changed, 316 insertions(+), 100 deletions(-)

diff --git a/paddle/fluid/operators/optimizers/ftrl_op.cc b/paddle/fluid/operators/optimizers/ftrl_op.cc
index 0c8e6c0b571..3bdafbb96d5 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.cc
+++ b/paddle/fluid/operators/optimizers/ftrl_op.cc
@@ -32,20 +32,6 @@ class FTRLOp : public framework::OperatorWithKernel {
     OP_INOUT_CHECK(ctx->HasInput("Grad"), "Input", "Grad", "FTRL");
     OP_INOUT_CHECK(ctx->HasInput("LearningRate"), "Input", "LearningRate",
                    "FTRL");
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Param").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Param").front(),
-            ctx->GetInputsVarType("Param").front()));
-    PADDLE_ENFORCE_EQ(
-        ctx->GetInputsVarType("Grad").front(),
-        framework::proto::VarType::LOD_TENSOR,
-        platform::errors::InvalidArgument(
-            "The input var's type should be LoDTensor, but the received is %s",
-            ctx->Inputs("Grad").front(),
-            ctx->GetInputsVarType("Grad").front()));
 
     OP_INOUT_CHECK(ctx->HasOutput("ParamOut"), "Output", "ParamOut", "FTRL");
     OP_INOUT_CHECK(ctx->HasOutput("SquaredAccumOut"), "Output",
diff --git a/paddle/fluid/operators/optimizers/ftrl_op.h b/paddle/fluid/operators/optimizers/ftrl_op.h
index 5c7ac48663b..6bf8c8d724f 100644
--- a/paddle/fluid/operators/optimizers/ftrl_op.h
+++ b/paddle/fluid/operators/optimizers/ftrl_op.h
@@ -15,6 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/selected_rows_functor.h"
+#include "paddle/fluid/platform/for_range.h"
 
 namespace paddle {
 namespace operators {
@@ -24,24 +26,97 @@
 template <typename T, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
 
+template <typename T>
+class SparseFTRLFunctor {
+ private:
+  const T* g_;
+  const T* p_;
+  const T* s_acc_;
+  const T* l_acc_;
+  const T* lr_;
+  const T l1_;
+  const T l2_;
+  const T lr_power_;
+  const int64_t* rows_;
+  const int64_t row_numel_;
+  T* p_out_;
+  T* s_acc_out_;
+  T* l_acc_out_;
+
+ public:
+  SparseFTRLFunctor(const T* g, const T* p, const T* s_acc, const T* lr,
+                    const T l1, const T l2, const T lr_power,
+                    const int64_t* rows, int64_t row_numel, T* p_out,
+                    T* s_acc_out, T* l_acc_out)
+      : g_(g),
+        p_(p),
+        s_acc_(s_acc),
+        lr_(lr),
+        l1_(l1),
+        l2_(l2),
+        lr_power_(lr_power),
+        rows_(rows),
+        row_numel_(row_numel),
+        p_out_(p_out),
+        s_acc_out_(s_acc_out),
+        l_acc_out_(l_acc_out) {}
+
+  inline HOSTDEVICE void operator()(size_t i) {
+    auto j = rows_[i / row_numel_] * row_numel_ + i % row_numel_;
+    const T g = g_[i];
+    const T p = p_[j];
+    const T s_acc = s_acc_[j];
+    const T lr = lr_[0];
+
+    auto new_acc = s_acc + g * g;
+
+    if (lr_power_ == static_cast<T>(-0.5)) {
+      l_acc_out_[j] += g - (std::sqrt(new_acc) - std::sqrt(s_acc)) / lr * p;
+    } else {
+      l_acc_out_[j] +=
+          g -
+          (std::pow(new_acc, -lr_power_) - std::pow(s_acc, -lr_power_)) / lr *
+              p;
+    }
+
+    auto l_acc = l_acc_out_[j];
+
+    if (std::fabs(l_acc) > l1_) {
+      auto x = -l_acc;
+      if (l_acc >= static_cast<T>(0)) {
+        x += l1_;
+      } else {
+        x -= l1_;
+      }
+
+      auto y = static_cast<T>(2) * l2_;
+      if (lr_power_ == static_cast<T>(-0.5)) {
+        y += std::sqrt(new_acc) / lr;
+      } else {
+        y += std::pow(new_acc, -lr_power_) / lr;
+      }
+
+      auto pre_shrink = x / y;
+      p_out_[j] = pre_shrink;
+    } else {
+      p_out_[j] = static_cast<T>(0);
+    }
+
+    s_acc_out_[j] += g * g;
+  }
+};
+
 template <typename DeviceContext, typename T>
 class FTRLOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    const auto* param_var = ctx.InputVar("Param");
-    PADDLE_ENFORCE_EQ(param_var->IsType<framework::LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "The Var(%s)'s type should be LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Param").front(),
-                          framework::ToTypeName(param_var->Type())));
     const auto* grad_var = ctx.InputVar("Grad");
-    PADDLE_ENFORCE_EQ(grad_var->IsType<framework::LoDTensor>(), true,
-                      platform::errors::InvalidArgument(
-                          "The Var(%s)'s type should be LoDTensor, "
-                          "but the received is %s",
-                          ctx.InputNames("Grad").front(),
-                          framework::ToTypeName(grad_var->Type())));
+
+    auto* lr_in = ctx.Input<Tensor>("LearningRate");
+
+    auto* param_in = ctx.Input<Tensor>("Param");
+    auto* sq_accum_in = ctx.Input<Tensor>("SquaredAccumulator");
+    auto* lin_accum_in = ctx.Input<Tensor>("LinearAccumulator");
 
     auto* param_out = ctx.Output<Tensor>("ParamOut");
     auto* sq_accum_out = ctx.Output<Tensor>("SquaredAccumOut");
@@ -51,59 +126,88 @@ class FTRLOpKernel : public framework::OpKernel<T> {
     sq_accum_out->mutable_data<T>(ctx.GetPlace());
     lin_accum_out->mutable_data<T>(ctx.GetPlace());
 
-    auto grad = ctx.Input<Tensor>("Grad");
-
-    auto l1 = static_cast<T>(ctx.Attr<float>("l1"));
-    auto l2 = static_cast<T>(ctx.Attr<float>("l2"));
+    auto l1 = static_cast<T>(ctx.Attr<float>("l1")) + static_cast<T>(1e-10);
+    auto l2 = static_cast<T>(ctx.Attr<float>("l2")) + static_cast<T>(1e-10);
     auto lr_power = static_cast<T>(ctx.Attr<float>("lr_power"));
 
-    auto p = EigenVector<T>::Flatten(*ctx.Input<Tensor>("Param"));
-    auto sq_accum =
-        EigenVector<T>::Flatten(*ctx.Input<Tensor>("SquaredAccumulator"));
-    auto lin_accum =
-        EigenVector<T>::Flatten(*ctx.Input<Tensor>("LinearAccumulator"));
-    auto g = EigenVector<T>::Flatten(*grad);
-    auto lr = EigenVector<T>::Flatten(*ctx.Input<Tensor>("LearningRate"));
-
-    auto p_out = EigenVector<T>::Flatten(*param_out);
-    auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
-    auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
-    auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
-
-    Eigen::DSizes<int, 1> grad_dsize(grad->numel());
-
-    auto new_accum = sq_accum + g * g;
-    // Special case for lr_power = -0.5
-    if (lr_power == static_cast<T>(-0.5)) {
-      l_acc_out.device(place) =
-          lin_accum + g -
-          ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) * p;
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      auto grad = ctx.Input<Tensor>("Grad");
+      auto g = EigenVector<T>::Flatten(*grad);
+
+      auto p = EigenVector<T>::Flatten(*param_in);
+      auto sq_accum = EigenVector<T>::Flatten(*sq_accum_in);
+      auto lin_accum = EigenVector<T>::Flatten(*lin_accum_in);
+      auto lr = EigenVector<T>::Flatten(*lr_in);
+
+      auto p_out = EigenVector<T>::Flatten(*param_out);
+      auto s_acc_out = EigenVector<T>::Flatten(*sq_accum_out);
+      auto l_acc_out = EigenVector<T>::Flatten(*lin_accum_out);
+      auto& place =
+          *ctx.template device_context<DeviceContext>().eigen_device();
+
+      Eigen::DSizes<int, 1> grad_dsize(grad->numel());
+
+      auto new_accum = sq_accum + g * g;
+      // Special case for lr_power = -0.5
+      if (lr_power == static_cast<T>(-0.5)) {
+        l_acc_out.device(place) =
+            lin_accum + g -
+            ((new_accum.sqrt() - sq_accum.sqrt()) / lr.broadcast(grad_dsize)) *
+                p;
+      } else {
+        l_acc_out.device(place) =
+            lin_accum + g -
+            ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
+             lr.broadcast(grad_dsize)) *
+                p;
+      }
+
+      auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
+      if (lr_power == static_cast<T>(-0.5)) {
+        auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
+                 l_acc_out.constant(static_cast<T>(2) * l2);
+        auto pre_shrink = x / y;
+        p_out.device(place) =
+            (l_acc_out.abs() > l_acc_out.constant(l1))
+                .select(pre_shrink, p.constant(static_cast<T>(0)));
+      } else {
+        auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
+                 l_acc_out.constant(static_cast<T>(2) * l2);
+        auto pre_shrink = x / y;
+        p_out.device(place) =
+            (l_acc_out.abs() > l_acc_out.constant(l1))
+                .select(pre_shrink, p.constant(static_cast<T>(0)));
+      }
+
+      s_acc_out.device(place) = sq_accum + g * g;
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      auto grad = ctx.Input<framework::SelectedRows>("Grad");
+
+      framework::SelectedRows tmp_merged_grad;
+      framework::SelectedRows* merged_grad = &tmp_merged_grad;
+      math::scatter::MergeAdd<DeviceContext, T> merge_func;
+      merge_func(ctx.template device_context<DeviceContext>(), *grad,
+                 merged_grad);
+
+      const int64_t* rows = merged_grad->rows().Data(ctx.GetPlace());
+      auto row_numel = static_cast<int64_t>(merged_grad->value().dims()[1]);
+      auto row_height = static_cast<int64_t>(merged_grad->rows().size());
+
+      platform::ForRange<DeviceContext> for_range(
+          static_cast<const DeviceContext&>(ctx.device_context()),
+          row_numel * row_height);
+
+      SparseFTRLFunctor<T> functor(
+          merged_grad->value().data<T>(), param_in->data<T>(),
+          sq_accum_in->data<T>(), lr_in->data<T>(), l1, l2, lr_power, rows,
+          row_numel, param_out->mutable_data<T>(ctx.GetPlace()),
+          sq_accum_out->mutable_data<T>(ctx.GetPlace()),
+          lin_accum_out->mutable_data<T>(ctx.GetPlace()));
+      for_range(functor);
     } else {
-      l_acc_out.device(place) =
-          lin_accum + g -
-          ((new_accum.pow(-lr_power) - sq_accum.pow(-lr_power)) /
-           lr.broadcast(grad_dsize)) *
-              p;
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupported Variable Type of Grad"));
     }
-
-    auto x = (l_acc_out.constant(l1) * l_acc_out.sign() - l_acc_out);
-    if (lr_power == static_cast<T>(-0.5)) {
-      auto y = (new_accum.sqrt() / lr.broadcast(grad_dsize)) +
-               l_acc_out.constant(static_cast<T>(2) * l2);
-      auto pre_shrink = x / y;
-      p_out.device(place) =
-          (l_acc_out.abs() > l_acc_out.constant(l1))
-              .select(pre_shrink, p.constant(static_cast<T>(0)));
-    } else {
-      auto y = (new_accum.pow(-lr_power) / lr.broadcast(grad_dsize)) +
-               l_acc_out.constant(static_cast<T>(2) * l2);
-      auto pre_shrink = x / y;
-      p_out.device(place) =
-          (l_acc_out.abs() > l_acc_out.constant(l1))
-              .select(pre_shrink, p.constant(static_cast<T>(0)));
-    }
-
-    s_acc_out.device(place) = sq_accum + g * g;
   }
 };
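For readers skimming the new SelectedRows branch above, the sketch below is an illustrative numpy rendering (not the actual kernel and not the real math::scatter::MergeAdd; the helper name is made up) of the two ideas it relies on: duplicate rows of the sparse gradient are first summed into a single slice, and every flat index i visited by ForRange then maps to row rows[i // row_numel], column i % row_numel of the dense Param and accumulators, mirroring the index j computed in SparseFTRLFunctor::operator(). The dense LoDTensor branch keeps the previous Eigen implementation, apart from the new 1e-10 offset added to l1 and l2.

import numpy as np

def sparse_ftrl_visit(rows, grad_values, sq_accum):
    # Hypothetical helper, for illustration only.
    row_numel = grad_values.shape[1]
    uniq = sorted(set(rows))
    merged = np.zeros((len(uniq), row_numel), dtype=grad_values.dtype)
    for r, v in zip(rows, grad_values):   # MergeAdd: sum duplicate rows
        merged[uniq.index(r)] += v
    flat = merged.reshape(-1)
    for i in range(flat.size):            # ForRange over every merged element
        row, col = uniq[i // row_numel], i % row_numel
        g = flat[i]
        # The kernel applies the full FTRL formula at (row, col); only the
        # squared-gradient accumulation is shown here as a stand-in.
        sq_accum[row, col] += g * g
    return sq_accum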
diff --git a/python/paddle/fluid/tests/unittests/test_ftrl_op.py b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
index a6390b054f0..f58672a7a1e 100644
--- a/python/paddle/fluid/tests/unittests/test_ftrl_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
@@ -16,16 +16,62 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 from op_test import OpTest
 
 
+def ftrl_step(param, grad, rows, sq_accum, lin_accum, lr, l1, l2, lr_power):
+    l1 += 1e-10
+    l2 += 1e-10
+
+    param_hit = param[rows]
+    sq_accum_hit = sq_accum[rows]
+    lin_accum_hit = lin_accum[rows]
+
+    new_accum = sq_accum_hit + grad * grad
+    if lr_power == -0.5:
+        lin_accum_updated = lin_accum_hit + grad - (
+            (np.sqrt(new_accum) - np.sqrt(sq_accum_hit)) / lr) * param_hit
+    else:
+        lin_accum_updated = lin_accum_hit + grad - (
+            (np.power(new_accum, -lr_power) - np.power(sq_accum_hit, -lr_power)
+             ) / lr) * param_hit
+
+    x = l1 * np.sign(lin_accum_updated) - lin_accum_updated
+    if lr_power == -0.5:
+        y = (np.sqrt(new_accum) / lr) + (2 * l2)
+        pre_shrink = x / y
+        param_updated = np.where(
+            np.abs(lin_accum_updated) > l1, pre_shrink, 0.0)
+    else:
+        y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
+        pre_shrink = x / y
+        param_updated = np.where(
+            np.abs(lin_accum_updated) > l1, pre_shrink, 0.0)
+
+    sq_accum_updated = sq_accum_hit + grad * grad
+
+    param_out = param.copy()
+    sq_accum_out = sq_accum.copy()
+    lin_accum_out = lin_accum.copy()
+
+    for i in range(len(rows)):
+        param_out[rows[i]] = param_updated[i]
+        sq_accum_out[rows[i]] = sq_accum_updated[i]
+        lin_accum_out[rows[i]] = lin_accum_updated[i]
+
+    return param_out, sq_accum_out, lin_accum_out
+
+
 class TestFTRLOp(OpTest):
     def setUp(self):
         self.op_type = "ftrl"
-        w = np.random.random((102, 105)).astype("float32")
-        g = np.random.random((102, 105)).astype("float32")
-        sq_accum = np.full((102, 105), 0.1).astype("float32")
-        linear_accum = np.full((102, 105), 0.1).astype("float32")
+        rows = 102
+        w = np.random.random((rows, 105)).astype("float32")
+        g = np.random.random((rows, 105)).astype("float32")
+        sq_accum = np.full((rows, 105), 0.1).astype("float32")
+        linear_accum = np.full((rows, 105), 0.1).astype("float32")
         lr = np.array([0.01]).astype("float32")
         l1 = 0.1
         l2 = 0.2
@@ -44,35 +90,115 @@ class TestFTRLOp(OpTest):
             'lr_power': lr_power,
             'learning_rate': lr
         }
-        new_accum = sq_accum + g * g
-        if lr_power == -0.5:
-            linear_out = linear_accum + g - (
-                (np.sqrt(new_accum) - np.sqrt(sq_accum)) / lr) * w
-        else:
-            linear_out = linear_accum + g - ((np.power(
-                new_accum, -lr_power) - np.power(sq_accum, -lr_power)) / lr) * w
-
-        x = (l1 * np.sign(linear_out) - linear_out)
-        if lr_power == -0.5:
-            y = (np.sqrt(new_accum) / lr) + (2 * l2)
-            pre_shrink = x / y
-            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
-        else:
-            y = (np.power(new_accum, -lr_power) / lr) + (2 * l2)
-            pre_shrink = x / y
-            param_out = np.where(np.abs(linear_out) > l1, pre_shrink, 0.0)
-
-        sq_accum_out = sq_accum + g * g
+
+        param_out, sq_accum_out, lin_accum_out = ftrl_step(
+            w, g, range(rows), sq_accum, linear_accum, lr, l1, l2, lr_power)
 
         self.outputs = {
             'ParamOut': param_out,
             'SquaredAccumOut': sq_accum_out,
-            'LinearAccumOut': linear_out
+            'LinearAccumOut': lin_accum_out
         }
 
     def test_check_output(self):
        self.check_output()
 
 
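# A quick worked example (made-up numbers, for illustration only) of the l1
# shrinkage that ftrl_step above implements: with lr = 0.01, l1 = 0.1,
# l2 = 0.2, lr_power = -0.5, param = 0.5, sq_accum = lin_accum = 0.1 and a
# small gradient g = 0.05,
#   new_accum = 0.1 + 0.05**2 = 0.1025
#   lin_new   = 0.1 + 0.05 - (sqrt(0.1025) - sqrt(0.1)) / 0.01 * 0.5 ~ -0.046
# so |lin_new| < l1 and the parameter is set exactly to zero, while the
# squared accumulator becomes 0.1025. This zeroing is FTRL's
# sparsity-inducing behaviour; the same reference function drives both
# TestFTRLOp above and TestSparseFTRLOp below.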
+class TestSparseFTRLOp(unittest.TestCase):
+    def setUp(self):
+        self.lr_power = -0.5
+
+    def check_with_place(self, place):
+        self.init_kernel()
+        scope = core.Scope()
+
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+        l1 = 0.1
+        l2 = 0.2
+        lr_power = self.lr_power
+
+        # create and initialize Param Variable
+        param = scope.var('Param').get_tensor()
+        param_array = np.random.random((height, row_numel)).astype("float32")
+        param.set(param_array, place)
+
+        # create and initialize Grad Variable
+        grad = scope.var('Grad').get_selected_rows()
+        grad.set_height(height)
+        grad.set_rows(rows)
+        grad_array = np.random.random((len(rows), row_numel)).astype("float32")
+
+        grad_tensor = grad.get_tensor()
+        grad_tensor.set(grad_array, place)
+
+        # create and initialize SquaredAccumulator Variable
+        sq_accum = scope.var('SquaredAccumulator').get_tensor()
+        sq_accum_array = np.full((height, row_numel), 0.1).astype("float32")
+        sq_accum.set(sq_accum_array, place)
+
+        # create and initialize LinearAccumulator Variable
+        lin_accum = scope.var('LinearAccumulator').get_tensor()
+        lin_accum_array = np.full((height, row_numel), 0.1).astype("float32")
+        lin_accum.set(lin_accum_array, place)
+
+        # create and initialize LearningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.array([0.01]).astype("float32")
+        lr.set(lr_array, place)
+
+        # calculate ground-truth answer
+        param_out, sq_accum_out, lin_accum_out = ftrl_step(
+            param_array, grad_array, rows, sq_accum_array, lin_accum_array, lr,
+            l1, l2, lr_power)
+
+        # create and run operator
+        op = Operator(
+            "ftrl",
+            Param='Param',
+            Grad='Grad',
+            ParamOut='Param',
+            SquaredAccumulator='SquaredAccumulator',
+            SquaredAccumOut='SquaredAccumulator',
+            LinearAccumulator='LinearAccumulator',
+            LinearAccumOut='LinearAccumulator',
+            LearningRate='LearningRate',
+            l1=l1,
+            l2=l2,
+            lr_power=lr_power)
+
+        op.run(scope, place)
+
+        # get and compare the results
+        param_array = np.array(param)
+        sq_accum_array = np.array(sq_accum)
+        lin_accum_array = np.array(lin_accum)
+
+        for i in range(height):
+            for j in range(row_numel):
+                self.assertAlmostEqual(
+                    param_out[i][j], param_array[i][j], places=4)
+                self.assertAlmostEqual(
+                    sq_accum_out[i][j], sq_accum_array[i][j], places=4)
+                self.assertAlmostEqual(
+                    lin_accum_out[i][j], lin_accum_array[i][j], places=4)
+
+    def init_kernel(self):
+        pass
+
+    def test_sparse_ftrl(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+class TestSparseFTRLOp2(TestSparseFTRLOp):
+    def init_kernel(self):
+        self.lr_power = -0.6
+
+
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab
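For context on how the new sparse path is reached from user code: with the fluid 1.x Python API current at the time of this patch, an embedding created with is_sparse=True produces SelectedRows gradients, which the FTRL kernel now accepts instead of rejecting. The toy network below is a rough, illustrative sketch under those assumptions, not part of the change.

import numpy as np
import paddle.fluid as fluid

main_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(main_prog, startup_prog):
    ids = fluid.data(name='ids', shape=[None, 1], dtype='int64')
    # is_sparse=True makes the embedding gradient a SelectedRows variable,
    # which is routed to the new sparse FTRL branch.
    emb = fluid.layers.embedding(input=ids, size=[100, 8], is_sparse=True)
    loss = fluid.layers.reduce_mean(emb)
    fluid.optimizer.FtrlOptimizer(
        learning_rate=0.01, l1=0.1, l2=0.2, lr_power=-0.5).minimize(loss)

exe = fluid.Executor(fluid.CPUPlace())
exe.run(startup_prog)
exe.run(main_prog, feed={'ids': np.array([[1], [4], [7]], dtype='int64')})

Before this change, the LoDTensor-only type checks removed from ftrl_op.cc and ftrl_op.h would have rejected such a SelectedRows gradient.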