diff --git a/paddle/fluid/operators/momentum_op.cc b/paddle/fluid/operators/momentum_op.cc
index 5f43c5810812260c4384349bdb709716c9a182f5..d2c148c572bc5293ee6b8c88cc98fa8142c516f0 100644
--- a/paddle/fluid/operators/momentum_op.cc
+++ b/paddle/fluid/operators/momentum_op.cc
@@ -24,7 +24,7 @@ class MomentumOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
  protected:
-  void InferShape(framework::InferShapeContext *ctx) const override {
+  void InferShape(framework::InferShapeContext* ctx) const override {
     PADDLE_ENFORCE(ctx->HasInput("Param"),
                    "Input(param) of Momentum should not be null.");
     PADDLE_ENFORCE(ctx->HasInput("Grad"),
@@ -53,13 +53,30 @@ class MomentumOp : public framework::OperatorWithKernel {
     ctx->SetOutputDim("VelocityOut", param_dim);
   }
   framework::OpKernelType GetExpectedKernelType(
-      const framework::ExecutionContext &ctx) const override {
-    auto input_data_type =
-        framework::ToDataType(ctx.Input<framework::Tensor>("Param")->type());
+      const framework::ExecutionContext& ctx) const override {
+    auto input_data_type = framework::GetDataTypeOfVar(ctx.InputVar("Param"));
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
 
+class MomentumOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto input_var = op_desc.Input("Param")[0];
+    for (auto& out_var : op_desc.Output("ParamOut")) {
+      if (block->FindRecursiveOrCreateVar(input_var).GetType() ==
+          framework::proto::VarType::SELECTED_ROWS) {
+        block->FindRecursiveOrCreateVar(out_var).SetType(
+            framework::proto::VarType::SELECTED_ROWS);
+      } else {
+        block->FindRecursiveOrCreateVar(out_var).SetType(
+            framework::proto::VarType::LOD_TENSOR);
+      }
+    }
+  }
+};
+
 class MomentumOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
@@ -110,6 +127,8 @@ $$
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_WITHOUT_GRADIENT(momentum, ops::MomentumOp, ops::MomentumOpMaker);
+REGISTER_OPERATOR(momentum, ops::MomentumOp, ops::MomentumOpMaker,
+                  paddle::framework::EmptyGradOpMaker,
+                  ops::MomentumOpInferVarType);
 REGISTER_OP_CPU_KERNEL(momentum, ops::MomentumOpKernel<float>,
                        ops::MomentumOpKernel<double>);
diff --git a/paddle/fluid/operators/momentum_op.cu b/paddle/fluid/operators/momentum_op.cu
index a3932db1f3a50305d585cd3d5e86fa1b527df78b..a336f6e67117f5a0c3f761105a2fbbc662d2fbf6 100644
--- a/paddle/fluid/operators/momentum_op.cu
+++ b/paddle/fluid/operators/momentum_op.cu
@@ -42,32 +42,92 @@ __global__ void MomentumKernel(const T* p, const T* g, const T* v,
   }
 }
 
+template <typename T>
+__global__ void SparseMomentumKernel(const T* p, const T* g, const T* v,
+                                     const T* lr, const T mu,
+                                     const int64_t* grad_rows,
+                                     const size_t grad_row_numel,
+                                     const size_t grad_row_size,
+                                     const T use_nesterov, T* p_out,
+                                     T* v_out) {
+  for (int i = blockIdx.x; i < grad_row_size; i += gridDim.x) {
+    for (int j = threadIdx.x; j < grad_row_numel; j += blockDim.x) {
+      size_t p_i = grad_rows[i] * grad_row_numel + j;
+      size_t g_i = i * grad_row_numel + j;
+      v_out[g_i] = v[g_i] * mu + g[g_i];
+      if (use_nesterov) {
+        p_out[p_i] = p[p_i] - (g[g_i] + v_out[g_i] * mu) * lr[0];
+      } else {
+        p_out[p_i] = p[p_i] - v_out[g_i] * lr[0];
+      }
+    }
+  }
+}
+
 template <typename T>
 class MomentumOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
-    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
-    auto param = ctx.Input<framework::Tensor>("Param");
-    auto velocity = ctx.Input<framework::Tensor>("Velocity");
-    auto grad = ctx.Input<framework::Tensor>("Grad");
+    T mu = static_cast<T>(ctx.Attr<float>("mu"));
+    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto* velocity_var = ctx.InputVar("Velocity");
+    auto* grad_var = ctx.InputVar("Grad");
 
-    T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
-    T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      PADDLE_ENFORCE(velocity_var->IsType<framework::LoDTensor>(),
+                     "Unmatched Type of Param and Grad");
+      auto velocity = ctx.Input<framework::Tensor>("Velocity");
+      auto grad = ctx.Input<framework::Tensor>("Grad");
+      auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+      T* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+      T* v_out = velocity_out->mutable_data<T>(ctx.GetPlace());
+      auto* p = param->data<T>();
+      auto* v = velocity->data<T>();
+      auto* g = grad->data<T>();
+      auto* lr = learning_rate->data<T>();
 
-    T mu = static_cast<T>(ctx.Attr<float>("mu"));
-    bool use_nesterov = ctx.Attr<bool>("use_nesterov");
+      const int kThreadPerBlock = 256;
+      int grid = (param->numel() + kThreadPerBlock - 1) / kThreadPerBlock;
+      MomentumKernel<
+          T><<<grid, kThreadPerBlock, 0, ctx.cuda_device_context().stream()>>>(
+          p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // sparse update embedding with selectedrows
+      PADDLE_ENFORCE(velocity_var->IsType<framework::SelectedRows>(),
+                     "Unmatched Type of Param and Grad");
+      auto velocity = ctx.Input<framework::SelectedRows>("Velocity");
+      auto grad = ctx.Input<framework::SelectedRows>("Grad");
+      auto velocity_out = ctx.Output<framework::SelectedRows>("VelocityOut");
 
-    auto* p = param->data<T>();
-    auto* v = velocity->data<T>();
-    auto* g = grad->data<T>();
-    auto* lr = learning_rate->data<T>();
+      // sparse update maybe empty.
+      if (grad->rows().size() == 0) {
+        return;
+      }
+      PADDLE_ENFORCE(grad->height() == velocity->height(),
+                     "Unmatched gradient and velocity.");
+      auto* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+      auto* v_out =
+          velocity_out->mutable_value()->mutable_data<T>(ctx.GetPlace());
+      auto* lr = learning_rate->data<T>();
+      auto* p = param->data<T>();
+      auto* g = grad->value().data<T>();
+      auto* v = velocity->value().data<T>();
+      size_t grad_row_numel = grad->value().numel() / grad->rows().size();
+      size_t grad_row_size = grad->rows().size();
+      framework::Vector<int64_t> rows(grad->rows());
 
-    int block = 512;
-    int grid = (param->numel() + block - 1) / block;
-    MomentumKernel<T><<<grid, block, 0, ctx.cuda_device_context().stream()>>>(
-        p, g, v, lr, mu, param->numel(), use_nesterov, p_out, v_out);
+      const int kThreadPerBlock = 256;
+      int grid = (param->numel() + kThreadPerBlock - 1) / kThreadPerBlock;
+      SparseMomentumKernel<
+          T><<<grid, kThreadPerBlock, 0, ctx.cuda_device_context().stream()>>>(
+          p, g, v, lr, mu, rows.CUDAData(ctx.GetPlace()), grad_row_numel,
+          grad->rows().size(), use_nesterov, p_out, v_out);
+    } else {
+      PADDLE_THROW("Unsupported Variable Type of Grad");
+    }
   }
 };
diff --git a/paddle/fluid/operators/momentum_op.h b/paddle/fluid/operators/momentum_op.h
index 264726040fb566a52b8c0cdee0a1524197d2a675..aee6d094e1d3130b3edee107e178a48b118991d8 100644
--- a/paddle/fluid/operators/momentum_op.h
+++ b/paddle/fluid/operators/momentum_op.h
@@ -23,32 +23,74 @@ template <typename T>
 class MomentumOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
-    auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
-    auto param = ctx.Input<framework::Tensor>("Param");
-    auto velocity = ctx.Input<framework::Tensor>("Velocity");
-    auto grad = ctx.Input<framework::Tensor>("Grad");
-    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
-
-    param_out->mutable_data<T>(ctx.GetPlace());
-    velocity_out->mutable_data<T>(ctx.GetPlace());
-
     T mu = static_cast<T>(ctx.Attr<float>("mu"));
     bool use_nesterov = ctx.Attr<bool>("use_nesterov");
 
-    auto p_out = framework::EigenVector<T>::Flatten(*param_out);
-    auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
+    auto learning_rate = ctx.Input<framework::Tensor>("LearningRate");
+    auto param = ctx.Input<framework::Tensor>("Param");
+    auto param_out = ctx.Output<framework::Tensor>("ParamOut");
+    auto* velocity_var = ctx.InputVar("Velocity");
+    auto* grad_var = ctx.InputVar("Grad");
+    if (grad_var->IsType<framework::LoDTensor>()) {
+      PADDLE_ENFORCE(velocity_var->IsType<framework::LoDTensor>(),
+                     "Unmatched Type of Param and Grad");
+      auto velocity = ctx.Input<framework::Tensor>("Velocity");
+      auto grad = ctx.Input<framework::Tensor>("Grad");
+      auto velocity_out = ctx.Output<framework::Tensor>("VelocityOut");
+      param_out->mutable_data<T>(ctx.GetPlace());
+      velocity_out->mutable_data<T>(ctx.GetPlace());
+      auto p_out = framework::EigenVector<T>::Flatten(*param_out);
+      auto v_out = framework::EigenVector<T>::Flatten(*velocity_out);
+
+      auto p = framework::EigenVector<T>::Flatten(*param);
+      auto v = framework::EigenVector<T>::Flatten(*velocity);
+      auto g = framework::EigenVector<T>::Flatten(*grad);
+      auto* lr = learning_rate->data<T>();
+
+      v_out = v * mu + g;
+      if (use_nesterov) {
+        p_out = p - (g + v_out * mu) * lr[0];
+      } else {
+        p_out = p - lr[0] * v_out;
+      }
+    } else if (grad_var->IsType<framework::SelectedRows>()) {
+      // sparse update embedding with selectedrows
+      PADDLE_ENFORCE(velocity_var->IsType<framework::SelectedRows>(),
+                     "Unmatched Type of Param and Grad");
+      auto velocity = ctx.Input<framework::SelectedRows>("Velocity");
+      auto grad = ctx.Input<framework::SelectedRows>("Grad");
+      auto velocity_out = ctx.Output<framework::SelectedRows>("VelocityOut");
 
-    auto p = framework::EigenVector<T>::Flatten(*param);
-    auto v = framework::EigenVector<T>::Flatten(*velocity);
-    auto g = framework::EigenVector<T>::Flatten(*grad);
-    auto* lr = learning_rate->data<T>();
+      // sparse update maybe empty.
+      if (grad->rows().size() == 0) {
+        return;
+      }
+      PADDLE_ENFORCE(grad->height() == velocity->height(),
+                     "Unmatched gradient and velocity.");
+      auto* p_out = param_out->mutable_data<T>(ctx.GetPlace());
+      auto* v_out =
+          velocity_out->mutable_value()->mutable_data<T>(ctx.GetPlace());
+      auto* lr = learning_rate->data<T>();
+      auto* p = param->data<T>();
+      auto* g = grad->value().data<T>();
+      auto* v = velocity->value().data<T>();
+      size_t grad_row_numel = grad->value().numel() / grad->rows().size();
 
-    v_out = v * mu + g;
-    if (use_nesterov) {
-      p_out = p - (g + v_out * mu) * lr[0];
+      for (size_t i = 0; i < grad->rows().size(); ++i) {
+        size_t grad_row_index = grad->rows()[i];
+        for (size_t j = 0; j < grad_row_numel; ++j) {
+          size_t p_i = grad_row_index * grad_row_numel + j;
+          size_t g_i = i * grad_row_numel + j;
+          v_out[g_i] = v[g_i] * mu + g[g_i];
+          if (use_nesterov) {
+            p_out[p_i] = p[p_i] - (g[g_i] + v_out[g_i] * mu) * lr[0];
+          } else {
+            p_out[p_i] = p[p_i] - v_out[g_i] * lr[0];
+          }
+        }
+      }
     } else {
-      p_out = p - lr[0] * v_out;
+      PADDLE_THROW("Unsupported Variable Type of Grad");
     }
   }
 };
diff --git a/python/paddle/fluid/tests/unittests/test_momentum_op.py b/python/paddle/fluid/tests/unittests/test_momentum_op.py
index 7137fd0fdb7c503492107da684b95989037eb872..9bbffaa7ebbad39d203afe99dbc347237b4f7485 100644
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
@@ -16,6 +16,8 @@ from __future__ import print_function
 
 import unittest
 import numpy as np
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 from op_test import OpTest
 
 
@@ -88,5 +90,103 @@ class TestMomentumOp2(OpTest):
         self.check_output()
 
 
+class TestSparseMomentumOp(unittest.TestCase):
+    def setUp(self):
+        self.use_nesterov = False
+
+    def check_with_place(self, place):
+        self.init_kernel()
+        scope = core.Scope()
+        # create and initialize Grad Variable
+        height = 10
+        rows = [0, 4, 7]
+        row_numel = 12
+        mu = 1.0
+        use_nesterov = self.use_nesterov
+
+        # create and initialize Param Variable
+        param = scope.var('Param').get_tensor()
+        param_array = np.full((height, row_numel), 5.0).astype("float32")
+        param.set(param_array, place)
+        param_out = scope.var("ParamOut").get_tensor()
+        param_out_array = np.full((height, row_numel), 0.0).astype("float32")
+        param_out.set(param_out_array, place)
+
+        grad_selected_rows = scope.var('Grad').get_selected_rows()
+        grad_selected_rows.set_height(height)
+        grad_selected_rows.set_rows(rows)
+        grad_np_array = np.ones((len(rows), row_numel)).astype("float32")
+        grad_np_array[0, 0] = 2.0
+        grad_np_array[2, 8] = 4.0
+        grad_tensor = grad_selected_rows.get_tensor()
+        grad_tensor.set(grad_np_array, place)
+
+        velocity_selected_rows = scope.var('Velocity').get_selected_rows()
+        velocity_selected_rows.set_height(height)
+        velocity_selected_rows.set_rows(rows)
+        velocity_np_array = np.ones((len(rows), row_numel)).astype("float32")
+        velocity_np_array[0, 0] = 2.0
+        velocity_np_array[2, 8] = 2.0
+        velocity_tensor = velocity_selected_rows.get_tensor()
+        velocity_tensor.set(velocity_np_array, place)
+        velocity_out_selected_rows = scope.var('VelocityOut').get_selected_rows()
+        velocity_out_selected_rows.set_height(height)
+        velocity_out_selected_rows.set_rows(rows)
+        velocity_out_np_array = np.full((len(rows), row_numel),
+                                        0.0).astype("float32")
+        velocity_out_tensor = velocity_out_selected_rows.get_tensor()
+        velocity_out_tensor.set(velocity_out_np_array, place)
+
+        # create and initialize LearningRate Variable
+        lr = scope.var('LearningRate').get_tensor()
+        lr_array = np.full((1), 2.0).astype("float32")
+        lr.set(lr_array, place)
+
+        # create and run operator
+        op = Operator(
+            "momentum",
+            Param='Param',
+            Grad='Grad',
+            Velocity='Velocity',
+            ParamOut='ParamOut',
+            VelocityOut='VelocityOut',
+            LearningRate='LearningRate',
+            mu=mu,
+            use_nesterov=use_nesterov)
+        op.run(scope, place)
+
+        # get and compare result
+        param_out_np_array = np.array(param_out)
+        velocity_out_np_array = np.array(velocity_out_tensor)
+
+        # TODO(dzh): add a more suitable general numpy interface
+        # for sparse update.
+        _velocity_out = mu * velocity_np_array + grad_np_array
+        _param = param_array[rows]
+        if use_nesterov:
+            _param_out = _param - grad_np_array * lr_array - \
+                _velocity_out * mu * lr_array
+        else:
+            _param_out = _param - lr_array * _velocity_out
+        self.assertTrue((_param_out == param_out_np_array[rows]).all())
+        self.assertTrue((_velocity_out == velocity_out_np_array).all())
+
+    def init_kernel(self):
+        pass
+
+    def test_sparse_momentum(self):
+        places = [core.CPUPlace()]
+        if core.is_compiled_with_cuda():
+            places.append(core.CUDAPlace(0))
+        for place in places:
+            self.check_with_place(place)
+
+
+class TestSparseMomentumOp2(TestSparseMomentumOp):
+    def init_kernel(self):
+        self.use_nesterov = True
+
+
 if __name__ == "__main__":
     unittest.main()
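Note: the expected values computed at the end of TestSparseMomentumOp mirror the update performed by the new SelectedRows paths (SparseMomentumKernel on GPU and the row-indexed loop in momentum_op.h). As a rough NumPy sketch of that rule, applied only to the rows carried by the sparse gradient (the helper name and signature below are illustrative and not part of the patch):

    import numpy as np

    def sparse_momentum_reference(param, rows, grad, velocity, lr, mu,
                                  use_nesterov=False):
        # grad and velocity hold only the selected rows, shape (len(rows), row_numel);
        # the velocity update is v_out = mu * v + g on those rows.
        velocity_out = mu * velocity + grad
        # only the parameter rows named by the sparse gradient are touched
        if use_nesterov:
            param[rows] = param[rows] - (grad + velocity_out * mu) * lr
        else:
            param[rows] = param[rows] - lr * velocity_out
        return param, velocity_out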