// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/phi/kernels/sgd_kernel.h" namespace paddle { namespace operators { template class DGCMomentumKernel : public framework::OpKernel { public: DGCMomentumKernel() : _momentum_op_kernel(new MomentumOpKernel()) {} void Compute(const framework::ExecutionContext& context) const override { auto rampup_begin_step = context.Attr("rampup_begin_step"); if (static_cast(rampup_begin_step) < 0) { return; } auto current_step_tensor = context.Input("current_step"); auto* current_step = current_step_tensor->data(); // nranks auto nranks_tensor = context.Input("nranks"); const int nranks = static_cast(*nranks_tensor->data()); PADDLE_ENFORCE_GT( nranks, 1, platform::errors::InvalidArgument( "DGC is not useful when num_trainers <= 1, but now nranks=%d", nranks)); const framework::Tensor* g = context.Input("Grad"); framework::Tensor* g_out = context.Output("Grad_out"); auto g_e = framework::EigenVector::Flatten(*g); auto g_out_e = framework::EigenVector::Flatten(*g_out); auto& dev_ctx = context.template device_context(); auto& eigen_ctx = *dev_ctx.eigen_device(); // NOTE. In dgc_op we multi grad with nranks, so we need /nranks here. g_out_e.device(eigen_ctx) = (1.0 / nranks) * g_e; VLOG(10) << "current_step:" << *current_step << ", rampup_begin_step:" << rampup_begin_step; if (static_cast(*current_step) < static_cast(rampup_begin_step)) { VLOG(10) << " so use momentum optimizer"; return _momentum_op_kernel->Compute(context); } VLOG(10) << " so use sgd optimizer"; const auto* param_var = context.InputVar("Param"); const auto* grad_var = context.InputVar("Grad"); auto* learning_rate = context.Input("LearningRate"); bool multi_precision = context.Attr("multi_precision"); if (param_var->IsType()) { auto* param = context.Input("Param"); auto* param_out = context.Output("ParamOut"); auto* master_param_out = context.Output("MasterParamOut"); paddle::optional master_param_opt = paddle::none; if (multi_precision) { auto* master_param = context.Input("MasterParam"); master_param_opt = *master_param; } if (grad_var->IsType()) { // sgd_dense auto* grad = context.Input("Grad"); phi::SGDDenseKernel( static_cast::TYPE&>(dev_ctx), *param, *learning_rate, *grad, master_param_opt, multi_precision, param_out, master_param_out); } else { // sgd dense param sparse grad auto* grad = context.Input("Grad"); phi::SGDDenseParamSparseGradKernel( static_cast::TYPE&>(dev_ctx), *param, *learning_rate, *grad, master_param_opt, multi_precision, param_out, master_param_out); } } else if (param_var->IsType() && grad_var->IsType() && platform::is_cpu_place(context.GetPlace())) { // sgd sparse param sparse grad auto* param = context.Input("Param"); auto* param_out = context.Output("ParamOut"); auto* master_param_out = context.Output("MasterParamOut"); paddle::optional master_param_opt = paddle::none; if (multi_precision) { auto* master_param = context.Input("MasterParam"); master_param_opt = *master_param; } auto* grad = context.Input("Grad"); phi::SGDSparseParamSparseGradKernel( static_cast::TYPE&>(dev_ctx), *param, *learning_rate, *grad, master_param_opt, multi_precision, param_out, master_param_out); } else { PADDLE_THROW("gdc not support yet"); } } private: std::unique_ptr> _momentum_op_kernel; }; } // namespace operators } // namespace paddle