// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #pragma once #include #include "paddle/fluid/operators/optimizers/momentum_op.h" #include "paddle/fluid/operators/optimizers/sgd_op.h" namespace paddle { namespace operators { template class DGCMomentumKernel : public framework::OpKernel { public: DGCMomentumKernel() : _momentum_op_kernel(new MomentumOpKernel()), _sgd_op_kernel(new SGDOpKernel()) {} void Compute(const framework::ExecutionContext& context) const override { auto rampup_begin_step = context.Attr("rampup_begin_step"); if (static_cast(rampup_begin_step) < 0) { return; } auto current_step_tensor = context.Input("current_step"); auto* current_step = current_step_tensor->data(); // nranks auto nranks_tensor = context.Input("nranks"); const int nranks = static_cast(*nranks_tensor->data()); PADDLE_ENFORCE_GT( nranks, 1, platform::errors::InvalidArgument( "DGC is not useful when num_trainers <= 1, but now nranks=%d", nranks)); const framework::Tensor* g = context.Input("Grad"); framework::Tensor* g_out = context.Output("Grad_out"); auto g_e = framework::EigenVector::Flatten(*g); auto g_out_e = framework::EigenVector::Flatten(*g_out); auto& dev_ctx = context.template device_context(); auto& eigen_ctx = *dev_ctx.eigen_device(); // NOTE. In dgc_op we multi grad with nranks, so we need /nranks here. g_out_e.device(eigen_ctx) = (1.0 / nranks) * g_e; VLOG(10) << "current_step:" << *current_step << ", rampup_begin_step:" << rampup_begin_step; if (static_cast(*current_step) < static_cast(rampup_begin_step)) { VLOG(10) << " so use momentum optimizer"; return _momentum_op_kernel->Compute(context); } VLOG(10) << " so use sgd optimizer"; return _sgd_op_kernel->Compute(context); } private: std::unique_ptr> _momentum_op_kernel; std::unique_ptr> _sgd_op_kernel; }; } // namespace operators } // namespace paddle