/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/algorithm.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { using framework::Tensor; using framework::SelectedRows; struct NoNesterov; struct UseNesterov; template class CPUDenseMomentumFunctor { private: const Tensor* param; const Tensor* grad; const Tensor* velocity; const Tensor* learning_rate; const T mu; const T use_nesterov; Tensor* param_out; Tensor* velocity_out; public: CPUDenseMomentumFunctor(const Tensor* param, const Tensor* grad, const Tensor* velocity, const Tensor* learning_rate, const T mu, const bool use_nesterov, Tensor* param_out, Tensor* velocity_out) : param(param), grad(grad), velocity(velocity), learning_rate(learning_rate), mu(mu), use_nesterov(use_nesterov), param_out(param_out), velocity_out(velocity_out) {} inline void operator()() { auto p_out = framework::EigenVector::Flatten(*param_out); auto v_out = framework::EigenVector::Flatten(*velocity_out); auto p = framework::EigenVector::Flatten(*param); auto v = framework::EigenVector::Flatten(*velocity); auto g = framework::EigenVector::Flatten(*grad); auto* lr = learning_rate->data(); v_out = v * mu + g; if (use_nesterov) { p_out = p - (g + v_out * mu) * lr[0]; } else { p_out = p - lr[0] * v_out; } } }; template class DenseMomentumFunctor; // NOTE(dzh) for performance. // avoid if/else in inside kernel, implement GPU UseNesterov/NoNesterov as two // functor. template class DenseMomentumFunctor { private: const T* p_; const T* g_; const T* v_; const T* lr_; const T mu_; const int64_t num_; T* p_out_; T* v_out_; public: DenseMomentumFunctor(const T* p, const T* g, const T* v, const T* learning_rate, const T mu, const int64_t num, T* p_out, T* v_out) : p_(p), g_(g), v_(v), lr_(learning_rate), mu_(mu), num_(num), p_out_(p_out), v_out_(v_out) {} inline HOSTDEVICE void operator()(size_t i) const { // put memory access in register const T p = p_[i]; const T g = g_[i]; const T lr = lr_[0]; const T v = v_[i]; T v_out = v * mu_ + g; T p_out = p - (g + v_out * mu_) * lr; // write reigster to memory v_out_[i] = v_out; p_out_[i] = p_out; } }; template class DenseMomentumFunctor { private: const T* p_; const T* g_; const T* v_; const T* lr_; const T mu_; const int64_t num_; T* p_out_; T* v_out_; public: DenseMomentumFunctor(const T* p, const T* g, const T* v, const T* learning_rate, const T mu, const int64_t num, T* p_out, T* v_out) : p_(p), g_(g), v_(v), lr_(learning_rate), mu_(mu), num_(num), p_out_(p_out), v_out_(v_out) {} inline HOSTDEVICE void operator()(size_t i) const { // put memory access in register const T p = p_[i]; const T g = g_[i]; const T lr = lr_[0]; const T v = v_[i]; T v_out = v * mu_ + g; T p_out = p - lr * v_out; // write reigster to memory v_out_[i] = v_out; p_out_[i] = p_out; } }; // TODO(dzh): enhance speed use eigen // template // class CPUSparseMomentumFunctor { // private: // const T* p_; // const T* g_; // const T* v_; // const T* lr_; // const T mu_; // const bool use_nesterov_; // const int64_t* rows_; // const int64_t row_numel_; // const int64_t row_height_; // T* p_out_; // T* v_out_; // public: // CPUSparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr, // const T mu, const bool use_nesterov, const int64_t* rows, const int64_t // row_numel, const int64_t row_height, T* p_out, T* v_out) :p_(p), g_(g), // v_(v), lr_(lr), mu_(mu), rows_(rows), row_numel_(row_numel), // row_height_(row_height), p_out_(p_out), v_out_(v_out) {} // inline void operator()() { // } // }; template class SparseMomentumFunctor; template class SparseMomentumFunctor { private: const T* p_; const T* g_; const T* v_; const T* lr_; const T mu_; const int64_t* rows_; const int64_t row_numel_; const int64_t row_height_; T* p_out_; T* v_out_; public: SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr, const T mu, const int64_t* rows, int64_t row_numel, int64_t row_height, T* p_out, T* v_out) : p_(p), g_(g), v_(v), lr_(lr), mu_(mu), rows_(rows), row_numel_(row_numel), row_height_(row_height), p_out_(p_out), v_out_(v_out) {} inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; // put memory access in register const T p = p_[i]; const T lr = lr_[0]; const T v = v_[i]; T v_out = v * mu_ + g; T p_out = p - (g + v_out * mu_) * lr; // write reigster to memory v_out_[i] = v_out; p_out_[i] = p_out; } }; template class SparseMomentumFunctor { private: const T* p_; const T* g_; const T* v_; const T* lr_; const T mu_; const int64_t* rows_; const int64_t row_numel_; const int64_t row_height_; T* p_out_; T* v_out_; public: SparseMomentumFunctor(const T* p, const T* g, const T* v, const T* lr, const T mu, const int64_t* rows, int64_t row_numel, int64_t row_height, T* p_out, T* v_out) : p_(p), g_(g), v_(v), lr_(lr), mu_(mu), rows_(rows), row_numel_(row_numel), row_height_(row_height), p_out_(p_out), v_out_(v_out) {} inline HOSTDEVICE void operator()(size_t i) { auto row_idx = math::BinarySearch(rows_, row_height_, i / row_numel_); T g = row_idx >= 0 ? g_[row_idx * row_numel_ + i % row_numel_] : 0; // put memory access in register const T p = p_[i]; const T lr = lr_[0]; const T v = v_[i]; T v_out = v * mu_ + g; T p_out = p - v_out * lr; // write reigster to memory v_out_[i] = v_out; p_out_[i] = p_out; } }; template class MomentumOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { T mu = static_cast(ctx.Attr("mu")); bool use_nesterov = ctx.Attr("use_nesterov"); auto learning_rate = ctx.Input("LearningRate"); auto param = ctx.Input("Param"); auto param_out = ctx.Output("ParamOut"); auto* velocity = ctx.Input("Velocity"); auto velocity_out = ctx.Output("VelocityOut"); param_out->mutable_data(ctx.GetPlace()); velocity_out->mutable_data(ctx.GetPlace()); auto* grad_var = ctx.InputVar("Grad"); if (grad_var->IsType()) { auto grad = ctx.Input("Grad"); if (platform::is_cpu_place(ctx.GetPlace())) { CPUDenseMomentumFunctor functor(param, grad, velocity, learning_rate, mu, use_nesterov, param_out, velocity_out); functor(); } else if (platform::is_gpu_place(ctx.GetPlace())) { platform::ForRange for_range( static_cast(ctx.device_context()), param->numel()); if (use_nesterov) { DenseMomentumFunctor functor( param->data(), grad->data(), velocity->data(), learning_rate->data(), mu, param->numel(), param_out->mutable_data(ctx.GetPlace()), velocity_out->mutable_data(ctx.GetPlace())); for_range(functor); } else { DenseMomentumFunctor functor( param->data(), grad->data(), velocity->data(), learning_rate->data(), mu, param->numel(), param_out->mutable_data(ctx.GetPlace()), velocity_out->mutable_data(ctx.GetPlace())); for_range(functor); } } } else if (grad_var->IsType()) { // sparse update embedding with selectedrows auto grad = ctx.Input("Grad"); // sparse update maybe empty. if (grad->rows().size() == 0) { VLOG(3) << "Grad SelectedRows contains no data!"; return; } auto* merged_grad = const_cast(ctx.scope()) .Var() ->GetMutable(); math::scatter::MergeAdd merge_func; merge_func(ctx.template device_context(), *grad, merged_grad); platform::ForRange for_range( static_cast(ctx.device_context()), param->numel()); const int64_t* rows = nullptr; if (platform::is_gpu_place(ctx.GetPlace())) { rows = merged_grad->rows().CUDAData(ctx.GetPlace()); } else { rows = merged_grad->rows().data(); } if (use_nesterov) { SparseMomentumFunctor functor( param->data(), merged_grad->value().data(), velocity->data(), learning_rate->data(), mu, rows, static_cast(merged_grad->rows().size()), static_cast(merged_grad->height()), param_out->mutable_data(ctx.GetPlace()), velocity_out->mutable_data(ctx.GetPlace())); for_range(functor); } else { SparseMomentumFunctor functor( param->data(), merged_grad->value().data(), velocity->data(), learning_rate->data(), mu, rows, static_cast(merged_grad->rows().size()), static_cast(merged_grad->height()), param_out->mutable_data(ctx.GetPlace()), velocity_out->mutable_data(ctx.GetPlace())); for_range(functor); } } else { PADDLE_THROW("Unsupported Variable Type of Grad"); } } }; } // namespace operators } // namespace paddle