/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #pragma once #include #include #include #include #include #include #include #include #include #include #include #include "gflags/gflags.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/variable.h" #include "paddle/fluid/operators/distributed/communicator_common.h" #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/distributed/large_scale_kv.h" #include "paddle/fluid/operators/distributed/rpc_client.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/split.h" DECLARE_bool(communicator_is_sgd_optimizer); namespace paddle { namespace operators { namespace distributed { using Scope = framework::Scope; using Variable = framework::Variable; template class BlockingQueue { public: explicit BlockingQueue(size_t capacity) : capacity_(capacity) { PADDLE_ENFORCE_GT(capacity_, 0, "The capacity must be greater than 0."); } bool Push(const T &elem) { { std::unique_lock lock(mutex_); cv_.wait(lock, [&] { return queue_.size() < capacity_; }); PADDLE_ENFORCE_LT(queue_.size(), capacity_); queue_.push_back(elem); } cv_.notify_one(); return true; } bool Push(T &&elem) { { std::unique_lock lock(mutex_); cv_.wait(lock, [&] { return queue_.size() < capacity_; }); PADDLE_ENFORCE_LT(queue_.size(), capacity_); queue_.emplace_back(std::move(elem)); } cv_.notify_one(); return true; } T Pop() { std::unique_lock lock(mutex_); cv_.wait(lock, [=] { return !queue_.empty(); }); T rc(std::move(queue_.front())); queue_.pop_front(); cv_.notify_one(); return rc; } size_t Cap() const { std::lock_guard lock(mutex_); return capacity_; } size_t Size() const { std::lock_guard lock(mutex_); return queue_.size(); } private: const size_t capacity_; std::deque queue_; mutable std::mutex mutex_; std::condition_variable cv_; }; template using EigenVector = framework::EigenVector; template inline void MergeVars(const std::string &var_name, const std::vector> &vars, Scope *scope, bool merge_add = true) { PADDLE_ENFORCE(!vars.empty(), "should have value to merge!"); auto cpu_place = platform::CPUPlace(); auto &var0 = vars[0]; auto *out_var = scope->Var(var_name); if (var0->IsType()) { auto dims = var0->Get().dims(); VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims << "; merge add: " << merge_add; // init output tensor auto *out_t = out_var->GetMutable(); out_t->mutable_data(dims, cpu_place); // check the input dims for (auto &var : vars) { auto &var_t = var->Get(); PADDLE_ENFORCE_EQ(var_t.dims(), dims, "should have the same dims"); } // set output tensor to 0. auto cpu_ctx = paddle::platform::CPUDeviceContext(); math::SetConstant constant_functor; constant_functor(cpu_ctx, out_t, static_cast(0)); // sum all vars to out auto result = EigenVector::Flatten(*out_t); for (auto &var : vars) { auto &in_t = var->Get(); auto in = EigenVector::Flatten(in_t); result.device(*cpu_ctx.eigen_device()) = result + in; } if (!merge_add) { result.device(*cpu_ctx.eigen_device()) = result / static_cast(vars.size()); } } else if (var0->IsType()) { auto &slr0 = var0->Get(); auto *out_slr = out_var->GetMutable(); out_slr->mutable_rows()->clear(); out_slr->mutable_value()->mutable_data({{}}, cpu_place); std::vector inputs; inputs.reserve(vars.size()); for (auto &var : vars) { inputs.push_back(&var->Get()); } auto dev_ctx = paddle::platform::CPUDeviceContext(); if (merge_add) { math::scatter::MergeAdd merge_add; merge_add(dev_ctx, inputs, out_slr); } else { math::scatter::MergeAverage merge_average; merge_average(dev_ctx, inputs, out_slr); } VLOG(3) << "merge " << var_name << " SelectedRows height: " << slr0.height() << " dims: " << slr0.value().dims() << "; merge add: " << merge_add; } else { PADDLE_THROW("unsupported var type!"); } } using RpcCtxMap = std::unordered_map; using SparseValue = std::unordered_map>; class Communicator { public: Communicator(); explicit Communicator(const std::map &envs_) { for (auto &iter : envs_) { envs[iter.first] = iter.second; } } virtual ~Communicator() {} virtual void Start() = 0; virtual void Stop() = 0; virtual bool IsRunning() { return running_; } virtual void Clean() {} virtual void Send(const std::vector &var_names, const std::vector &var_tables, const framework::Scope &scope) = 0; virtual void RecvNoBarrier() {} virtual void Barrier() {} virtual void BarrierTriggerDecrement() {} virtual void BarrierTriggerReset(int init_counter) {} virtual void InitEnvs() = 0; virtual void InitImpl(const RpcCtxMap &send_varname_to_ctx, const RpcCtxMap &recv_varname_to_ctx, Scope *recv_scope) {} static Communicator *GetInstance() { return communicator_.get(); } static std::shared_ptr GetInstantcePtr() { return communicator_; } template static Communicator *InitInstance( const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, const std::map &envs) { std::call_once(init_flag_, &Communicator::InitWithRpcCtx, send_ctx, recv_ctx, recv_scope, std::ref(envs)); return communicator_.get(); } // Init is called by InitInstance. template static void InitWithRpcCtx(const RpcCtxMap &send_ctx, const RpcCtxMap &recv_ctx, Scope *recv_scope, const std::map &envs) { if (communicator_.get() == nullptr) { communicator_.reset(new T(std::ref(envs))); communicator_->InitEnvs(); communicator_->InitImpl(send_ctx, recv_ctx, recv_scope); } } protected: bool running_ = false; bool waiting_ = true; static std::shared_ptr communicator_; static std::once_flag init_flag_; std::unordered_map envs; }; class AsyncCommunicator : public Communicator { public: AsyncCommunicator() : Communicator() {} explicit AsyncCommunicator(const std::map &envs) : Communicator(envs) {} ~AsyncCommunicator(); void InitEnvs() { min_send_grad_num_before_recv_ = std::stoi(envs.at("communicator_min_send_grad_num_before_recv")); thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); need_global_step_ = static_cast(std::stoi(envs.at("need_global_step"))); VLOG(0) << "AsyncCommunicator Initialized"; } void Start() override; void Stop() override; void InitImpl(const RpcCtxMap &send_varname_to_ctx, const RpcCtxMap &recv_varname_to_ctx, Scope *recv_scope) override; void MainThread(); void Send(const std::vector &var_names, const std::vector &var_tables, const framework::Scope &scope) override; virtual void SendByCommunicator(int batches); virtual void SendGlobalStep(int batches); virtual void RecvByCommunicator(); virtual void RecvNoBarrier(); virtual int Meet(); virtual void BarrierSend() {} virtual void BarrierRecv() {} virtual void BarrierWeakUp() {} protected: int min_send_grad_num_before_recv_; int thread_pool_size_; int max_merge_var_num_; int send_wait_times_; int send_queue_size_; int trainer_id_ = 0; bool need_global_step_ = false; std::unordered_map>>> send_varname_to_queue_; RpcCtxMap send_varname_to_ctx_; RpcCtxMap recv_varname_to_ctx_; std::unique_ptr main_thread_{nullptr}; Scope *recv_scope_; // should be global scope std::unique_ptr send_scope_; // an independent scope std::unique_ptr<::ThreadPool> send_threadpool_{nullptr}; std::unique_ptr<::ThreadPool> recv_threadpool_{nullptr}; std::atomic_uint grad_num_{0}; // the num of gradient sent since last recv }; class HalfAsyncCommunicator : public AsyncCommunicator { public: HalfAsyncCommunicator() {} explicit HalfAsyncCommunicator(const std::map &envs) : AsyncCommunicator(envs) {} void InitEnvs() { min_send_grad_num_before_recv_ = 0; max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); need_global_step_ = static_cast(std::stoi(envs.at("need_global_step"))); VLOG(0) << "HalfAsyncCommunicator Initialized"; } void Clean() override; void Barrier() override; void BarrierTriggerDecrement() override; void BarrierTriggerReset(int initial_val) override; int Meet(); void BarrierWeakUp(); protected: // mutex for Wait for barrier std::mutex barrier_mutex_; std::condition_variable barrier_cond_; std::atomic barrier_trigger_{0}; std::atomic barrier_counter_{0}; }; class SyncCommunicator : public HalfAsyncCommunicator { public: SyncCommunicator() : HalfAsyncCommunicator() {} explicit SyncCommunicator(const std::map &envs) : HalfAsyncCommunicator(envs) {} void InitEnvs() { min_send_grad_num_before_recv_ = 0; max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); send_queue_size_ = std::stoi(envs.at("communicator_send_queue_size")); need_global_step_ = static_cast(std::stoi(envs.at("need_global_step"))); trainer_id_ = std::stoi(envs.at("trainer_id")); auto pserver_strings = envs.at("pserver_endpoints"); pserver_endpoints_ = paddle::string::Split(pserver_strings, ','); VLOG(0) << "SyncCommunicator Initialized"; } void BarrierSend(); void BarrierRecv(); private: std::vector pserver_endpoints_{}; }; class GeoCommunicator : public AsyncCommunicator { public: GeoCommunicator() : AsyncCommunicator() {} explicit GeoCommunicator(const std::map &envs) : AsyncCommunicator(envs) {} void InitImpl(const RpcCtxMap &send_varname_to_ctx, const RpcCtxMap &recv_varname_to_ctx, Scope *recv_scope) override; void InitEnvs() { min_send_grad_num_before_recv_ = 0; max_merge_var_num_ = std::stoi(envs.at("communicator_max_merge_var_num")); send_wait_times_ = std::stoi(envs.at("communicator_send_wait_times")); thread_pool_size_ = std::stoi(envs.at("communicator_thread_pool_size")); send_queue_size_ = max_merge_var_num_; trainers_ = std::stoi(envs.at("trainers")); sparse_attrs_ = envs.at("sparse_attrs"); VLOG(0) << "GeoCommunicator Initialized"; } void Send(const std::vector &var_names, const std::vector &var_tables, const framework::Scope &scope) override; void SendByCommunicator(int batches) override; void SendSparse(const std::string &varname, int batches); void SendDense(const std::string &varname); void SendGlobalStep(int batches) override {} void RecvByCommunicator() override; void RecvSparse(const std::string &varname); void RecvDense(const std::string &varname); void Init(); void InitSparse(); void InitDense(const std::string varname); private: int trainers_; std::string sparse_attrs_; // parameter for delta calc and send std::shared_ptr delta_scope_; // parameter for storage the pserver param after last recv std::shared_ptr old_scope_; // parameter on pserver std::shared_ptr pserver_scope_; std::unordered_map>>> send_ids_to_queue_; std::unordered_map> old_sparses_; }; } // namespace distributed } // namespace operators } // namespace paddle