// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Declares ProcessGroupHCCL, a ProcessGroup subclass whose backend name is
// "HCCL", together with its nested task type HCCLTask. Only declarations live
// here (plus the inline GetBackendName()); the definitions are elsewhere.
//
// NOTE(review): in this copy of the header every angle-bracketed span appears
// to be missing: the targets of the first six #include directives and all
// template argument / template parameter lists (e.g. `std::vector&`,
// `std::shared_ptr AllReduce(...)`, `template` with no parameter list).
// The code tokens below are left exactly as found. The missing spans must be
// restored from the authoritative version of this file before it can compile.
// TODO confirm against upstream.

#pragma once

// NOTE(review): the six directives below have lost their header names.
#include
#include
#include
#include
#include
#include

#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#include "paddle/fluid/platform/device_context.h"

#include "paddle/fluid/distributed/collective/HCCLTools.h"
#include "paddle/fluid/distributed/store/store.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/gen_comm_id_helper.h"
#include "paddle/fluid/platform/place.h"

// Backend identifier; this is the string ProcessGroupHCCL::GetBackendName()
// returns.
constexpr const char* HCCL_BACKEND_NAME = "HCCL";

namespace paddle {
namespace distributed {

// Short aliases for paddle::platform types.
using Place = paddle::platform::Place;
using NPUStream = platform::stream::NPUStream;
using NPUDeviceContext = paddle::platform::NPUDeviceContext;

// ProcessGroup implementation identified by the "HCCL" backend name.
//
// Declares overrides for AllReduce, Broadcast, Barrier, Send, Recv, AllGather,
// AllToAll, Reduce and Scatter. Judging by the NPU / HCCLTools includes this
// presumably drives NPU devices through the HCCL library -- confirm against
// the implementation file.
class ProcessGroupHCCL : public ProcessGroup {
 public:
  // Task type associated with operations of this process group. Derives from
  // ProcessGroup::Task and std::enable_shared_from_this (whose template
  // argument is missing here; that template takes the deriving class, so
  // presumably HCCLTask -- confirm).
  class HCCLTask : public ProcessGroup::Task,
                   public std::enable_shared_from_this {
   public:
    // Constructed from a list of places, the caller's rank, the communication
    // type and the input list. Element types of both vectors are not visible
    // in this copy.
    HCCLTask(const std::vector& places,
             int rank,
             CommType CommType,
             const std::vector& inputs);

    // Completion query; semantics are defined in the implementation file.
    bool IsCompleted();

    void SynchronizeStreams();

    // `timeout` defaults to kWaitTimeout, which is not declared in this
    // header (presumably inherited from ProcessGroup::Task -- confirm).
    bool Wait(std::chrono::milliseconds timeout = kWaitTimeout);

    void Synchronize();

    // Takes the output list by non-const reference.
    void SetOutputs(std::vector& outputs);  // NOLINT

    virtual ~HCCLTask();

    // Public data member; its element type is not visible in this copy.
    std::vector control_events_;

   protected:
    std::vector places_;
    std::vector> hcclComms_;
    std::shared_ptr> outputs_;

   private:
  };

  // Builds a process group from a shared store handle, this process's rank
  // and a size (presumably the number of ranks in the group -- confirm).
  ProcessGroupHCCL(const std::shared_ptr& store, int rank, int size);

  // Returns the backend name, "HCCL", as a std::string.
  const std::string GetBackendName() const override {
    return std::string(HCCL_BACKEND_NAME);
  }

  // Communication entry points overriding the ProcessGroup interface. Each
  // returns a std::shared_ptr (pointee type missing in this copy). Where an
  // options parameter is unnamed with a default, it defaults to a
  // default-constructed options object.
  std::shared_ptr AllReduce(
      std::vector& tensors,
      const AllreduceOptions& = AllreduceOptions()) override;

  std::shared_ptr Broadcast(
      std::vector& tensors,
      const BroadcastOptions& = BroadcastOptions()) override;

  std::shared_ptr Barrier(
      const BarrierOptions& = BarrierOptions()) override;

  std::shared_ptr Send(std::vector& tensors, int dst_rank) override;

  std::shared_ptr Recv(std::vector& tensors, int src_rank) override;

  std::shared_ptr AllGather(
      std::vector& in_tensors,
      std::vector& out_tensors) override;

  std::shared_ptr AllToAll(
      std::vector& in,
      std::vector& out) override;

  std::shared_ptr Reduce(
      std::vector& tensors,
      const ReduceOptions& opts) override;

  std::shared_ptr Scatter(std::vector& in_tensors,
                          std::vector& out_tensors,
                          const ScatterOptions&) override;

 protected:
  // Virtual factory for task objects, so subclasses can substitute their own.
  // Note that `places` is taken by value.
  virtual std::shared_ptr CreateTask(
      std::vector places,
      int rank,
      CommType opType,
      const std::vector& inputs);

  std::shared_ptr store_;
  std::shared_ptr hccl_comm_;
  // What this mutex guards is not visible in this header.
  std::mutex mutex_;

  // Lookup tables; key and mapped types are missing in this copy. The names
  // and CreateHCCLManagerCache(places_key, ...) suggest they are keyed by a
  // string describing a set of places -- confirm.
  std::unordered_map>> places_to_hcclcomm_;

  std::unordered_map> places_to_events_;

  std::unordered_map>> places_to_ctx_;

  std::set used_place_ids_;

 private:
  // Helpers taking a list of HCCL ids by non-const reference (element type
  // missing in this copy). BcastHCCLId additionally takes a root and a server
  // file descriptor.
  void BcastHCCLId(std::vector& hccl_ids,
                   int root,  // NOLINT
                   int server_fd);

  void BroadcastUniqueHCCLID(std::vector& hccl_ids);  // NOLINT

  // Function template (parameter list missing here; the `Fn fn` parameter
  // implies a type parameter named Fn). Presumably the shared implementation
  // behind the collective entry points above -- confirm in the .cc file.
  template
  std::shared_ptr Collective(
      std::vector& inputs,   // NOLINT
      std::vector& outputs,  // NOLINT
      Fn fn,
      CommType op_type);

  // Function template with the same stripped parameter list as Collective.
  // Presumably the shared implementation behind Send / Recv -- confirm.
  template
  std::shared_ptr PointToPoint(
      std::vector& tensors,  // NOLINT
      Fn fn,
      int dst_rank,
      CommType op_type);

  // Takes a string key and the corresponding list of places; by its name it
  // presumably populates the places_to_* tables above -- confirm.
  void CreateHCCLManagerCache(const std::string& places_key,
                              const std::vector& places);
};

}  // namespace distributed
}  // namespace paddle