From 847aa172ae650a05087d0ced0260b5cb7229f8ca Mon Sep 17 00:00:00 2001 From: liuyuhui Date: Tue, 29 Dec 2020 20:35:25 +0800 Subject: [PATCH] [Kunlun] 2.0 cherry-pick:Support for Baidu Kunlun XPU multi card training (#29713) * [Kunlun] PR1:Support one Kunlun card training in parallel executor (#29337) * [Kunlun] PR2: Support MultiDevicePass and BKCL in parallel executor (#29574) * [Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29926) * add bkcl.so in whl for kunlun (#29947) * [Kunlun] bug fix of PR2: Support MultiDevicePass and BKCL in parallel executor (#29961) Co-authored-by: QingshuChen --- CMakeLists.txt | 11 +- cmake/external/xpu.cmake | 14 + .../framework/details/all_reduce_op_handle.cc | 65 ++++ .../framework/details/all_reduce_op_handle.h | 18 +- .../fluid/framework/details/bkcl_op_handle.h | 131 ++++++++ .../framework/details/broadcast_op_handle.cc | 68 ++++- .../framework/details/broadcast_op_handle.h | 28 +- .../details/broadcast_op_handle_test.cc | 20 +- .../details/broadcast_op_handle_test.h | 64 +++- .../fluid/framework/details/build_strategy.cc | 56 +++- .../fluid/framework/details/build_strategy.h | 12 +- .../framework/details/execution_strategy.h | 6 +- .../fast_threaded_ssa_graph_executor.cc | 2 +- .../details/fused_all_reduce_op_handle.cc | 16 +- .../details/fused_all_reduce_op_handle.h | 9 + .../details/fused_broadcast_op_handle.h | 15 +- .../details/fused_broadcast_op_handle_test.cc | 38 ++- .../details/gather_op_handle_test.cc | 5 +- .../framework/details/multi_devices_helper.h | 1 + .../fluid/framework/details/op_handle_base.cc | 66 +++- .../fluid/framework/details/op_handle_base.h | 7 +- .../framework/details/reduce_op_handle.cc | 56 +++- .../framework/details/reduce_op_handle.h | 18 ++ .../details/reduce_op_handle_test.cc | 8 +- .../details/scale_loss_grad_op_handle.cc | 16 +- .../details/threaded_ssa_graph_executor.cc | 2 +- ...est_reference_count_pass_last_lived_ops.cc | 3 +- .../fuse_all_reduce_op_pass.cc | 21 ++ .../multi_devices_graph_pass.cc | 28 ++ .../multi_devices_graph_pass.h | 8 + paddle/fluid/framework/parallel_executor.cc | 288 +++++++++++++++--- paddle/fluid/framework/parallel_executor.h | 2 + paddle/fluid/framework/var_type_traits.cc | 4 + paddle/fluid/framework/var_type_traits.h | 11 + .../fluid/framework/var_type_traits_test.cc | 3 + paddle/fluid/platform/bkcl_helper.h | 280 +++++++++++++++++ paddle/fluid/platform/device_context.cc | 40 ++- paddle/fluid/platform/device_context.h | 37 ++- paddle/fluid/pybind/pybind.cc | 24 +- python/paddle/fluid/compiler.py | 37 ++- python/paddle/fluid/framework.py | 48 +++ .../unittests/parallel_executor_test_base.py | 33 +- .../fluid/tests/unittests/seresnext_net.py | 21 +- .../tests/unittests/seresnext_test_base.py | 23 +- .../unittests/test_fuse_all_reduce_pass.py | 36 +-- .../test_fuse_elewise_add_act_pass.py | 20 +- .../unittests/test_fuse_optimizer_pass.py | 38 +-- .../test_fuse_relu_depthwise_conv_pass.py | 18 +- .../tests/unittests/test_ir_inplace_pass.py | 4 +- .../test_ir_memory_optimize_ifelse_op.py | 2 +- .../unittests/test_ir_memory_optimize_pass.py | 18 +- .../test_ir_memory_optimize_transformer.py | 6 +- .../test_mix_precision_all_reduce_fuse.py | 8 +- .../unittests/test_parallel_executor_mnist.py | 72 +++-- .../unittests/test_parallel_executor_pg.py | 20 +- ...st_parallel_executor_seresnext_base_cpu.py | 7 +- ...st_parallel_executor_seresnext_base_gpu.py | 4 +- ...utor_seresnext_with_fuse_all_reduce_cpu.py | 5 +- ...utor_seresnext_with_fuse_all_reduce_gpu.py | 4 +- 
...llel_executor_seresnext_with_reduce_cpu.py | 43 +-- ...llel_executor_seresnext_with_reduce_gpu.py | 5 +- .../test_parallel_executor_transformer.py | 8 +- .../unittests/test_program_prune_backward.py | 4 +- .../tests/unittests/xpu/test_xpu_place.py | 47 +++ python/paddle/static/__init__.py | 3 +- python/setup.py.in | 4 + tools/wlist.json | 3 +- 67 files changed, 1720 insertions(+), 322 deletions(-) create mode 100644 paddle/fluid/framework/details/bkcl_op_handle.h create mode 100644 paddle/fluid/platform/bkcl_helper.h create mode 100644 python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py diff --git a/CMakeLists.txt b/CMakeLists.txt index dd7ac439f3..f738427769 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,7 +29,8 @@ include(generic) # simplify cmake module find_package(CUDA QUIET) option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND}) option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF) -option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF) +option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF) +option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF) if (WITH_GPU AND WITH_XPU) message(FATAL_ERROR "Error when compile GPU and XPU at the same time") endif() @@ -141,6 +142,7 @@ option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE} option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF) option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) +option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) option(WITH_SW "Compile PaddlePaddle with sw support" OFF) @@ -188,6 +190,13 @@ if (NOT WITH_GPU AND WITH_NCCL) "Disable NCCL when compiling without GPU" FORCE) endif() +if (NOT WITH_XPU AND WITH_XPU_BKCL) + MESSAGE(WARNING + "Disable BKCL when compiling without XPU. 
Force WITH_XPU_BKCL=OFF.") + set(WITH_XPU_BKCL OFF CACHE STRING + "Disable BKCL when compiling without XPU" FORCE) +endif() + if(WITH_NCCL) add_definitions("-DPADDLE_WITH_NCCL") include(nccl) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 6b24354440..bbd065c0a5 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -47,4 +47,18 @@ set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) + +if (WITH_XPU_BKCL) + MESSAGE(STATUS "Compile with XPU BKCL!") + ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL) + + SET(XPU_BKCL_LIB_NAME "libbkcl.so") + SET(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}") + SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") + INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR}) + TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB}) +else(WITH_XPU_BKCL) + TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ) +endif(WITH_XPU_BKCL) + ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index 78887f3ac5..bd5c93d8ab 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -43,6 +43,19 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, "number of local scopes is %d.", places_.size(), local_scopes_.size())); } +#elif defined(PADDLE_WITH_XPU_BKCL) +AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const platform::BKCLCommunicator *ctxs) + : BKCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) { + PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(), + platform::errors::InvalidArgument( + "The number of places and the number of local scopes " + "should be equal, but got number of places is %d and " + "number of local scopes is %d.", + places_.size(), local_scopes_.size())); +} #else AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, @@ -98,6 +111,9 @@ void AllReduceOpHandle::AllReduceImpl( places.reserve(num_places); int64_t numel = -1; bool is_gpu_place = false; +#if defined(PADDLE_WITH_XPU_BKCL) + bool is_xpu_place = false; +#endif auto dtype = static_cast(0); for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { auto &local_scope = local_exec_scopes_[i]; @@ -117,6 +133,9 @@ void AllReduceOpHandle::AllReduceImpl( in_var_handles[i]->name(), numel)); dtype = lod_tensor.type(); is_gpu_place = platform::is_gpu_place(lod_tensor.place()); +#if defined(PADDLE_WITH_XPU_BKCL) + is_xpu_place = platform::is_xpu_place(lod_tensor.place()); +#endif } PADDLE_ENFORCE_EQ( numel, static_cast(lod_tensor.numel()), @@ -128,6 +147,12 @@ void AllReduceOpHandle::AllReduceImpl( platform::errors::PreconditionNotMet( "The dtype of tensors of the same variable in different local " "scopes should be equal.")); +#if defined(PADDLE_WITH_XPU_BKCL) + PADDLE_ENFORCE_EQ(is_xpu_place, platform::is_xpu_place(lod_tensor.place()), + platform::errors::PreconditionNotMet( + "The place type of tensors of the same variable " + "in different local scopes should be equal.")); +#endif PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()), platform::errors::PreconditionNotMet( "The place type of tensors of the same variable " @@ -179,6 +204,25 @@ void AllReduceOpHandle::AllReduceFunc( 
#else PADDLE_THROW( platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } else if (is_xpu_place(places[0])) { +#if defined(PADDLE_WITH_XPU_BKCL) + PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_, + platform::errors::InvalidArgument( + "The bkcl context should not be NULL.")); + BKCLDataType bkcl_dtype = platform::ToBKCLDataType(dtype); + std::vector> all_reduce_calls; + for (size_t i = 0; i < local_exec_scopes_.size(); ++i) { + auto &p = places[i]; + void *buffer = const_cast(lod_tensor_data.at(i)); + all_reduce_calls.emplace_back([=] { + BKCLAllReduce(p, buffer, buffer, numel, bkcl_dtype, BKCL_ADD); + }); + } + BKCLAllReduceFunc(all_reduce_calls); +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with BKCL.")); #endif } else { // Special handle CPU only Operator's gradient. Like CRF auto &trg = *local_exec_scopes_[0] @@ -205,6 +249,27 @@ void AllReduceOpHandle::AllReduceFunc( VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype); } +#if defined(PADDLE_WITH_XPU_BKCL) +void AllReduceOpHandle::BKCLAllReduceFunc( + const std::vector> &all_reduce_calls) { + this->RunAndRecordEvent([&] { + if (all_reduce_calls.size() == 1UL) { + all_reduce_calls[0](); + } else { + PADDLE_ENFORCE_EQ( + bkcl_group_start(), BKCL_SUCCESS, + platform::errors::PreconditionNotMet("bkcl_group_start failed")); + for (auto &call : all_reduce_calls) { + call(); + } + PADDLE_ENFORCE_EQ( + bkcl_group_end(), BKCL_SUCCESS, + platform::errors::PreconditionNotMet("bkcl_group_end failed")); + } + }); +} +#endif + #if defined(PADDLE_WITH_NCCL) void AllReduceOpHandle::NCCLAllReduceFunc( const std::vector> &all_reduce_calls) { diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index e0064ec264..fa260dea09 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -34,6 +34,9 @@ class NCCLCommunicator; #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" +#elif defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/framework/details/bkcl_op_handle.h" +#include "paddle/fluid/platform/bkcl_helper.h" #endif namespace paddle { @@ -46,6 +49,12 @@ class AllReduceOpHandle : public NCCLOpHandleBase { AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLCommunicator *ctxs); +#elif defined(PADDLE_WITH_XPU_BKCL) +class AllReduceOpHandle : public BKCLOpHandleBase { + public: + AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const platform::BKCLCommunicator *ctxs); #else class AllReduceOpHandle : public OpHandleBase { public: @@ -65,8 +74,8 @@ class AllReduceOpHandle : public OpHandleBase { std::vector local_scopes_; -#ifndef PADDLE_WITH_NCCL - // NCCLOpHandleBase already have these attributes. +#if !(PADDLE_WITH_NCCL || PADDLE_WITH_XPU_BKCL) + // NCCLOpHandleBase and BKCLOpHandleBase already have these attributes. // Will polish it by class inheritance framework. 
std::vector places_; #endif @@ -78,6 +87,11 @@ class AllReduceOpHandle : public OpHandleBase { void SyncNCCLAllReduce(); #endif +#if defined(PADDLE_WITH_XPU_BKCL) + void BKCLAllReduceFunc( + const std::vector> &all_reduce_calls); +#endif + void AllReduceImpl(const std::vector &in_var_handles, const std::vector &out_var_handles); diff --git a/paddle/fluid/framework/details/bkcl_op_handle.h b/paddle/fluid/framework/details/bkcl_op_handle.h new file mode 100644 index 0000000000..fe63153a30 --- /dev/null +++ b/paddle/fluid/framework/details/bkcl_op_handle.h @@ -0,0 +1,131 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "xpu/bkcl.h" + +#include +#include +#include + +#include "paddle/fluid/framework/details/op_handle_base.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/bkcl_helper.h" + +DECLARE_bool(sync_bkcl_allreduce); + +namespace paddle { +namespace framework { +namespace details { + +class BKCLOpHandleBase : public OpHandleBase { + public: + BKCLOpHandleBase(ir::Node* node, const std::vector& places, + const platform::BKCLCommunicator* bkcl_ctxs) + : OpHandleBase(node), places_(places), bkcl_ctxs_(bkcl_ctxs) { + if (bkcl_ctxs == nullptr) { + return; + } + // init device context + auto default_bkcl_ctxs = bkcl_ctxs_->DefaultFlatCtx(); + for (auto& p : places_) { + this->SetDeviceContext(p, default_bkcl_ctxs->DevCtx(p)); + } + } + + virtual ~BKCLOpHandleBase() {} + + void SetRunEnv(int run_order, bool use_hierarchical_allreduce) { + PADDLE_ENFORCE_GE( + run_order, 0, + platform::errors::InvalidArgument( + "The argument run_order must be >= 0, but got %d.", run_order)); + PADDLE_ENFORCE_NE(use_hierarchical_allreduce, true, + platform::errors::Unimplemented( + "xpu doesn't support hierarchical_allreduce")); + + run_order_ = run_order; + use_hierarchical_allreduce_ = use_hierarchical_allreduce; + + VLOG(10) << "SetRunEnv " + << " run_order:" << run_order + << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce; + + if (bkcl_ctxs_ == nullptr) { + return; + } + + if (!use_hierarchical_allreduce_) { + auto ctxs = bkcl_ctxs_->GetFlatCtx(run_order); + for (auto& p : places_) { + this->SetDeviceContext(p, ctxs->DevCtx(p)); + } + return; + } + } + + void FlatBKCLAllReduce(platform::Place place, const void* sendbuff, + void* recvbuff, size_t count, BKCLDataType datatype, + BKCLOp op) { + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); + auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_); + int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id); + auto comm = bkcl_ctx.comm_; + + VLOG(10) << "before all reduce buffer:" << sendbuff << ", numel:" << count + << ", dev_id:" << dev_id << ", dtype:" << datatype + << ", place:" << place; + + PADDLE_ENFORCE_EQ( + 
bkcl_all_reduce(comm, sendbuff, recvbuff, count, datatype, op, NULL), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("bckl all reduce failed")); + } + + void BKCLAllReduce(platform::Place place, const void* sendbuff, + void* recvbuff, size_t count, BKCLDataType datatype, + BKCLOp op) { + PADDLE_ENFORCE_GE( + run_order_, 0, + platform::errors::InvalidArgument( + "The argument run_order_ must be >= 0, but got %d.", run_order_)); + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "xpu doesn't support hierarchical all reduce")); + if (!use_hierarchical_allreduce_) { + FlatBKCLAllReduce(place, sendbuff, recvbuff, count, datatype, op); + return; + } + } + + protected: + std::vector places_; + const platform::BKCLCommunicator* bkcl_ctxs_{nullptr}; + // When multi trainer call collective function, they need run the same order. + // Or the program will hang.So we use allreduce_deps_pass to set this + // run_order_. + int run_order_{0}; + // Use 2d allreduce or not. + bool use_hierarchical_allreduce_{false}; +}; + +} // namespace details +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 35b1066067..34d800994f 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -80,7 +80,7 @@ void BroadcastOpHandle::BroadcastOneVar( &VariableVisitor::GetMutableTensor(out_var)); }); } - } else { + } else if (platform::is_gpu_place(in_tensor.place())) { #if defined(PADDLE_WITH_NCCL) VarHandle *out_handle = nullptr; int root_id = @@ -141,6 +141,72 @@ void BroadcastOpHandle::BroadcastOneVar( #else PADDLE_THROW( platform::errors::PreconditionNotMet("Not compiled with NCLL.")); +#endif + } else { +#if defined(PADDLE_WITH_XPU_BKCL) + VarHandle *out_handle = nullptr; + int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device; + std::vector> broadcast_calls; + + int type = platform::ToBKCLDataType(in_tensor.type()); + size_t numel = static_cast(in_tensor.numel()); + + for (auto out_var_handle : out_var_handles) { + Variable *out_var = var_scopes.at(out_var_handle->scope_idx()) + ->FindVar(out_var_handle->name()); + + int dst_id = + BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device; + + auto &bkcl_ctx = bkcl_ctxs_->at(dst_id); + + void *send_recv_buffer = nullptr; + if (root_id == dst_id) { + send_recv_buffer = const_cast(in_tensor.data()); + out_handle = out_var_handle; + } else { + send_recv_buffer = VariableVisitor::GetMutableTensor(out_var) + .Resize(in_tensor.dims()) + .mutable_data(out_var_handle->place()); + } + + broadcast_calls.emplace_back([send_recv_buffer, numel, type, root_id, + &bkcl_ctx] { + PADDLE_ENFORCE_EQ( + bkcl_broadcast(bkcl_ctx.comm(), send_recv_buffer, send_recv_buffer, + numel, static_cast(type), root_id, + nullptr), + BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_broadcast failed")); + }); + } + + WaitInputVarGenerated(); + this->RunAndRecordEvent([&] { + { + PADDLE_ENFORCE_EQ( + bkcl_group_start(), BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_group_start failed")); + for (auto &call : broadcast_calls) { + call(); + } + PADDLE_ENFORCE_EQ( + bkcl_group_end(), BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_group_end failed")); + } + + if (!out_handle->IsTheSameVar(in_var_handle)) { + auto out_var = var_scopes.at(in_var_handle.scope_idx()) + ->FindVar(out_var_handles[0]->name()); + 
paddle::framework::TensorCopy( + in_tensor, in_var_handle.place(), + *(dev_ctxes_.at(in_var_handle.place())), + &VariableVisitor::GetMutableTensor(out_var)); + } + }); +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with BKCL.")); #endif } } diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 1412e2cd9d..e15dd18467 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -34,12 +34,19 @@ class Node; } // namespace ir } // namespace framework namespace platform { +#if defined(PADDLE_WITH_NCCL) struct NCCLContextMap; +#endif +#if defined(PADDLE_WITH_XPU_BKCL) +struct BKCLContextMap; +#endif } // namespace platform } // namespace paddle #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" +#elif defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" #endif namespace paddle { @@ -63,11 +70,26 @@ struct BroadcastOpHandle : public OpHandleBase { } } } -#else +#endif +#if defined(PADDLE_WITH_XPU_BKCL) + BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const platform::BKCLContextMap *bkcl_ctxs) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + bkcl_ctxs_(bkcl_ctxs) { + if (bkcl_ctxs_) { + for (auto &p_ctx : bkcl_ctxs_->contexts_) { + this->SetDeviceContext(platform::XPUPlace(p_ctx.first), + p_ctx.second.ctx_.get()); + } + } + } +#endif BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {} -#endif std::string Name() const override; @@ -86,6 +108,8 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector places_; #if defined(PADDLE_WITH_NCCL) const platform::NCCLContextMap *nccl_ctxs_; +#elif defined(PADDLE_WITH_XPU_BKCL) + const platform::BKCLContextMap *bkcl_ctxs_; #endif void InitOutputValue(const VarHandle &in_var_handle, diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.cc b/paddle/fluid/framework/details/broadcast_op_handle_test.cc index 650de5a48d..46814ca5b9 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.cc @@ -18,10 +18,12 @@ namespace paddle { namespace framework { namespace details { +using DeviceType = paddle::platform::DeviceType; + TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) { TestBroadcastOpHandle test_op; size_t input_scope_idx = 0; - test_op.InitCtxOnGpu(false); + test_op.InitCtxOnDevice(p::kCPU); test_op.InitBroadcastOp(input_scope_idx); test_op.TestBroadcastLodTensor(input_scope_idx); } @@ -29,7 +31,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) { TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) { TestBroadcastOpHandle test_op; size_t input_scope_idx = 0; - test_op.InitCtxOnGpu(false); + test_op.InitCtxOnDevice(p::kCPU); test_op.InitBroadcastOp(input_scope_idx); test_op.TestBroadcastSelectedRows(input_scope_idx); } @@ -38,7 +40,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) { TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) { TestBroadcastOpHandle test_op; size_t input_scope_idx = 0; - test_op.InitCtxOnGpu(true); + test_op.InitCtxOnDevice(p::kCUDA); test_op.InitBroadcastOp(input_scope_idx); test_op.TestBroadcastLodTensor(input_scope_idx); } @@ -46,12 +48,22 @@ TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) { 
TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) { TestBroadcastOpHandle test_op; size_t input_scope_idx = 0; - test_op.InitCtxOnGpu(true); + test_op.InitCtxOnDevice(p::kCUDA); test_op.InitBroadcastOp(input_scope_idx); test_op.TestBroadcastSelectedRows(input_scope_idx); } #endif +#if defined(PADDLE_WITH_XPU_BKCL) +TEST(BroadcastTester, TestXPUBroadcastTestLodTensor) { + TestBroadcastOpHandle test_op; + size_t input_scope_idx = 0; + test_op.InitCtxOnDevice(p::kXPU); + test_op.InitBroadcastOp(input_scope_idx); + test_op.TestBroadcastLodTensor(input_scope_idx); +} +#endif + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 4fdc420e1e..af053de4f6 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -33,6 +33,8 @@ struct VarHandle; namespace f = paddle::framework; namespace p = paddle::platform; +using DeviceType = paddle::platform::DeviceType; + // test data amount const f::DDim kDims = {20, 20}; @@ -45,11 +47,15 @@ struct TestBroadcastOpHandle { std::vector vars_; std::vector> nodes_; std::vector place_list_; - bool use_gpu_; + DeviceType use_device_; #if defined(PADDLE_WITH_NCCL) std::unique_ptr nccl_ctxs_; #endif +#if defined(PADDLE_WITH_XPU_BKCL) + std::unique_ptr bkcl_ctxs_; +#endif + void WaitAll() { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); @@ -58,12 +64,36 @@ struct TestBroadcastOpHandle { if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } +#endif +#if defined(PADDLE_WITH_XPU_BKCL) + if (bkcl_ctxs_) { + bkcl_ctxs_->WaitAll(); + } #endif } - void InitCtxOnGpu(bool use_gpu) { - use_gpu_ = use_gpu; - if (use_gpu_) { + void InitCtxOnDevice(DeviceType use_device) { + use_device_ = use_device; + if (use_device_ == p::kXPU) { +#if defined(PADDLE_WITH_XPU_BKCL) + int count = p::GetXPUDeviceCount(); + if (count <= 1) { + LOG(WARNING) << "Cannot test multi-xpu Broadcast, because the XPU " + "device count is " + << count; + exit(0); + } + for (int i = 0; i < count; ++i) { + auto p = p::XPUPlace(i); + place_list_.push_back(p); + ctxs_.emplace_back(new p::XPUDeviceContext(p)); + } + bkcl_ctxs_.reset(new platform::BKCLContextMap(place_list_)); +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with BKCL.")); +#endif + } else if (use_device_ == p::kCUDA) { #if defined(PADDLE_WITH_NCCL) int count = p::GetCUDADeviceCount(); if (count <= 1) { @@ -89,6 +119,9 @@ struct TestBroadcastOpHandle { place_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } +#if defined(PADDLE_WITH_XPU_BKCL) + bkcl_ctxs_.reset(nullptr); +#endif #if defined(PADDLE_WITH_NCCL) nccl_ctxs_.reset(nullptr); #endif @@ -109,22 +142,25 @@ struct TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); - if (use_gpu_) { + if (use_device_ == p::kCUDA) { #if defined(PADDLE_WITH_NCCL) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW( - platform::errors::PreconditionNotMet("Not compiled with NCLL.")); + platform::errors::PreconditionNotMet("Not compiled with NCCL.")); #endif - } else { -#if defined(PADDLE_WITH_NCCL) + } else if (use_device_ == p::kXPU) { +#if defined(PADDLE_WITH_XPU_BKCL) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, - place_list_, nccl_ctxs_.get()); + place_list_, bkcl_ctxs_.get()); 
#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with BKCL.")); +#endif + } else { op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_); -#endif } op_handle_->SetLocalExecScopes(scope_map); @@ -147,7 +183,7 @@ struct TestBroadcastOpHandle { op_handle_->AddInput(dummy_var_handle); for (size_t j = 0; j < place_list_.size(); ++j) { - if (!use_gpu_) { + if (use_device_ != p::kCUDA) { op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get()); } nodes_.emplace_back( @@ -273,7 +309,8 @@ struct TestBroadcastOpHandle { f::LoD lod{{0, 10, 20}}; auto send_vector = InitLoDTensor("input", input_scope_idx, lod); - op_handle_->Run(false); + DeviceType use_device = p::kCPU; + op_handle_->Run(use_device); WaitAll(); for (size_t j = 0; j < place_list_.size(); ++j) { @@ -287,7 +324,8 @@ struct TestBroadcastOpHandle { int height = static_cast(kDims[0] * 2); auto send_vector = InitSelectedRows("input", input_scope_idx, rows, height); - op_handle_->Run(false); + DeviceType use_device = p::kCPU; + op_handle_->Run(use_device); WaitAll(); for (size_t j = 0; j < place_list_.size(); ++j) { diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 678946fbc5..c045dae471 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -313,10 +313,13 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, const std::vector &local_scopes, const size_t &nranks, #if defined(PADDLE_WITH_NCCL) - const bool use_cuda, + DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const { +#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) + DeviceType use_device, + platform::BKCLCommunicator *bkcl_ctxs) const { #else - const bool use_cuda) const { + DeviceType use_device) const { #endif VLOG(1) << "apply all passes"; // Create a default one if not finalized by user. @@ -336,9 +339,16 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Set(kNRanks, new size_t(nranks)); #if defined(PADDLE_WITH_NCCL) - platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + platform::NCCLCommunicator *nctx = + (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); pass->SetNotOwned(kNCCLCtxs, nctx); +#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) + // ToDo: more check + platform::BKCLCommunicator *bkcl_ctx = + (use_device == p::kXPU) ? bkcl_ctxs : nullptr; + pass->Erase(kBKCLCtxs); + pass->SetNotOwned(kBKCLCtxs, bkcl_ctx); #endif } else if (pass->Type() == "fuse_all_reduce_op_pass") { pass->Erase(kNRanks); @@ -349,12 +359,24 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->SetNotOwned>(kLocalScopes, &local_scopes); #if defined(PADDLE_WITH_NCCL) - platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + platform::NCCLCommunicator *nctx = + (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); pass->SetNotOwned(kNCCLCtxs, nctx); pass->Erase(kUseHierarchicalAllReduce); pass->Set(kUseHierarchicalAllReduce, new bool(use_hierarchical_allreduce_)); +#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) + platform::BKCLCommunicator *nctx = + (use_device == p::kXPU) ? 
bkcl_ctxs : nullptr; + pass->Erase(kBKCLCtxs); + pass->SetNotOwned(kBKCLCtxs, nctx); + pass->Erase(kUseHierarchicalAllReduce); + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "xpu doesn't support hierarchical_allreduce")); + pass->Set(kUseHierarchicalAllReduce, + new bool(use_hierarchical_allreduce_)); #endif } else if (pass->Type() == "coalesce_grad_tensor_pass") { pass->Erase(kNRanks); @@ -364,35 +386,47 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, << enable_sequential_execution_; } else if (pass->Type() == "all_reduce_deps_pass") { #if defined(PADDLE_WITH_NCCL) - platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr; + platform::NCCLCommunicator *nctx = + (use_device == p::kCUDA) ? nccl_ctxs : nullptr; pass->Erase(kNCCLCtxs); pass->SetNotOwned(kNCCLCtxs, nctx); pass->Erase(kUseHierarchicalAllReduce); pass->Set(kUseHierarchicalAllReduce, new bool(use_hierarchical_allreduce_)); +#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) + platform::BKCLCommunicator *nctx = + (use_device == p::kXPU) ? bkcl_ctxs : nullptr; + pass->Erase(kBKCLCtxs); + pass->SetNotOwned(kBKCLCtxs, nctx); + pass->Erase(kUseHierarchicalAllReduce); + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "xpu doesn't support hierarchical_allreduce")); + pass->Set(kUseHierarchicalAllReduce, + new bool(use_hierarchical_allreduce_)); #endif VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) << ", num_trainers:" << num_trainers_; } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") { - if (!use_cuda) { + if (use_device != p::kCUDA) { LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on " "GPU, skipped."; continue; } } else if (pass->Type() == "fusion_group_pass") { - pass->Set("use_gpu", new bool(use_cuda)); - if (!use_cuda) { + pass->Set("use_gpu", new bool((use_device == p::kCUDA))); + if (use_device != p::kCUDA) { LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_act_pass") { - if (!use_cuda) { + if (use_device != p::kCUDA) { LOG(WARNING) << "fuse_bn_act_pass is only supported on " "GPU, skipped."; continue; } } else if (pass->Type() == "fuse_bn_add_act_pass") { - if (!use_cuda) { + if (use_device != p::kCUDA) { LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " "GPU, skipped."; continue; @@ -401,7 +435,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, pass->Set("mkldnn_enabled_op_types", new std::unordered_set(mkldnn_enabled_op_types_)); } else if (pass->Type() == "backward_optimizer_op_deps_pass") { - if (!use_cuda) { + if (use_device != p::kCUDA) { VLOG(1) << "backward_optimizer_op_deps_pass is only supported on " "GPU, skipped."; continue; diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index bc275cb8f3..13ee0a1b4f 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -41,11 +41,15 @@ class NCCLCommunicator; #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" +#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" #endif namespace paddle { namespace framework { namespace details { +using DeviceType = paddle::platform::DeviceType; +namespace p = paddle::platform; struct BuildStrategy { // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and @@ -147,6 +151,7 @@ struct 
BuildStrategy { // NCCL config size_t nccl_comm_num_{1}; + size_t bkcl_comm_num_{1}; // The picture is here: // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396 bool use_hierarchical_allreduce_{false}; @@ -181,10 +186,13 @@ struct BuildStrategy { const std::vector &local_scopes, const size_t &nranks, #if defined(PADDLE_WITH_NCCL) - const bool use_cuda, + DeviceType use_device, platform::NCCLCommunicator *nccl_ctxs) const; +#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL) + DeviceType use_device, + platform::BKCLCommunicator *bkcl_ctxs) const; #else - const bool use_cuda) const; + DeviceType use_device) const; #endif // If set true, ParallelExecutor would build the main_program into multiple diff --git a/paddle/fluid/framework/details/execution_strategy.h b/paddle/fluid/framework/details/execution_strategy.h index a6936577c5..7f51de435b 100644 --- a/paddle/fluid/framework/details/execution_strategy.h +++ b/paddle/fluid/framework/details/execution_strategy.h @@ -14,17 +14,19 @@ #pragma once #include // for size_t +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace framework { namespace details { - +using DeviceType = paddle::platform::DeviceType; +namespace p = paddle::platform; struct ExecutionStrategy { enum ExecutorType { kDefault = 0, kExperimental = 1 }; // num_threads indicates the size of thread pool. size_t num_threads_{0}; - bool use_cuda_{true}; + DeviceType use_device_ = p::kCUDA; // Note that allow_op_delay is invalid now. bool allow_op_delay_{false}; // num_iteration_per_drop_scope indicates how many diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc index 18f2332b6e..e13059e36d 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.cc @@ -330,7 +330,7 @@ bool FastThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { try { VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_cuda_); + op->Run(strategy_.use_device_); } VLOG(10) << op << " " << op->Name() << " Done "; return true; diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc index c538811669..4a5cc67ba7 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.cc @@ -37,6 +37,13 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle( const platform::NCCLCommunicator *ctxs) : AllReduceOpHandle(node, local_scopes, places, ctxs), num_of_all_reduce_(num_of_all_reduce) {} +#elif defined(PADDLE_WITH_XPU_BKCL) +FusedAllReduceOpHandle::FusedAllReduceOpHandle( + ir::Node *node, const std::vector &local_scopes, + const std::vector &places, const size_t num_of_all_reduce, + const platform::BKCLCommunicator *ctxs) + : AllReduceOpHandle(node, local_scopes, places, ctxs), + num_of_all_reduce_(num_of_all_reduce) {} #else FusedAllReduceOpHandle::FusedAllReduceOpHandle( ir::Node *node, const std::vector &local_scopes, @@ -73,9 +80,14 @@ void FusedAllReduceOpHandle::RunImpl() { "handles is %d, and the number of output variable handles is %d.", in_var_handles.size(), out_var_handles.size())); - // Note: some gradient op doesn't have CUDAKernel, so the gradients of - // those op are in CPUPlace, in this case, the all reduce should not be fused. 
+// Note: some gradient op doesn't have CUDAKernel, so the gradients of +// those op are in CPUPlace, in this case, the all reduce should not be fused. +#if defined(PADDLE_WITH_XPU_BKCL) + // TODO(liuyuhui): XPU don't support fuse all reduce for now + if (InputIsInDifferentPlace(in_var_handles) || true) { +#else if (InputIsInDifferentPlace(in_var_handles)) { +#endif for (size_t j = 0; j < num_of_all_reduce_; ++j) { std::vector dev_inputs; std::vector dev_outputs; diff --git a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h index 9bed792a42..463460a1ff 100644 --- a/paddle/fluid/framework/details/fused_all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/fused_all_reduce_op_handle.h @@ -36,6 +36,8 @@ class NCCLCommunicator; #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/framework/details/nccl_op_handle.h" #include "paddle/fluid/platform/nccl_helper.h" +#elif defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" #endif namespace paddle { @@ -49,6 +51,13 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle { const std::vector &places, const size_t num_of_all_reduce, const platform::NCCLCommunicator *ctxs); +#elif defined(PADDLE_WITH_XPU_BKCL) +struct FusedAllReduceOpHandle : public AllReduceOpHandle { + FusedAllReduceOpHandle(ir::Node *node, + const std::vector &local_scopes, + const std::vector &places, + const size_t num_of_all_reduce, + const platform::BKCLCommunicator *ctxs); #else struct FusedAllReduceOpHandle : public AllReduceOpHandle { FusedAllReduceOpHandle(ir::Node *node, diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index 8fd3ec56d1..ee45521c21 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -52,11 +52,18 @@ struct FusedBroadcastOpHandle : public BroadcastOpHandle { const std::vector &places, const platform::NCCLContextMap *nccl_ctx) : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {} -#else - FusedBroadcastOpHandle(ir::Node* node, const std::vector local_scopes, - const std::vector& places) - : BroadcastOpHandle(node, local_scopes, places) {} #endif +#if defined(PADDLE_WITH_XPU_BKCL) + FusedBroadcastOpHandle(ir::Node *node, + const std::vector local_scopes, + const std::vector &places, + const platform::BKCLContextMap *bkcl_ctx) + : BroadcastOpHandle(node, local_scopes, places, bkcl_ctx) {} +#endif + FusedBroadcastOpHandle(ir::Node *node, + const std::vector local_scopes, + const std::vector &places) + : BroadcastOpHandle(node, local_scopes, places) {} std::string Name() const override; protected: diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index ce7621d4e3..b19d60ac20 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -32,6 +32,7 @@ namespace framework { namespace details { struct VarHandle; +using DeviceType = paddle::platform::DeviceType; struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { std::vector out_varnames_; @@ -55,7 +56,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { // create op handle node nodes_.emplace_back( ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); - if (use_gpu_) { + if (use_device_ == p::kCUDA) { #if 
defined(PADDLE_WITH_NCCL) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); @@ -63,14 +64,17 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { PADDLE_THROW( platform::errors::PreconditionNotMet("Not compiled with CUDA.")); #endif - } else { -#if defined(PADDLE_WITH_NCCL) + } else if (use_device_ == p::kXPU) { +#if defined(PADDLE_WITH_XPU_BKCL) op_handle_ = new FusedBroadcastOpHandle( - nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); + nodes_.back().get(), local_scopes_, place_list_, bkcl_ctxs_.get()); #else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); +#endif + } else { op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_); -#endif } op_handle_->SetLocalExecScopes(scope_map); @@ -108,7 +112,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar)); } - op_handle_->Run(false); + DeviceType use_device = p::kCPU; + op_handle_->Run(use_device); WaitAll(); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { @@ -131,7 +136,8 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { rows, height, val_scalar)); } - op_handle_->Run(false); + DeviceType use_device = p::kCPU; + op_handle_->Run(use_device); WaitAll(); for (size_t i = 0; i < input_scope_idxes.size(); ++i) { @@ -147,7 +153,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { TEST(FusedBroadcastTester, CPULodTensor) { TestFusedBroadcastOpHandle test_op; std::vector input_scope_idxes = {0, 1}; - test_op.InitCtxOnGpu(false); + test_op.InitCtxOnDevice(p::kCPU); test_op.InitFusedBroadcastOp(input_scope_idxes); test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); } @@ -155,7 +161,7 @@ TEST(FusedBroadcastTester, CPULodTensor) { TEST(FusedBroadcastTester, CPUSelectedRows) { TestFusedBroadcastOpHandle test_op; std::vector input_scope_idxes = {0, 1}; - test_op.InitCtxOnGpu(false); + test_op.InitCtxOnDevice(p::kCPU); test_op.InitFusedBroadcastOp(input_scope_idxes); test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); } @@ -164,7 +170,7 @@ TEST(FusedBroadcastTester, CPUSelectedRows) { TEST(FusedBroadcastTester, GPULodTensor) { TestFusedBroadcastOpHandle test_op; std::vector input_scope_idxes = {0, 1}; - test_op.InitCtxOnGpu(true); + test_op.InitCtxOnDevice(p::kCUDA); test_op.InitFusedBroadcastOp(input_scope_idxes); test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); } @@ -172,12 +178,22 @@ TEST(FusedBroadcastTester, GPULodTensor) { TEST(FusedBroadcastTester, GPUSelectedRows) { TestFusedBroadcastOpHandle test_op; std::vector input_scope_idxes = {0, 1}; - test_op.InitCtxOnGpu(true); + test_op.InitCtxOnDevice(p::kCUDA); test_op.InitFusedBroadcastOp(input_scope_idxes); test_op.TestFusedBroadcastSelectedRows(input_scope_idxes); } #endif +#if defined(PADDLE_WITH_XPU_BKCL) +TEST(FusedBroadcastTester, XPULodTensor) { + TestFusedBroadcastOpHandle test_op; + std::vector input_scope_idxes = {0, 1}; + test_op.InitCtxOnDevice(p::kXPU); + test_op.InitFusedBroadcastOp(input_scope_idxes); + test_op.TestFusedBroadcastLoDTensor(input_scope_idxes); +} +#endif + } // namespace details } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/details/gather_op_handle_test.cc b/paddle/fluid/framework/details/gather_op_handle_test.cc index 60c1d0d39a..c0df833882 100644 --- a/paddle/fluid/framework/details/gather_op_handle_test.cc +++ 
b/paddle/fluid/framework/details/gather_op_handle_test.cc @@ -27,6 +27,8 @@ struct DummyVarHandle; namespace f = paddle::framework; namespace p = paddle::platform; +using DeviceType = paddle::platform::DeviceType; + // test data amount const f::DDim kDims = {20, 20}; @@ -171,7 +173,8 @@ struct TestGatherOpHandle { out_selected_rows->mutable_value()->ShareDataWith( in_selected_rows->value()); - op_handle_->Run(false); + DeviceType use_device = p::kCPU; + op_handle_->Run(use_device); WaitAll(); diff --git a/paddle/fluid/framework/details/multi_devices_helper.h b/paddle/fluid/framework/details/multi_devices_helper.h index c3a18433cf..304e7f0375 100644 --- a/paddle/fluid/framework/details/multi_devices_helper.h +++ b/paddle/fluid/framework/details/multi_devices_helper.h @@ -55,6 +55,7 @@ constexpr char kPlaces[] = "places"; constexpr char kGlobalScope[] = "global_scope"; constexpr char kLocalScopes[] = "local_scopes"; constexpr char kNCCLCtxs[] = "nccl_ctxs"; +constexpr char kBKCLCtxs[] = "bkcl_ctxs"; constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce"; // aux variables to represent dependency. Useful to resolve data hazard. diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 22b7bd17fe..eeff0f3d46 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -82,20 +82,74 @@ void OpHandleBase::InitCUDA() { } } } +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CUDA device since it's not compiled with CUDA," + "Please recompile or reinstall Paddle with GPU support.")); +#endif +} + +void OpHandleBase::InitXPU() { +#ifdef PADDLE_WITH_XPU + if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) { + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + // TODO(liuyuhui): XPU now don't support sync events, add later. 
+ } + } + } else { + PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL, + platform::errors::InvalidArgument( + "%s should have only one dev_ctx.", Name())); + auto &place = dev_ctxes_.begin()->first; + int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device; + PADDLE_ENFORCE_EQ( + xpu_set_device(dev_id), XPU_SUCCESS, + platform::errors::PreconditionNotMet("xpu_set_device failed")); + for (auto &out_var : outputs_) { + auto *out_var_handle = dynamic_cast(out_var); + if (out_var_handle) { + PADDLE_ENFORCE_EQ( + platform::is_same_place(place, out_var_handle->place()), true, + platform::errors::InvalidArgument( + "The place of output(%s) is not consistent with the " + "place of current op(%s).", + out_var_handle->Name(), Name())); + } + } + } +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); #endif } -void OpHandleBase::Run(bool use_cuda) { +void OpHandleBase::Run(DeviceType use_device) { #ifdef PADDLE_WITH_CUDA - if (events_.empty() && use_cuda && dev_ctxes_.size() > 0) { + if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) { InitCUDA(); } #else - PADDLE_ENFORCE_EQ(use_cuda, false, - platform::errors::InvalidArgument( - "Argument use_cuda should be false when Paddle is not " - "compiled with CUDA.")); + PADDLE_ENFORCE_NE( + use_device, p::kCUDA, + platform::errors::InvalidArgument( + "Argument use_device should not be kCUDA when Paddle is not " + "compiled with CUDA.")); +#endif + + if (use_device == p::kXPU && dev_ctxes_.size() > 0) { +#ifdef PADDLE_WITH_XPU + InitXPU(); +#else + PADDLE_ENFORCE_NE( + use_device, p::kXPU, + platform::errors::InvalidArgument( + "Argument use_device should not be kXPU when Paddle is not " + "compiled with XPU.")); #endif + } // skip running current op, used with inplace_addto_op_pass if (skip_running_) { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index 37e18adf9d..ced3927f1f 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -19,6 +19,7 @@ #include #include +#include "paddle/fluid/framework/details/execution_strategy.h" #include "paddle/fluid/framework/details/var_handle.h" #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/platform/device_context.h" @@ -42,7 +43,8 @@ class Node; } // namespace ir namespace details { - +using DeviceType = paddle::platform::DeviceType; +namespace p = paddle::platform; // Wraps ir::Node and provide helper utilities. // It's responsible for populating necessary fields of ir::Node. 
class OpHandleBase { @@ -71,7 +73,7 @@ class OpHandleBase { virtual std::string Name() const = 0; - void Run(bool use_cuda); + void Run(DeviceType use_device); virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx); @@ -144,6 +146,7 @@ class OpHandleBase { virtual void RunImpl() = 0; virtual void InitCUDA(); + virtual void InitXPU(); ir::Node *node_; std::vector inputs_; diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index b43d4b526b..5f1f27b8d5 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -212,10 +212,64 @@ void ReduceOpHandle::RunImpl() { #else PADDLE_THROW( platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } else if (paddle::platform::is_xpu_place(lod_tensors[0]->place())) { +#if defined(PADDLE_WITH_XPU_BKCL) + auto pre_in = pre_in_var->Get(); + VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); + VariableVisitor::GetMutableTensor(out_var).mutable_data( + out_var_handle->place(), pre_in.type()); + + auto out_p = out_var_handle->place(); + int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device; + std::vector> all_reduce_calls; + for (size_t i = 0; i < var_scopes.size(); ++i) { + auto &p = in_places[i]; + auto &lod_tensor = *lod_tensors[i]; + + int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device; + auto &bkcl_ctx = bkcl_ctxs_->at(dev_id); + + void *buffer = const_cast(lod_tensor.data()); + void *recvbuffer = nullptr; + if (root_id == dev_id) { + recvbuffer = + out_var->GetMutable()->mutable_data( + out_var_handle->place()); + } + + int type = platform::ToBKCLDataType(lod_tensor.type()); + size_t numel = static_cast(lod_tensor.numel()); + all_reduce_calls.emplace_back([buffer, recvbuffer, type, numel, root_id, + &bkcl_ctx] { + PADDLE_ENFORCE_EQ(bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer, + numel, static_cast(type), + BKCL_ADD, root_id, nullptr), + BKCL_SUCCESS, platform::errors::Unavailable( + "bkcl_all_reduce failed")); + }); + } + + WaitInputVarGenerated(); + this->RunAndRecordEvent([&] { + PADDLE_ENFORCE_EQ( + bkcl_group_start(), BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_group_start failed")); + for (auto &call : all_reduce_calls) { + call(); + } + PADDLE_ENFORCE_EQ( + bkcl_group_end(), BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_group_end failed")); + }); +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); #endif } else { PADDLE_THROW(platform::errors::InvalidArgument( - "The place of tensor should be CPUPlace or CUDAPlace, but got %s.", + "The place of tensor should be CPUPlace, CUDAPlace or XPUPlace, but " + "got %s.", lod_tensors[0]->place())); } } diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index e76a48d207..b2b4196805 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -41,6 +41,8 @@ struct NCCLContextMap; } // namespace paddle #if defined(PADDLE_WITH_NCCL) #include "paddle/fluid/platform/nccl_helper.h" +#elif defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" #endif namespace paddle { @@ -93,6 +95,22 @@ struct ReduceOpHandle : public OpHandleBase { } } } +#elif defined(PADDLE_WITH_XPU_BKCL) + const platform::BKCLContextMap *bkcl_ctxs_; + ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, + const std::vector &places, + const 
platform::BKCLContextMap *bkcl_ctxs) + : OpHandleBase(node), + local_scopes_(local_scopes), + places_(places), + bkcl_ctxs_(bkcl_ctxs) { + if (bkcl_ctxs_) { + for (auto &p_ctx : bkcl_ctxs_->contexts_) { + this->SetDeviceContext(platform::XPUPlace(p_ctx.first), + p_ctx.second.ctx_.get()); + } + } + } #else ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places) diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index ba03c3a267..0ae53b35a4 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -25,6 +25,8 @@ namespace details { namespace f = paddle::framework; namespace p = paddle::platform; +using DeviceType = paddle::platform::DeviceType; + // test data amount const f::DDim kDims = {20, 20}; @@ -196,7 +198,8 @@ struct TestReduceOpHandle { out_selected_rows->mutable_value()->ShareDataWith( in_selected_rows->value()); - op_handle_->Run(false); + DeviceType use_device = p::kCPU; + op_handle_->Run(use_device); WaitAll(); @@ -260,7 +263,8 @@ struct TestReduceOpHandle { out_lodtensor->ShareDataWith(in_lodtensor); - op_handle_->Run(false); + DeviceType use_device = p::kCPU; + op_handle_->Run(use_device); WaitAll(); diff --git a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc index 287667d5ee..aa32a248e7 100644 --- a/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc +++ b/paddle/fluid/framework/details/scale_loss_grad_op_handle.cc @@ -58,6 +58,17 @@ struct ScaleLossGradFunctor { auto *out_data = out_->mutable_data(place_); if (platform::is_cpu_place(place_)) { *out_data = static_cast(coeff_); + } else if (platform::is_xpu_place(place_)) { +#if defined(PADDLE_WITH_XPU) + OutT cast_coeff = static_cast(coeff_); + memory::Copy(BOOST_GET_CONST(platform::XPUPlace, place_), out_data, + platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_)); + VLOG(10) << place_ << "RUN Scale loss grad op"; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); +#endif } else { #ifdef PADDLE_WITH_CUDA OutT cast_coeff = static_cast(coeff_); @@ -66,7 +77,10 @@ struct ScaleLossGradFunctor { platform::CPUPlace(), &cast_coeff, SizeOfType(out_dtype_), stream); VLOG(10) << place_ << "RUN Scale loss grad op"; - +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CUDA device since it's not compiled with CUDA," + "Please recompile or reinstall Paddle with GPU support.")); #endif } } diff --git a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc index 139a033a81..00201bd442 100644 --- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc @@ -348,7 +348,7 @@ bool ThreadedSSAGraphExecutor::RunOpSync(OpHandleBase *op) { try { VLOG(10) << op << " " << op->Name() << " : " << op->DebugString(); if (LIKELY(!strategy_.dry_run_)) { - op->Run(strategy_.use_cuda_); + op->Run(strategy_.use_device_); } VLOG(10) << op << " " << op->Name() << " Done "; return true; diff --git a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc index 
9427480852..a29b07fbe9 100644 --- a/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc +++ b/paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc @@ -30,6 +30,7 @@ DECLARE_double(eager_delete_tensor_gb); namespace paddle { namespace framework { +namespace p = paddle::platform; static std::vector CreatePlaces(size_t num, bool use_cuda) { std::vector result; @@ -88,7 +89,7 @@ class ReferenceCountPassTestHelper { FLAGS_eager_delete_tensor_gb = -1; details::ExecutionStrategy exec_strategy; - exec_strategy.use_cuda_ = use_cuda; + exec_strategy.use_device_ = use_cuda ? p::kCUDA : p::kCPU; executor_.reset(new ParallelExecutor(CreatePlaces(1, use_cuda), {}, "", &scope_, {}, exec_strategy, diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc index 81c98ecf0c..b0ab6d23af 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc @@ -41,6 +41,9 @@ class FuseAllReduceOpPass : public ir::Pass { #if defined(PADDLE_WITH_NCCL) auto *multi_nccl_ctxs = &Get(details::kNCCLCtxs); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto *multi_bkcl_ctxs = + &Get(details::kBKCLCtxs); #endif ir::Graph &result = *graph; @@ -92,6 +95,9 @@ class FuseAllReduceOpPass : public ir::Pass { #if defined(PADDLE_WITH_NCCL) InsertFusedAllReduce(places, local_scopes, group_size, group_all_reduce_ops, multi_nccl_ctxs, &result); +#elif defined(PADDLE_WITH_XPU_BKCL) + InsertFusedAllReduce(places, local_scopes, group_size, + group_all_reduce_ops, multi_bkcl_ctxs, &result); #else InsertFusedAllReduce(places, local_scopes, group_size, group_all_reduce_ops, &result); @@ -154,6 +160,8 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &all_reduce_ops, #if defined(PADDLE_WITH_NCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, +#elif defined(PADDLE_WITH_XPU_BKCL) + const platform::BKCLCommunicator *multi_bkcl_ctxs, #endif ir::Graph *result) const { std::vector inputs; @@ -182,6 +190,9 @@ class FuseAllReduceOpPass : public ir::Pass { #if defined(PADDLE_WITH_NCCL) CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, local_scopes, multi_nccl_ctxs, result); +#elif defined(PADDLE_WITH_XPU_BKCL) + CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, + local_scopes, multi_bkcl_ctxs, result); #else CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places, local_scopes, result); @@ -197,12 +208,18 @@ class FuseAllReduceOpPass : public ir::Pass { const std::vector &local_scopes, #if defined(PADDLE_WITH_NCCL) const platform::NCCLCommunicator *multi_nccl_ctxs, +#elif defined(PADDLE_WITH_XPU_BKCL) + const platform::BKCLCommunicator *multi_bkcl_ctxs, #endif ir::Graph *result) const { #if defined(PADDLE_WITH_NCCL) auto *op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), local_scopes, places, num_of_all_reduce, multi_nccl_ctxs); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto *op_handle = new details::FusedAllReduceOpHandle( + result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), + local_scopes, places, num_of_all_reduce, multi_bkcl_ctxs); #else auto *op_handle = new details::FusedAllReduceOpHandle( result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation), @@ -221,6 +238,10 @@ class 
FuseAllReduceOpPass : public ir::Pass { if (!multi_nccl_ctxs) { SetCommunicationContext(places, op_handle); } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (!multi_bkcl_ctxs) { + SetCommunicationContext(places, op_handle); + } #else SetCommunicationContext(places, op_handle); #endif diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index fd82d6b10e..6fe1fcdada 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -162,6 +162,12 @@ void MultiDevSSAGraphBuilderBase::Init() const { if (multi_nccl_ctxs_) { nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx(); } +#elif defined(PADDLE_WITH_XPU_BKCL) + multi_bkcl_ctxs_ = &Get(details::kBKCLCtxs); + bkcl_ctxs_ = nullptr; + if (multi_bkcl_ctxs_) { + bkcl_ctxs_ = multi_bkcl_ctxs_->DefaultFlatCtx(); + } #endif PADDLE_ENFORCE_EQ( places_.size(), local_scopes_.size(), @@ -371,6 +377,11 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext( op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (bkcl_ctxs_ == nullptr) { + op_handle->SetDeviceContext(p, + platform::DeviceContextPool::Instance().Get(p)); + } #else op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -384,6 +395,10 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result, auto *op_handle = new details::BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto *op_handle = new details::BroadcastOpHandle( + result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), + local_scopes_, places_, bkcl_ctxs_); #else auto *op_handle = new details::BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), @@ -417,6 +432,10 @@ void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp( auto *op_handle = new details::FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); +#elif defined(PADDLE_WITH_XPU_BKCL) + auto *op_handle = new details::FusedBroadcastOpHandle( + result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), + local_scopes_, places_, bkcl_ctxs_); #else auto *op_handle = new details::FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), @@ -487,6 +506,11 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result, new details::AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), scopes, places, multi_nccl_ctxs_)); +#elif defined(PADDLE_WITH_XPU_BKCL) + result->Get(kGraphOps).emplace_back( + new details::AllReduceOpHandle( + result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), + scopes, places, multi_bkcl_ctxs_)); #else result->Get(kGraphOps).emplace_back( new details::AllReduceOpHandle( @@ -565,6 +589,10 @@ details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp( result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); +#elif defined(PADDLE_WITH_XPU_BKCL) + result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( + result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), + 
local_scopes_, places_, bkcl_ctxs_)); #else result->Get(kGraphOps).emplace_back(new details::ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h index bb3586ba80..97d3a40874 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h @@ -39,8 +39,13 @@ class Graph; namespace paddle { namespace platform { +#if defined(PADDLE_WITH_NCCL) class NCCLContextMap; class NCCLCommunicator; +#elif defined(PADDLE_WITH_XPU_BKCL) +class BKCLContextMap; +class BKCLCommunicator; +#endif } namespace framework { @@ -114,6 +119,9 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass { #if defined(PADDLE_WITH_NCCL) mutable platform::NCCLContextMap *nccl_ctxs_{nullptr}; mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr}; +#elif defined(PADDLE_WITH_XPU_BKCL) + mutable platform::BKCLContextMap *bkcl_ctxs_{nullptr}; + mutable platform::BKCLCommunicator *multi_bkcl_ctxs_{nullptr}; #endif mutable std::string loss_var_name_; diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index d9ddf49f46..947a3c9455 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -93,6 +93,8 @@ class ParallelExecutorPrivate { } } + bool IsUseCUDA(DeviceType use_device); + void SetHasFeed(size_t dev_idx, bool has_feed = true); bool AllowPartialFeed() const; @@ -268,6 +270,90 @@ class ParallelExecutorPrivate { } #endif +#if defined(PADDLE_WITH_XPU_BKCL) + void InitBKCLCtxs(framework::Scope *scope, const BuildStrategy &bst) { + VLOG(1) << "bkcl comm num:" << bst.bkcl_comm_num_ << ", nranks:" << nranks_ + << ", num_trainers:" << bst.num_trainers_ + << ", trainer_id:" << bst.trainer_id_; + + PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, false, + platform::errors::Unimplemented( + "xpu doesn't support use_hierarchical_allreduce")); + + std::vector flat_bkcl_ids; + if (nranks_ == 1) { + // FIXME(gongwb): need not to create bkclid when nranks==1 + bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, + bst.trainer_id_); + return; + } + + if (bst.enable_parallel_graph_) { + VLOG(1) << "use only one bkclid in pg model"; + + BKCLUniqueId *bkcl_id = nullptr; + + std::string var_name = platform::GetFlatBKCLVarName(0); + auto bkcl_id_var = scope->FindVar(var_name); + std::unique_ptr id(new BKCLUniqueId()); + if (bkcl_id_var) { + bkcl_id = bkcl_id_var->GetMutable(); + } else { + PADDLE_ENFORCE_EQ( + bkcl_get_unique_id(id.get()), BKCL_SUCCESS, + platform::errors::Unavailable("bkcl get unique id failed")); + bkcl_id = id.get(); + } + + flat_bkcl_ids.push_back(bkcl_id); + + bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, + bst.trainer_id_); + VLOG(1) << "init bst bkcl context complete!"; + return; + } + + // num_trainers ==1 && places > 1 + if (bst.num_trainers_ == 1) { + bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, + bst.trainer_id_); + return; + } + + for (int i = 0; i < static_cast(bst.bkcl_comm_num_); i++) { + std::string var_name = platform::GetFlatBKCLVarName(i); + auto bkcl_id_var = scope->FindVar(var_name); + PADDLE_ENFORCE_NOT_NULL( + bkcl_id_var, + platform::errors::NotFound("can't find %s bkcl_id_var", var_name)); + auto bkcl_id = bkcl_id_var->GetMutable(); + 
flat_bkcl_ids.push_back(bkcl_id); + } + + bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_, + bst.trainer_id_); + } + + void InitOrGetBKCLCommunicator(framework::Scope *scope, + const BuildStrategy &bst) { + const std::string var_name = "BKCLCommunicator"; + auto var = scope->FindVar(var_name); + if (var != nullptr) { + PADDLE_ENFORCE_EQ(var->IsInitialized(), true, + platform::errors::PreconditionNotMet( + "if %s exists, it must be initialized", var_name)); + VLOG(1) << "find " << var_name + << " in scope, so use it and does not recreate!"; + bkcl_ctxs_ = var->GetMutable(); + return; + } + + VLOG(1) << "not find " << var_name << " in scope, so recreate it!"; + bkcl_ctxs_ = scope->Var(var_name)->GetMutable(); + InitBKCLCtxs(scope, bst); + } +#endif + inline bool IsPersistable(const std::string &name) const { auto iter = is_persistable_.find(name); return iter != is_persistable_.end() && iter->second; @@ -284,9 +370,11 @@ class ParallelExecutorPrivate { #if defined(PADDLE_WITH_NCCL) platform::NCCLCommunicator *nccl_ctxs_{nullptr}; +#elif defined(PADDLE_WITH_XPU_BKCL) + platform::BKCLCommunicator *bkcl_ctxs_{nullptr}; #endif bool own_local_scope_; - bool use_cuda_; + DeviceType use_device_; bool use_all_reduce_; size_t nranks_; @@ -296,6 +384,10 @@ class ParallelExecutorPrivate { details::ParallelSSAGraphExecutor *inference_executor_{nullptr}; }; +bool ParallelExecutorPrivate::IsUseCUDA(DeviceType use_device) { + return use_device == p::kCUDA; +} + void ParallelExecutorPrivate::SetHasFeed(size_t dev_idx, bool has_feed) { if (inference_executor_) { inference_executor_->SetHasFeed(dev_idx, has_feed); @@ -340,7 +432,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass"); addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - addto_pass->SetNotOwned(ir::kUseCuda, &use_cuda_); + addto_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA)); VLOG(10) << "Start to apply inplace_addto_op_pass"; graph = addto_pass->Apply(graph); VLOG(10) << "inplace_addto_op_pass Applied"; @@ -351,7 +443,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass"); inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_); inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - inplace_pass->SetNotOwned(ir::kUseCuda, &use_cuda_); + inplace_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA)); VLOG(10) << "Start to apply buffer_shared_inplace_pass"; graph = inplace_pass->Apply(graph); VLOG(10) << "buffer_shared_inplace_pass Applied"; @@ -366,7 +458,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { &mem_opt_var_infos_); cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars); - cross_op_memory_reuse_pass->SetNotOwned(ir::kUseCuda, &use_cuda_); + cross_op_memory_reuse_pass->Set(ir::kUseCuda, + new bool(use_device_ == p::kCUDA)); VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass"; graph = cross_op_memory_reuse_pass->Apply(graph); VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied"; @@ -386,8 +479,8 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { continue; } std::unique_ptr gc; -#ifdef PADDLE_WITH_CUDA if (platform::is_gpu_place(place)) { +#ifdef 
PADDLE_WITH_CUDA if (IsFastEagerDeletionModeEnabled()) { gc.reset(new UnsafeFastGPUGarbageCollector( BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); @@ -396,20 +489,29 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) { BOOST_GET_CONST(platform::CUDAPlace, place), max_memory_size)); } VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; - } else { +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CUDA device since it's not compiled with CUDA," + "Please recompile or reinstall Paddle with GPU support.")); #endif - if (platform::is_cpu_place(place)) { - gc.reset(new CPUGarbageCollector( - BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size)); - VLOG(10) << "Created GarbageCollector at " << place; - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Unsupported place for garbage collection")); - } -#ifdef PADDLE_WITH_CUDA - } + } else if (platform::is_xpu_place(place)) { +#if defined(PADDLE_WITH_XPU) + gc.reset(new XPUGarbageCollector( + BOOST_GET_CONST(platform::XPUPlace, place), max_memory_size)); + VLOG(10) << "Created " << i << "-th GarbageCollector at " << place; +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); #endif - + } else if (platform::is_cpu_place(place)) { + gc.reset(new CPUGarbageCollector( + BOOST_GET_CONST(platform::CPUPlace, place), max_memory_size)); + VLOG(10) << "Created GarbageCollector at " << place; + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Unsupported place for garbage collection")); + } gcs_.emplace(place, std::move(gc)); } @@ -510,13 +612,10 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, const BuildStrategy &build_strategy, ir::Graph *graph) : member_(new ParallelExecutorPrivate(places, scope)) { - PADDLE_ENFORCE(places.size() > 0 && !is_xpu_place(places[0]), - platform::errors::Unavailable( - "XPU is not supported in ParallelExecutor")); InitP2P(places); ir::InitReaderQueueDeviceCount(graph, *(member_->global_scope_), member_->places_.size()); - member_->use_cuda_ = exec_strategy.use_cuda_; + member_->use_device_ = exec_strategy.use_device_; member_->build_strategy_ = build_strategy; member_->use_all_reduce_ = member_->build_strategy_.reduce_ == BuildStrategy::ReduceStrategy::kAllReduce; @@ -529,7 +628,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, member_->use_all_reduce_ = true; } #if defined(PADDLE_WITH_CUDA) && defined(_WIN32) - if (member_->use_cuda_) { + if (member_->IsUseCUDA(member_->use_device_)) { PADDLE_ENFORCE_EQ( places.size(), 1, platform::errors::Unavailable("Windows can support Single GPU only.")); @@ -537,19 +636,30 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, #endif #if defined(PADDLE_WITH_CUDA) && !defined(PADDLE_WITH_NCCL) - PADDLE_ENFORCE_EQ( - places.size(), 1, - platform::errors::PermissionDenied( - "Your machine has multiple cards, " - "but the WITH_NCCL option is not turned on during compilation, " - "and you cannot use multi-card training or prediction. " - "Please recompile and turn on the WITH_NCCL option.")); + if (member_->IsUseCUDA(member_->use_device_)) { + PADDLE_ENFORCE_EQ( + places.size(), 1, + platform::errors::PermissionDenied( + "Your machine has multiple cards, " + "but the WITH_NCCL option is not turned on during compilation, " + "and you cannot use multi-card training or prediction. 
" + "Please recompile and turn on the WITH_NCCL option.")); + } #endif + std::string device_name; + if (member_->use_device_ == p::kCPU) { + device_name = "CPU"; + } else if (member_->use_device_ == p::kCUDA) { + device_name = "CUDA"; + } else { + device_name = "XPU"; + } + VLOG(1) << string::Sprintf( "The Program will be executed on %s using ParallelExecutor, %lu " "cards are used, so %lu programs are executed in parallel.", - (member_->use_cuda_ ? "CUDA" : "CPU"), places.size(), places.size()); + device_name, places.size(), places.size()); // Step 1. Bcast the bcast_vars to devs. // Create local scopes @@ -573,7 +683,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, std::vector graphs; if (member_->build_strategy_.async_mode_) { - PADDLE_ENFORCE_EQ(member_->use_cuda_, false, + PADDLE_ENFORCE_EQ(member_->IsUseCUDA(member_->use_device_), false, platform::errors::Unavailable( "gpu mode does not support async_mode_ now!")); graphs.push_back(graph); @@ -596,7 +706,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, << "you can force it off by env FLAGS_enable_parallel_graph=0"; } - if (member_->use_cuda_ && member_->nranks_ > 1) { + if (member_->IsUseCUDA(member_->use_device_) && member_->nranks_ > 1) { #if defined(PADDLE_WITH_NCCL) member_->InitOrGetNCCLCommunicator(scope, &member_->build_strategy_); @@ -616,6 +726,27 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]); dev_ctx->set_nccl_comm(nccl_ctx.comm()); } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with CUDA.")); +#endif + } + if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) { +#if defined(PADDLE_WITH_XPU_BKCL) + member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_); + + auto *bkcl_ctxs = + member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_); + auto &pool = platform::DeviceContextPool::Instance(); + for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) { + auto *dev_ctx = static_cast( + pool.Get(member_->places_[dev_id])); + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]); + dev_ctx->set_bkcl_context(bkcl_ctx.comm()); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with XPU.")); #endif } // broadcast parameters from the 0th device to others: @@ -645,36 +776,55 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, VLOG(3) << "use local async mode"; graph = member_->build_strategy_.Apply( graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_cuda_, + {member_->local_scopes_[0]}, 1, member_->use_device_, member_->nccl_ctxs_); for (size_t i = 1; i < member_->places_.size(); ++i) { graphs[i] = member_->build_strategy_.Apply( graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_cuda_, + {member_->local_scopes_[i]}, 1, member_->use_device_, member_->nccl_ctxs_); async_graphs[i] = graphs[i]; } } else { graph = member_->build_strategy_.Apply( graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_, member_->nccl_ctxs_); + member_->nranks_, member_->use_device_, member_->nccl_ctxs_); + } +#elif defined(PADDLE_WITH_XPU_BKCL) + if (member_->build_strategy_.async_mode_) { + VLOG(3) << "use local async mode"; + graph = member_->build_strategy_.Apply( + graph, {member_->places_[0]}, loss_var_name, + {member_->local_scopes_[0]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + for 
(size_t i = 1; i < member_->places_.size(); ++i) { + graphs[i] = member_->build_strategy_.Apply( + graphs[i], {member_->places_[i]}, loss_var_name, + {member_->local_scopes_[i]}, 1, member_->use_device_, + member_->bkcl_ctxs_); + async_graphs[i] = graphs[i]; + } + } else { + graph = member_->build_strategy_.Apply( + graph, member_->places_, loss_var_name, member_->local_scopes_, + member_->nranks_, member_->use_device_, member_->bkcl_ctxs_); } #else if (member_->build_strategy_.async_mode_) { VLOG(3) << "use local async mode"; graph = member_->build_strategy_.Apply( graph, {member_->places_[0]}, loss_var_name, - {member_->local_scopes_[0]}, 1, member_->use_cuda_); + {member_->local_scopes_[0]}, 1, member_->use_device_); for (size_t i = 1; i < member_->places_.size(); ++i) { graphs[i] = member_->build_strategy_.Apply( graphs[i], {member_->places_[i]}, loss_var_name, - {member_->local_scopes_[i]}, 1, member_->use_cuda_); + {member_->local_scopes_[i]}, 1, member_->use_device_); async_graphs[i] = graphs[i]; } } else { graph = member_->build_strategy_.Apply( graph, member_->places_, loss_var_name, member_->local_scopes_, - member_->nranks_, member_->use_cuda_); + member_->nranks_, member_->use_device_); } #endif @@ -854,6 +1004,63 @@ void ParallelExecutor::BCastParamsToDevices( } nccl_ctxs->WaitAll(); } +#endif + } else if (paddle::platform::is_xpu_place(main_tensor.place())) { +#if defined(PADDLE_WITH_XPU_BKCL) + std::vector buffers; + buffers.reserve(member_->places_.size()); + size_t numel = main_tensor.numel(); + // TODO(liuyuhui): BKCL only support parameters using float type, + // other parameters need to be strongly converted to float before + // broadcasting, + // but broadcast is equivalent to no type of operation, does not affect + // correctness. + BKCLDataType data_type = BKCL_FLOAT; + // BKCLDataType data_type = platform::ToBKCLDataType(main_tensor.type()); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto place = member_->places_[i]; + void *buffer; + + if (i == 0 && trainer_id == 0) { + buffer = const_cast(main_tensor.data()); + } else { + auto local_scope = member_->local_scopes_[i]; + auto *t = local_scope->Var(var)->GetMutable(); + t->Resize(dims); + buffer = t->mutable_data(place, main_tensor.type()); + } + buffers.push_back(buffer); + } + + PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(), + platform::errors::PreconditionNotMet( + "variables' buffer size to bcast is %d, which is " + "NOT equal to places size %d", + buffers.size(), member_->places_.size())); + { + auto *bkcl_ctxs = member_->bkcl_ctxs_->DefaultFlatCtx(); + + PADDLE_ENFORCE_EQ( + bkcl_group_start(), BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_group_start failed")); + for (size_t i = 0; i < member_->places_.size(); ++i) { + auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[i]); + if (main_tensor.type() == framework::proto::VarType::INT64) { + numel *= 2; + } + PADDLE_ENFORCE_EQ( + bkcl_broadcast(bkcl_ctx.comm(), buffers[i], buffers[i], numel, + data_type, 0, NULL), + BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_broadcast failed")); + } + PADDLE_ENFORCE_EQ( + bkcl_group_end(), BKCL_SUCCESS, + platform::errors::Unavailable("bkcl_group_end failed")); + } +#else + PADDLE_THROW( + platform::errors::PreconditionNotMet("Not compiled with BKCL.")); #endif } else { platform::CPUPlace cpu; @@ -872,7 +1079,8 @@ void ParallelExecutor::BCastParamsToDevices( // FIXME(zcd): LR_DECAY_COUNTER should not be shared. This is a hot fix. 
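A note on the INT64 handling in the BKCL broadcast above: bkcl_broadcast is used purely as a byte mover in BCastParamsToDevices, so an int64 parameter can be shipped as twice as many BKCL_FLOAT elements, which is what the `numel *= 2` line relies on. A minimal NumPy sketch of that byte-count argument (editorial illustration, not part of the patch):

import numpy as np

# sizeof(int64) == 2 * sizeof(float32), so viewing the same buffer as float32
# exactly doubles the element count without touching the bytes -- the same
# reasoning behind `numel *= 2` before bkcl_broadcast with BKCL_FLOAT.
param = np.arange(6, dtype=np.int64)
as_float32 = param.view(np.float32)   # reinterpret the bytes, no copy
assert as_float32.size == 2 * param.size
assert as_float32.tobytes() == param.tobytes()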
if (member_->build_strategy_.async_mode_) { share_memory(); - } else if (member_->use_all_reduce_ || member_->use_cuda_ || + } else if (member_->use_all_reduce_ || + member_->IsUseCUDA(member_->use_device_) || var == "@LR_DECAY_COUNTER@") { copy_memory(); } else { @@ -1103,7 +1311,7 @@ bool ParallelExecutor::EnableParallelGraphExecution( } } - if (!member_->use_all_reduce_ || !member_->use_cuda_) { + if (!member_->use_all_reduce_ || !member_->IsUseCUDA(member_->use_device_)) { if (build_strategy.enable_sequential_execution_ || exec_strategy.type_ == ExecutionStrategy::ExecutorType::kExperimental) { enable_parallel_graph = false; diff --git a/paddle/fluid/framework/parallel_executor.h b/paddle/fluid/framework/parallel_executor.h index 7688d8c604..0a1df2f194 100644 --- a/paddle/fluid/framework/parallel_executor.h +++ b/paddle/fluid/framework/parallel_executor.h @@ -43,6 +43,8 @@ class ParallelExecutorPrivate; using details::BuildStrategy; using details::ExecutionStrategy; +namespace p = paddle::platform; +using DeviceType = paddle::platform::DeviceType; class ParallelExecutor { DISABLE_COPY_AND_ASSIGN(ParallelExecutor); diff --git a/paddle/fluid/framework/var_type_traits.cc b/paddle/fluid/framework/var_type_traits.cc index 1e5e8d6575..235427331d 100644 --- a/paddle/fluid/framework/var_type_traits.cc +++ b/paddle/fluid/framework/var_type_traits.cc @@ -30,6 +30,10 @@ #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif + namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/var_type_traits.h b/paddle/fluid/framework/var_type_traits.h index 07387f8741..2fd4de5cfc 100644 --- a/paddle/fluid/framework/var_type_traits.h +++ b/paddle/fluid/framework/var_type_traits.h @@ -31,6 +31,10 @@ #endif #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "xpu/bkcl.h" +#endif + // Users should add forward declarations here namespace paddle { @@ -41,6 +45,10 @@ class Communicator; class NCCLCommunicator; #endif #endif + +#if defined(PADDLE_WITH_XPU_BKCL) +class BKCLCommunicator; +#endif } // namespace platform namespace framework { @@ -148,6 +156,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl< ncclUniqueId, platform::Communicator, platform::NCCLCommunicator, #endif operators::CudnnRNNCache, +#endif +#if defined(PADDLE_WITH_XPU_BKCL) + BKCLUniqueId, platform::BKCLCommunicator, #endif int, float>; diff --git a/paddle/fluid/framework/var_type_traits_test.cc b/paddle/fluid/framework/var_type_traits_test.cc index 2d7172e801..970294264d 100644 --- a/paddle/fluid/framework/var_type_traits_test.cc +++ b/paddle/fluid/framework/var_type_traits_test.cc @@ -31,6 +31,9 @@ #include "paddle/fluid/operators/conv_cudnn_op_cache.h" #include "paddle/fluid/operators/cudnn_rnn_cache.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "paddle/fluid/platform/bkcl_helper.h" +#endif namespace paddle { namespace framework { diff --git a/paddle/fluid/platform/bkcl_helper.h b/paddle/fluid/platform/bkcl_helper.h new file mode 100644 index 0000000000..cccee15719 --- /dev/null +++ b/paddle/fluid/platform/bkcl_helper.h @@ -0,0 +1,280 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef _WIN32 +#if defined(PADDLE_WITH_XPU_BKCL) +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/fluid/platform/place.h" +#include "xpu/bkcl.h" +#include "xpu/runtime.h" + +#define BKCL_ID_VARNAME "BKCLID" + +namespace paddle { +namespace platform { + +inline BKCLDataType ToBKCLDataType(framework::proto::VarType::Type type) { + if (type == framework::proto::VarType::FP32) { + return BKCL_FLOAT; + } else { + PADDLE_THROW( + platform::errors::Unimplemented("BKCL currently only support FP32, " + "other data types are not supported.")); + } +} + +struct BKCLContext { + std::unique_ptr ctx_; + BKCLContext_t comm_; + + explicit BKCLContext(int dev_id) + : ctx_(new platform::XPUDeviceContext(XPUPlace(dev_id))), + comm_{nullptr} {} + + BKCLContext_t comm() const { return comm_; } + + int device_id() const { + return BOOST_GET_CONST(platform::XPUPlace, ctx_->GetPlace()).device; + } +}; + +struct InitBKCLPara { + BKCLUniqueId *bkcl_id; + int rank; + int nranks; + int dev_id; + BKCLContext_t *ctx; +}; + +static void *init_bkcl_context_func(void *args) { + struct InitBKCLPara *para = (struct InitBKCLPara *)args; + PADDLE_ENFORCE_EQ(xpu_set_device(para->dev_id), XPU_SUCCESS, + platform::errors::PreconditionNotMet( + "xpu_set_device failed[%d]", para->dev_id)); + PADDLE_ENFORCE_EQ( + bkcl_init_rank(para->ctx, para->rank, para->nranks, para->bkcl_id), + BKCL_SUCCESS, + platform::errors::PreconditionNotMet("bkcl_init_rank failed")); + return nullptr; +} + +struct BKCLContextMap { + std::unordered_map contexts_; + std::vector order_; + std::vector places_; + size_t num_trainers_; + size_t trainer_id_; + BKCLUniqueId *bkcl_id_; + + explicit BKCLContextMap(const std::vector &places, + BKCLUniqueId *bkcl_id = nullptr, + size_t num_trainers = 1, size_t trainer_id = 0) { + places_ = places; + bkcl_id_ = bkcl_id; + num_trainers_ = num_trainers; + trainer_id_ = trainer_id; + } + + // Synchronization is required and can only be initialized with + // multithreading. + int init() { + PADDLE_ENFORCE_EQ(!places_.empty(), true, + platform::errors::InvalidArgument( + "The BKCL place should not be empty.")); + order_.reserve(places_.size()); + for (auto &p : places_) { + int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device; + order_.emplace_back(dev_id); + contexts_.emplace(dev_id, BKCLContext(dev_id)); + } + PADDLE_ENFORCE_EQ( + order_.size(), contexts_.size(), + platform::errors::Unavailable("BKCL Context Map does not support " + "contain two or more same device")); + + std::unique_ptr comms(new BKCLContext_t[order_.size()]); + std::unique_ptr paras(new InitBKCLPara[order_.size()]); + std::unique_ptr pids(new pthread_t[order_.size()]); + BKCLResult_t ret; + BKCLUniqueId id; + // if num_trainers == 1, should create a new bkcl id for local comms. 
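For readers tracing BKCLContextMap::init() below: with a flat ring, each local card gets the global rank trainer_id * local_card_count + local_index, and nranks is num_trainers * local_card_count. A small illustrative computation (the trainer and card counts here are hypothetical):

# Hypothetical setup: 2 trainers with 4 Kunlun cards each -> 8 BKCL ranks.
num_trainers, cards_per_trainer = 2, 4
nranks = num_trainers * cards_per_trainer
for trainer_id in range(num_trainers):
    local_ranks = [trainer_id * cards_per_trainer + i
                   for i in range(cards_per_trainer)]
    print("trainer %d -> global ranks %s of %d" % (trainer_id, local_ranks, nranks))
# trainer 0 -> global ranks [0, 1, 2, 3] of 8
# trainer 1 -> global ranks [4, 5, 6, 7] of 8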
+ if (num_trainers_ == 1 && bkcl_id_ == nullptr) { + ret = bkcl_get_unique_id(&id); + PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret, + platform::errors::PreconditionNotMet( + "bkcl get unique id failed [%d]", ret)); + bkcl_id_ = &id; + } + PADDLE_ENFORCE_NOT_NULL(bkcl_id_, platform::errors::InvalidArgument( + "The BKCL id should not be null.")); + { + int nranks = num_trainers_ * order_.size(); + for (size_t i = 0; i < order_.size(); ++i) { + int rank; + if (order_.size() > 1) { + rank = trainer_id_ * order_.size() + i; + } else { + rank = trainer_id_; + } + VLOG(1) << "init bkcl rank:" << rank << ", nranks:" << nranks + << ", xpu_id:" << order_[i]; + paras[i].rank = rank; + paras[i].nranks = nranks; + paras[i].dev_id = order_[i]; + paras[i].bkcl_id = bkcl_id_; + paras[i].ctx = &comms[i]; + PADDLE_ENFORCE_EQ( + pthread_create(&pids[i], nullptr, init_bkcl_context_func, + reinterpret_cast(¶s[i])), + 0, platform::errors::External("pthread_create failed")); + } + for (size_t i = 0; i < order_.size(); i++) { + pthread_join(pids[i], nullptr); + } + } + int i = 0; + for (auto &dev_id : order_) { + contexts_.at(dev_id).comm_ = comms[i++]; + } + return 0; + } + + BKCLContextMap(const BKCLContextMap &other) = delete; + BKCLContextMap &operator=(const BKCLContextMap &other) = delete; + + XPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); } + + XPUDeviceContext *DevCtx(platform::Place p) const { + return DevCtx(BOOST_GET_CONST(platform::XPUPlace, p).device); + } + + const BKCLContext &at(platform::Place p) const { + return this->at(BOOST_GET_CONST(platform::XPUPlace, p).device); + } + + const BKCLContext &at(int dev_id) const { return contexts_.at(dev_id); } + + void WaitAll() { + for (auto &p : contexts_) { + p.second.ctx_->Wait(); + } + } +}; + +inline std::string GetFlatBKCLVarName(size_t pos) { + if (pos == 0) { + return BKCL_ID_VARNAME; + } + return string::Sprintf("%s_%d", BKCL_ID_VARNAME, static_cast(pos)); +} + +class BKCLCommunicator { + public: + BKCLCommunicator() {} + virtual ~BKCLCommunicator() {} + + BKCLContextMap *DefaultFlatCtx() const { + if (flat_ctxs_.size() == 0) { + return nullptr; + } + + return flat_ctxs_[0].get(); + } + + std::vector> *GetFlatCtxs() { + return &flat_ctxs_; + } + + BKCLContextMap *GetFlatCtx(size_t run_order) const { + return flat_ctxs_[run_order % flat_ctxs_.size()].get(); + } + + BKCLContextMap *GetRunEnvBKCLCtx(size_t run_order, + bool use_hierarchical_allreduce) const { + PADDLE_ENFORCE_EQ(use_hierarchical_allreduce, false, + platform::errors::Unimplemented( + "Hierarchical all reduce is not support for XPU")); + return GetFlatCtx(run_order); + } + + /* + *It meets error when allreduce ophandle and sync_batch_norm_op use + *bkcl_all_reduce + *parallelly. So create a new bkcl comm for sync_batch_norm_op. And these + *codes should be polished with a unified bkcl management. 
+ */ + BKCLContextMap *GetSyncBatchNormCtx( + framework::Scope *scope, const std::vector &places) { + auto *bkcl_id_var = scope->FindVar(BKCL_ID_VARNAME); + if (bkcl_id_var != nullptr) { + return DefaultFlatCtx(); + } + + if (sync_batch_norm_ctx_.get() == nullptr) { + sync_batch_norm_ctx_.reset(new BKCLContextMap(places)); + sync_batch_norm_ctx_->init(); + } + return sync_batch_norm_ctx_.get(); + } + + void InitFlatCtxs(const std::vector &places, + const std::vector &bkcl_ids, + size_t trainers_num, size_t trainer_id) { + if (bkcl_ids.size() == 0) { + auto ptr = new platform::BKCLContextMap(places); + ptr->init(); + VLOG(1) << "init local trainer"; + flat_ctxs_.emplace_back(ptr); + return; + } + + PADDLE_ENFORCE_EQ(bkcl_ids.size(), 1, + platform::errors::Unimplemented( + "Multi-all-reduce-ring is not support for XPU")); + for (size_t i = 0; i < bkcl_ids.size(); i++) { + auto ptr = new platform::BKCLContextMap(places, bkcl_ids[i], trainers_num, + trainer_id); + ptr->init(); + VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i; + flat_ctxs_.emplace_back(ptr); + } + } + + protected: + // Support multi bkcl comm on default bkcl ring while BKCLContextMap can't. + std::vector> flat_ctxs_; + + // just used for sync_batch_norm op. + std::unique_ptr sync_batch_norm_ctx_; +}; + +} // namespace platform +} // namespace paddle + +#endif // PADDLE_WITH_XPU_BKCL +#endif diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 297466e8e5..b270331028 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -29,23 +29,39 @@ namespace memory { AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size) { auto place = dev_ctx.GetPlace(); -#ifdef PADDLE_WITH_CUDA - if (size == 0 || !platform::is_gpu_place(place)) { + if (size == 0) { return Alloc(place, size); } - auto* default_dev_ctx = static_cast( - platform::DeviceContextPool::Instance().Get(place)); - auto& desired_dev_ctx = - static_cast(dev_ctx); - if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { + + if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + auto* default_dev_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(place)); + auto& desired_dev_ctx = + static_cast(dev_ctx); + if (default_dev_ctx->stream() == desired_dev_ctx.stream()) { + return Alloc(place, size); + } else { + return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc( + desired_dev_ctx, size); + } +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use CUDA device since it's not compiled with CUDA," + "Please recompile or reinstall Paddle with GPU support.")); +#endif + } else if (platform::is_xpu_place(place)) { +#ifdef PADDLE_WITH_XPU + // TODO(liuyuhui): Consider xpu stream later return Alloc(place, size); - } else { - return allocation::CUDADeviceContextAllocatorPool::Instance().Alloc( - desired_dev_ctx, size); - } #else - return Alloc(place, size); + PADDLE_THROW(platform::errors::PermissionDenied( + "Paddle can't use XPU device since it's not compiled with XPU," + "Please recompile or reinstall Paddle with XPU support.")); #endif + } else { + return Alloc(place, size); + } } } // namespace memory diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 56438a95f2..9f2e5acfc6 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -30,6 +30,10 @@ limitations under the License. 
*/ #include "paddle/fluid/platform/gpu_info.h" #endif +#if defined(PADDLE_WITH_XPU_BKCL) +#include "xpu/bkcl.h" +#endif + #ifdef PADDLE_WITH_MKLDNN #include "mkldnn.hpp" #include "paddle/fluid/framework/data_layout.h" @@ -52,11 +56,29 @@ struct GpuDevice; #ifdef PADDLE_WITH_XPU #include "paddle/fluid/platform/xpu_header.h" +#include "paddle/fluid/platform/xpu_info.h" #endif namespace paddle { namespace platform { +#ifdef PADDLE_WITH_CUDA +/*Set the value of the global variable allow_tf32_cublas*/ +void SetAllowTF32Cublas(bool active); +/*Get the global variable allow_tf32_cublas value*/ +bool AllowTF32Cublas(); +#endif // PADDLE_WITH_CUDA + +enum DeviceType { + CPU = 0, + CUDA = 1, + XPU = 2, +}; + +constexpr DeviceType kCPU = DeviceType::CPU; +constexpr DeviceType kCUDA = DeviceType::CUDA; +constexpr DeviceType kXPU = DeviceType::XPU; + class DeviceContext { public: virtual ~DeviceContext() PADDLE_MAY_THROW {} @@ -100,9 +122,20 @@ class XPUDeviceContext : public DeviceContext { /*! \brief Wait for all operations completion in the stream. */ void Wait() const override; +#ifdef PADDLE_WITH_XPU_BKCL + /*! \brief Return bkcl context. */ + BKCLContext_t bkcl_context() const { return bkcl_context_; } + + /*! \brief Set bkcl context. */ + void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; } +#endif + private: XPUPlace place_; xpu::Context* context_; +#ifdef PADDLE_WITH_XPU_BKCL + BKCLContext_t bkcl_context_; +#endif // Need to be the same with other DeviceContext, // Eventhough eigen_device_ is not used in XPU @@ -535,8 +568,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext { const std::string& GetKeySuffix(void) const { return key_suffix_; } // Disable adding thread ID to the key - void DisableThreadInfoInKey(void) { key_attach_thread_id_ = false; }; - bool IsThreadIdUsedInKey(void) const { return key_attach_thread_id_; }; + void DisableThreadInfoInKey(void) { key_attach_thread_id_ = false; } + bool IsThreadIdUsedInKey(void) const { return key_attach_thread_id_; } // Prevent next ResetBlobMap() void BlockNextCacheClearing(); diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 8cde9a8a37..0155bfa791 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1307,6 +1307,7 @@ All parameter, weight, gradient are variables in Paddle. "The module will return special predefined variable name in Paddle") .def("empty", []() { return kEmptyVarName; }) .def("temp", []() { return kTempVarName; }); + // clang-format off py::class_(m, "DeviceContext") .def_static("create", @@ -1492,7 +1493,9 @@ All parameter, weight, gradient are variables in Paddle. #endif .def("__repr__", string::to_string) .def("__str__", string::to_string); - +#ifdef PADDLE_WITH_XPU + m.def("get_xpu_device_count", platform::GetXPUDeviceCount); +#endif py::class_(m, "CPUPlace", R"DOC( CPUPlace is a descriptor of a device. It represents a CPU device on which a tensor will be allocated and a model will run. @@ -2072,6 +2075,11 @@ All parameter, weight, gradient are variables in Paddle. exec_strategy=exec_strategy) )DOC"); + py::enum_(m, "DeviceType", py::arithmetic()) + .value("CPU", paddle::platform::DeviceType::CPU) + .value("CUDA", paddle::platform::DeviceType::CUDA) + .value("XPU", paddle::platform::DeviceType::XPU); + exec_strategy.def(py::init()) .def_property( "num_threads", @@ -2102,14 +2110,12 @@ All parameter, weight, gradient are variables in Paddle. 
exec_strategy.num_threads = 4 )DOC") .def_property( - "use_cuda", - [](const ExecutionStrategy &self) { return self.use_cuda_; }, - [](ExecutionStrategy &self, bool use_cuda) { - self.use_cuda_ = use_cuda; - }) // FIXME(chengduo): Doesn't add doc for 'use_cuda', use_cuda may - // make user confuse, because ParallelExecutor has a parameter named - // 'use_cuda' too, in current implementation, ParallelExecutor's - // 'use_cuda' will rewrite ExecutionStrategy's 'use_cuda'. + "_use_device", + [](const ExecutionStrategy &self) { return self.use_device_; }, + [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) { + self.use_device_ = use_device; + }) // NOTE(liuyuhui): Doesn't add doc for 'use_device', because + // use_device isn‘t exposed to users. .def_property( "allow_op_delay", [](const ExecutionStrategy &self) { return self.allow_op_delay_; }, diff --git a/python/paddle/fluid/compiler.py b/python/paddle/fluid/compiler.py index 0b980c7eba..a07378a6f5 100644 --- a/python/paddle/fluid/compiler.py +++ b/python/paddle/fluid/compiler.py @@ -18,7 +18,7 @@ import six import sys from .. import compat as cpt from . import framework -from .framework import cuda_places, cpu_places +from .framework import cuda_places, cpu_places, xpu_places from . import core @@ -28,6 +28,7 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy InferNativeConfig = core.NativeConfig InferAnalysisConfig = core.AnalysisConfig +DeviceType = core.DeviceType def _place_obj(place): @@ -316,7 +317,7 @@ class CompiledProgram(object): "Subclass of CompiledProgram should implement _with_distributed method." ) - def _compile_data_parallel(self, places, use_cuda=False, scope=None): + def _compile_data_parallel(self, places, use_device, scope=None): if self._share_vars_from: if scope: sys.stderr.write("share_vars_from is set, scope is ignored.\n") @@ -342,16 +343,23 @@ class CompiledProgram(object): if self._exec_strategy is None: self._exec_strategy = ExecutionStrategy() - self._exec_strategy.use_cuda = use_cuda + self._exec_strategy._use_device = use_device if self._exec_strategy.num_threads == 0: - if self._exec_strategy.use_cuda: + if self._exec_strategy._use_device == DeviceType.CUDA: # Experiments on se-resnext shows that too many threads hurt # performance. Worth tunning for other models in the future. self._exec_strategy.num_threads = len(places) * 4 + elif self._exec_strategy._use_device == DeviceType.XPU: + # Currently only single thread is supported in Kunlun XPU. + self._exec_strategy.num_threads = 1 else: self._exec_strategy.num_threads = len(places) * 2 + if self._exec_strategy._use_device == DeviceType.XPU: + assert self._exec_strategy.num_threads == 1, \ + "Currently only single thread is supported in Kunlun XPU." + if self._build_strategy.num_trainers > 1: assert self._is_data_parallel, \ "If you use multi-trainer to train the model, you should use "\ @@ -377,7 +385,7 @@ class CompiledProgram(object): self._build_strategy.enable_sequential_execution = True if self._program is not None and self._program._enable_dgc: - assert use_cuda, "DGC only used under CUDA environment." + assert self._exec_strategy._use_device == DeviceType.CUDA, "DGC only used under CUDA environment." assert self._build_strategy.num_trainers * len( places) > 1, "DGC is not avaliable for single card training." 
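To make the new Python-side plumbing concrete: ExecutionStrategy now carries a DeviceType through the non-public `_use_device` property instead of the removed `use_cuda` flag, and _compile_data_parallel derives it from the place type. A hedged sketch of setting it by hand (it mirrors the compiler.py logic above; normally CompiledProgram infers the device for you):

import paddle.fluid as fluid
from paddle.fluid import core

exec_strategy = fluid.ExecutionStrategy()
# One of core.DeviceType.{CPU, CUDA, XPU}; CompiledProgram normally infers this
# from the place, so setting it manually is only for illustration.
exec_strategy._use_device = core.DeviceType.XPU
# Per the compiler.py change above, Kunlun XPU currently runs single-threaded.
exec_strategy.num_threads = 1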
assert self._build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "DGC \ @@ -447,11 +455,14 @@ class CompiledProgram(object): raise NotImplementedError( "If optimizer is used in control flow, " "training on multi-places is not supported now.") - + if isinstance(self._place, core.CUDAPlace): + use_device = DeviceType.CUDA + elif isinstance(self._place, core.XPUPlace): + use_device = DeviceType.XPU + else: + use_device = DeviceType.CPU self._executor = self._compile_data_parallel( - use_cuda=isinstance(self._place, core.CUDAPlace), - scope=self._scope, - places=self._places) + use_device=use_device, scope=self._scope, places=self._places) return self def _get_places(self, place, place_list): @@ -461,7 +472,11 @@ class CompiledProgram(object): assert p._type() == place._type(), \ "Place type not match. You may set wrong type of places." else: - place_list = cuda_places() if isinstance( - place, core.CUDAPlace) else cpu_places() + if isinstance(place, core.CUDAPlace): + place_list = cuda_places() + elif isinstance(place, core.XPUPlace): + place_list = xpu_places() + else: + place_list = cpu_places() assert place_list, "No places for execution." return place_list diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 4e815070da..2e93c0862f 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -46,6 +46,7 @@ __all__ = [ 'name_scope', 'cuda_places', 'cpu_places', + 'xpu_places', 'cuda_pinned_places', 'in_dygraph_mode', 'is_compiled_with_cuda', @@ -353,6 +354,15 @@ def _cuda_ids(): return device_ids +def _xpu_ids(): + xpus_env = os.getenv("FLAGS_selected_xpus") + if xpus_env: + device_ids = [int(s) for s in xpus_env.split(",")] + else: + device_ids = six.moves.range(core.get_xpu_device_count()) + return device_ids + + def is_compiled_with_xpu(): """ Whether this whl package can be used to run the model on XPU. @@ -429,6 +439,44 @@ def cuda_places(device_ids=None): return [core.CUDAPlace(dev_id) for dev_id in device_ids] +def xpu_places(device_ids=None): + """ + **Note**: + For multi-card tasks, please use `FLAGS_selected_xpus` environment variable to set the visible XPU device. + This function creates a list of :code:`paddle.XPUPlace` objects. + If :code:`device_ids` is None, environment variable of + :code:`FLAGS_selected_xpus` would be checked first. For example, if + :code:`FLAGS_selected_xpus=0,1,2`, the returned list would + be [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)]. + If :code:`FLAGS_selected_xpus` is not set, all visible + xpu places would be returned. + If :code:`device_ids` is not None, it should be the device + ids of XPUs. For example, if :code:`device_ids=[0,1,2]`, + the returned list would be + [paddle.XPUPlace(0), paddle.XPUPlace(1), paddle.XPUPlace(2)]. + + Parameters: + device_ids (list or tuple of int, optional): list of XPU device ids. + Returns: + list of paddle.XPUPlace: Created XPU place list. + Examples: + .. code-block:: python + + import paddle + import paddle.static as static + + paddle.enable_static() + xpu_places = static.xpu_places() + """ + assert core.is_compiled_with_xpu(), \ + "Not compiled with XPU" + if device_ids is None: + device_ids = _xpu_ids() + elif not isinstance(device_ids, (list, tuple)): + device_ids = [device_ids] + return [core.XPUPlace(dev_id) for dev_id in device_ids] + + def cpu_places(device_count=None): """ This function creates a list of :code:`paddle.CPUPlace` objects, and returns the created list. 
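Putting the pieces together, xpu_places plus the XPU branch in CompiledProgram._compile_data_parallel (backed by the BKCL communicator on the C++ side) is what enables multi-card XPU data parallelism. A minimal end-to-end sketch, assuming a build with WITH_XPU and WITH_XPU_BKCL, FLAGS_selected_xpus set to the visible cards, and a throwaway fc network as a stand-in for a real model:

import numpy as np
import paddle
import paddle.static as static

paddle.enable_static()

# Placeholder network: one fc layer and a mean loss, just to have something to run.
image = static.data(name='image', shape=[None, 784], dtype='float32')
hidden = static.nn.fc(image, size=10)
loss = paddle.mean(hidden)
paddle.optimizer.SGD(learning_rate=0.01).minimize(loss)

places = static.xpu_places()   # e.g. [XPUPlace(0), XPUPlace(1)] from FLAGS_selected_xpus
exe = static.Executor(places[0])
exe.run(static.default_startup_program())

# with_data_parallel picks DeviceType.XPU from the XPUPlace list and sets up
# BKCL communicators across the selected cards (see compiler.py above).
compiled = static.CompiledProgram(
    static.default_main_program()).with_data_parallel(
        loss_name=loss.name, places=places)

batch = {'image': np.random.rand(32, 784).astype('float32')}
loss_val, = exe.run(compiled, feed=batch, fetch_list=[loss.name])

As in the updated unit tests, the effective batch size scales with core.get_xpu_device_count(), and the build-strategy fusions that BKCL does not yet cover are switched off when running on XPU.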
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index c71e0e3361..47f5c5085a 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -28,13 +28,14 @@ import sys from feed_data_reader import FeedDataReader __all__ = ['TestParallelExecutorBase'] +DeviceType = core.DeviceType class TestParallelExecutorBase(unittest.TestCase): @classmethod def check_network_convergence(cls, method, - use_cuda=True, + use_device=DeviceType.CUDA, iter=5, batch_size=None, feed_dict=None, @@ -74,7 +75,9 @@ class TestParallelExecutorBase(unittest.TestCase): feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder, main, method, optimizer) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + place = fluid.CUDAPlace( + 0) if use_device == DeviceType.CUDA else fluid.XPUPlace( + 0) if use_device == DeviceType.XPU else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) @@ -82,7 +85,7 @@ class TestParallelExecutorBase(unittest.TestCase): enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops, fuse_all_reduce_ops, fuse_elewise_add_act_ops, fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize, - use_reduce, use_cuda) + use_reduce, use_device) if use_parallel_executor: binary = compiler.CompiledProgram(main).with_data_parallel( @@ -94,7 +97,8 @@ class TestParallelExecutorBase(unittest.TestCase): if batch_size is not None: batch_size *= fluid.core.get_cuda_device_count( - ) if use_cuda else int( + ) if use_device == DeviceType.CUDA else fluid.core.get_xpu_device_count( + ) if use_device == DeviceType.XPU else int( os.environ.get('CPU_NUM', multiprocessing.cpu_count())) begin = time.time() @@ -123,7 +127,7 @@ class TestParallelExecutorBase(unittest.TestCase): @classmethod def check_pass_conflict(cls, method, - use_cuda=True, + use_device=DeviceType.CUDA, feed_dict=None, get_data_from_feeder=None, use_reduce=False, @@ -143,7 +147,9 @@ class TestParallelExecutorBase(unittest.TestCase): feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder, main, method, optimizer) - place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() + place = fluid.CUDAPlace( + 0) if use_device == DeviceType.CUDA else fluid.XPUPlace( + 0) if use_device == DeviceType.XPU else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup) @@ -151,7 +157,7 @@ class TestParallelExecutorBase(unittest.TestCase): enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops, fuse_all_reduce_ops, fuse_elewise_add_act_ops, fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize, - use_reduce, use_cuda) + use_reduce, use_device) binary = compiler.CompiledProgram(main).with_data_parallel( loss_name=loss.name, @@ -165,7 +171,7 @@ class TestParallelExecutorBase(unittest.TestCase): fuse_all_optimizer_ops, fuse_all_reduce_ops, fuse_elewise_add_act_ops, fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize, use_reduce, - use_cuda): + use_device): exec_strategy = fluid.ExecutionStrategy() if use_fast_executor: exec_strategy.use_experimental_executor = True @@ -180,8 +186,17 @@ class TestParallelExecutorBase(unittest.TestCase): build_strategy.enable_inplace = enable_inplace build_strategy.enable_sequential_execution = enable_sequential_execution - if use_cuda and core.is_compiled_with_cuda(): + if use_device == DeviceType.CUDA and core.is_compiled_with_cuda(): 
build_strategy.remove_unnecessary_lock = True + if use_device == DeviceType.XPU and core.is_compiled_with_xpu(): + build_strategy.fuse_elewise_add_act_ops = False + build_strategy.fuse_relu_depthwise_conv = False + build_strategy.fuse_all_optimizer_ops = False + build_strategy.fuse_all_reduce_ops = False + build_strategy.memory_optimize = False + build_strategy.enable_inplace = False + build_strategy.enable_sequential_execution = False + return build_strategy, exec_strategy @classmethod diff --git a/python/paddle/fluid/tests/unittests/seresnext_net.py b/python/paddle/fluid/tests/unittests/seresnext_net.py index 45d39afc11..2e4b1828c5 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_net.py +++ b/python/paddle/fluid/tests/unittests/seresnext_net.py @@ -19,6 +19,7 @@ fluid.core._set_eager_deletion_mode(-1, -1, False) import paddle.fluid.layers.ops as ops from paddle.fluid.layers.learning_rate_scheduler import cosine_decay from simple_nets import init_data +from seresnext_test_base import DeviceType import math import os os.environ['CPU_NUM'] = str(4) @@ -169,28 +170,32 @@ def optimizer(learning_rate=0.01): model = SE_ResNeXt50Small -def batch_size(use_cuda): - if use_cuda: +def batch_size(use_device): + if use_device == DeviceType.CUDA: # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size. return 8 return 12 -def iter(use_cuda): - if use_cuda: +def iter(use_device): + if use_device == DeviceType.CUDA: return 10 return 1 gpu_img, gpu_label = init_data( - batch_size=batch_size(use_cuda=True), img_shape=img_shape, label_range=999) + batch_size=batch_size(use_device=DeviceType.CUDA), + img_shape=img_shape, + label_range=999) cpu_img, cpu_label = init_data( - batch_size=batch_size(use_cuda=False), img_shape=img_shape, label_range=999) + batch_size=batch_size(use_device=DeviceType.CPU), + img_shape=img_shape, + label_range=999) feed_dict_gpu = {"image": gpu_img, "label": gpu_label} feed_dict_cpu = {"image": cpu_img, "label": cpu_label} -def feed_dict(use_cuda): - if use_cuda: +def feed_dict(use_device): + if use_device == DeviceType.CUDA: return feed_dict_gpu return feed_dict_cpu diff --git a/python/paddle/fluid/tests/unittests/seresnext_test_base.py b/python/paddle/fluid/tests/unittests/seresnext_test_base.py index 9f055191b1..cc40b89b58 100644 --- a/python/paddle/fluid/tests/unittests/seresnext_test_base.py +++ b/python/paddle/fluid/tests/unittests/seresnext_test_base.py @@ -15,34 +15,35 @@ from __future__ import print_function import seresnext_net import paddle.fluid.core as core -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType +from parallel_executor_test_base import DeviceType import numpy as np class TestResnetBase(TestParallelExecutorBase): def _compare_result_with_origin_model(self, check_func, - use_cuda, + use_device, delta2=1e-5, compare_seperately=True): - if use_cuda and not core.is_compiled_with_cuda(): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return func_1_first_loss, func_1_last_loss = self.check_network_convergence( seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_cuda), - iter=seresnext_net.iter(use_cuda), - batch_size=seresnext_net.batch_size(use_cuda), - use_cuda=use_cuda, + feed_dict=seresnext_net.feed_dict(use_device), + iter=seresnext_net.iter(use_device), + batch_size=seresnext_net.batch_size(use_device), + use_device=use_device, use_reduce=False, optimizer=seresnext_net.optimizer) func_2_first_loss, 
func_2_last_loss = check_func( seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_cuda), - iter=seresnext_net.iter(use_cuda), - batch_size=seresnext_net.batch_size(use_cuda), - use_cuda=use_cuda) + feed_dict=seresnext_net.feed_dict(use_device), + iter=seresnext_net.iter(use_device), + batch_size=seresnext_net.batch_size(use_device), + use_device=use_device) if compare_seperately: for loss in zip(func_1_first_loss, func_2_first_loss): diff --git a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py index 47671ab3a8..881b9d9057 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py @@ -14,7 +14,7 @@ from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net from fake_reader import fake_imdb_reader -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType import paddle.fluid as fluid import paddle.fluid.core as core from functools import partial @@ -30,12 +30,12 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): def compare_fuse_all_reduce_ops(self, model, - use_cuda, + use_device, init_feed_dict=None, get_data_from_feeder=None, optimizer=None, fuse_all_optimizer_ops=False): - if use_cuda and not core.is_compiled_with_cuda(): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return feed_dict_data = None @@ -47,7 +47,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): model, feed_dict=feed_dict_data, get_data_from_feeder=get_data_from_feeder, - use_cuda=use_cuda, + use_device=use_device, fuse_all_reduce_ops=False, fuse_all_optimizer_ops=fuse_all_optimizer_ops, optimizer=optimizer) @@ -55,7 +55,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): model, feed_dict=feed_dict_data, get_data_from_feeder=get_data_from_feeder, - use_cuda=use_cuda, + use_device=use_device, fuse_all_reduce_ops=True, fuse_all_optimizer_ops=fuse_all_optimizer_ops, optimizer=optimizer) @@ -73,28 +73,30 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase): class TestFuseAllReduceOps(TestFuseAllReduceOpsBase): - def _decorate_compare_fused_all_reduce(self, model, use_cuda): + def _decorate_compare_fused_all_reduce(self, model, use_device): self.compare_fuse_all_reduce_ops( model, - use_cuda, + use_device, init_feed_dict=init_data, optimizer=self.optimizer, fuse_all_optimizer_ops=True) def test_simple_fc_with_fuse_all_reduce(self): - self._decorate_compare_fused_all_reduce(simple_fc_net, True) - self._decorate_compare_fused_all_reduce(simple_fc_net, False) + self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CUDA) + self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU) def test_batchnorm_fc_with_fuse_all_reduce(self): - self._decorate_compare_fused_all_reduce(fc_with_batchnorm, True) - self._decorate_compare_fused_all_reduce(fc_with_batchnorm, False) + self._decorate_compare_fused_all_reduce(fc_with_batchnorm, + DeviceType.CUDA) + self._decorate_compare_fused_all_reduce(fc_with_batchnorm, + DeviceType.CPU) class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps): - def _decorate_compare_fused_all_reduce(self, model, use_cuda): + def _decorate_compare_fused_all_reduce(self, model, use_device): self.compare_fuse_all_reduce_ops( model, - use_cuda, + use_device, init_feed_dict=init_data, optimizer=self.optimizer, fuse_all_optimizer_ops=True) @@ -115,17 
+117,17 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase): feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place) return feeder.feed(self.train_data) - def _decorate_compare_fused_all_reduce(self, model, use_cuda): + def _decorate_compare_fused_all_reduce(self, model, use_device): self.compare_fuse_all_reduce_ops( model, - use_cuda, + use_device, get_data_from_feeder=self.get_data_from_feeder, optimizer=self.optimizer) def test_simple_bow_net_with_fuse_all_reduce(self): model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True) - self._decorate_compare_fused_all_reduce(model, True) - self._decorate_compare_fused_all_reduce(model, False) + self._decorate_compare_fused_all_reduce(model, DeviceType.CUDA) + self._decorate_compare_fused_all_reduce(model, DeviceType.CPU) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py index 617fecffe0..a1c20be9a9 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py @@ -13,7 +13,7 @@ # limitations under the License. from simple_nets import simple_fc_net, fc_with_batchnorm, init_data -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType import paddle.fluid as fluid import paddle.fluid.core as core import unittest @@ -25,8 +25,8 @@ class TestMNIST(TestParallelExecutorBase): def setUpClass(cls): os.environ['CPU_NUM'] = str(4) - def _compare_fuse_elewise_add_act_ops(self, model, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): + def _compare_fuse_elewise_add_act_ops(self, model, use_device): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return img, label = init_data() @@ -45,7 +45,7 @@ class TestMNIST(TestParallelExecutorBase): model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, fuse_elewise_add_act_ops=False, use_ir_memory_optimize=False, enable_inplace=False, @@ -54,7 +54,7 @@ class TestMNIST(TestParallelExecutorBase): model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, fuse_elewise_add_act_ops=True, use_ir_memory_optimize=False, enable_inplace=False, @@ -66,12 +66,14 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) def test_simple_fc_with_fuse_op(self): - self._compare_fuse_elewise_add_act_ops(simple_fc_net, True) - self._compare_fuse_elewise_add_act_ops(simple_fc_net, False) + self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CUDA) + self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU) def test_batchnorm_fc_with_fuse_op(self): - self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, True) - self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, False) + self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, + DeviceType.CUDA) + self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, + DeviceType.CPU) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py index a22daeedd0..51c06bb79d 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py @@ -14,7 +14,7 @@ from simple_nets 
import simple_fc_net, fc_with_batchnorm, init_data, bow_net from fake_reader import fake_imdb_reader -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType from functools import partial import paddle import paddle.fluid as fluid @@ -34,25 +34,25 @@ class TestFuseOptimizationOps(TestParallelExecutorBase): def _compare_fused_optimizer_ops(self, model, - use_cuda, + use_device, feed_dict=None, get_data_from_feeder=None, optimizer=fluid.optimizer.Adam): - if use_cuda and not core.is_compiled_with_cuda(): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence( model, feed_dict=feed_dict, get_data_from_feeder=get_data_from_feeder, - use_cuda=use_cuda, + use_device=use_device, fuse_all_optimizer_ops=False, optimizer=optimizer) fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence( model, feed_dict=feed_dict, get_data_from_feeder=get_data_from_feeder, - use_cuda=use_cuda, + use_device=use_device, fuse_all_optimizer_ops=True, optimizer=optimizer) @@ -61,10 +61,11 @@ class TestFuseOptimizationOps(TestParallelExecutorBase): for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) - def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer): + def _decorate_compare_fused_optimizer_ops(self, model, use_device, + optimizer): self._compare_fused_optimizer_ops( model, - use_cuda, + use_device, feed_dict=self._get_feed_dict(), optimizer=optimizer) @@ -75,9 +76,9 @@ class TestFuseAdamOps(TestFuseOptimizationOps): def test_batchnorm_fc_with_fuse_op(self): self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, True, optimizer=self.optimizer) + fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer) self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, False, optimizer=self.optimizer) + fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer) class TestFuseSGDOps(TestFuseAdamOps): @@ -106,10 +107,11 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps): feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place) return feeder.feed(self.train_data) - def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer): + def _decorate_compare_fused_optimizer_ops(self, model, use_device, + optimizer): self._compare_fused_optimizer_ops( model, - use_cuda, + use_device, get_data_from_feeder=self._get_data_from_feeder, optimizer=optimizer) @@ -119,9 +121,9 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps): def test_simple_bow_net_with_fuse_op(self): model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True) self._decorate_compare_fused_optimizer_ops( - model, True, optimizer=self.optimizer) + model, DeviceType.CUDA, optimizer=self.optimizer) self._decorate_compare_fused_optimizer_ops( - model, False, optimizer=self.optimizer) + model, DeviceType.CPU, optimizer=self.optimizer) class TestSpareFuseSGDOps(TestSpareFuseAdamOps): @@ -138,18 +140,18 @@ class TestSpareFuseMomentumOps(TestSpareFuseAdamOps): class TestPassConflictBase(TestFuseAdamOps): def _compare_fused_optimizer_ops(self, model, - use_cuda, + use_device, feed_dict=None, get_data_from_feeder=None, optimizer=fluid.optimizer.Adam): - if use_cuda and not core.is_compiled_with_cuda(): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return self.check_pass_conflict( model, feed_dict=feed_dict, 
get_data_from_feeder=get_data_from_feeder, - use_cuda=use_cuda, + use_device=use_device, fuse_all_optimizer_ops=True, optimizer=optimizer, enable_sequential_execution=True) @@ -161,9 +163,9 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase): def test_batchnorm_fc_with_fuse_op(self): self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, True, optimizer=self.optimizer) + fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer) self._decorate_compare_fused_optimizer_ops( - fc_with_batchnorm, False, optimizer=self.optimizer) + fc_with_batchnorm, DeviceType.CUDA, optimizer=self.optimizer) class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict): diff --git a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py index 7c9b56d403..9b739ebdfb 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType import paddle.fluid as fluid import paddle.fluid.core as core import numpy as np @@ -72,8 +72,8 @@ class TestMNIST(TestParallelExecutorBase): label = np.ones(shape=[32, 1], dtype='int64') return img, label - def _compare(self, model, use_cuda, random_data=True, only_forward=False): - if use_cuda and not core.is_compiled_with_cuda(): + def _compare(self, model, use_device, random_data=True, only_forward=False): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return img, label = self._init_data(random_data) @@ -90,7 +90,7 @@ class TestMNIST(TestParallelExecutorBase): model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, fuse_relu_depthwise_conv=True, use_ir_memory_optimize=True, optimizer=_optimizer) @@ -98,7 +98,7 @@ class TestMNIST(TestParallelExecutorBase): model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, fuse_relu_depthwise_conv=False, optimizer=_optimizer) @@ -108,12 +108,12 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEquals(loss[0], loss[1], delta=1e-6) def test_simple_depthwise_with_fuse_op(self): - self._compare(simple_depthwise_net, True) - self._compare(simple_depthwise_net, False) + self._compare(simple_depthwise_net, DeviceType.CUDA) + self._compare(simple_depthwise_net, DeviceType.CPU) def test_simple_depthwise_with_fuse_op_only_forward(self): - self._compare(simple_depthwise_net, True, only_forward=True) - self._compare(simple_depthwise_net, False, only_forward=True) + self._compare(simple_depthwise_net, DeviceType.CUDA, only_forward=True) + self._compare(simple_depthwise_net, DeviceType.CPU, only_forward=True) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py index c1ef0f49af..e2094c76b7 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py @@ -19,7 +19,7 @@ import unittest import numpy as np import paddle.fluid.core as core import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import 
TestParallelExecutorBase, DeviceType def fc_with_batchnorm(use_feed): @@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase): fc_with_batchnorm, feed_dict={"image": img, "label": label}, - use_cuda=True, + use_device=DeviceType.CUDA, use_ir_memory_optimize=ir_memory_optimize, enable_inplace=enable_inplace) diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py index 0ace288d9d..dba92a68cd 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py @@ -75,7 +75,7 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase): exe = Executor(place) exec_strategy = fluid.ExecutionStrategy() - exec_strategy.use_cuda = use_cuda + exec_strategy._use_device = core.DeviceType.CUDA if use_cuda else core.DeviceType.CPU build_strategy = fluid.BuildStrategy() build_strategy.memory_optimize = use_mem_opt diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py index d9f68c2d15..f4ec63a8b9 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType import paddle.fluid as fluid import paddle.fluid.core as core import numpy as np @@ -60,8 +60,8 @@ class TestMNIST(TestParallelExecutorBase): label = np.ones(shape=[32, 1], dtype='int64') return img, label - def _compare_ir_memory_optimize(self, model, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): + def _compare_ir_memory_optimize(self, model, use_device): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return img, label = self._dummy_data() @@ -69,13 +69,13 @@ class TestMNIST(TestParallelExecutorBase): model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_ir_memory_optimize=False) first_loss1, last_loss1 = self.check_network_convergence( model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_ir_memory_optimize=True) for loss in zip(first_loss0, first_loss1): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) @@ -83,12 +83,12 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEqual(loss[0], loss[1], delta=1e-6) def test_simple_fc_net(self): - self._compare_ir_memory_optimize(simple_fc_net, False) - self._compare_ir_memory_optimize(simple_fc_net, True) + self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU) + self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CUDA) def test_fc_with_reshape_net(self): - self._compare_ir_memory_optimize(fc_with_inplace_net, False) - self._compare_ir_memory_optimize(fc_with_inplace_net, True) + self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU) + self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CUDA) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py index 1af696f873..aa495c7533 100644 --- 
a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py @@ -23,7 +23,7 @@ import paddle.dataset.wmt16 as wmt16 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0" -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType from test_parallel_executor_transformer import get_feed_data_reader, transformer @@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase): # check python transpiler self.check_network_convergence( transformer, - use_cuda=True, + use_device=DeviceType.CUDA, feed_data_reader=get_feed_data_reader(), use_ir_memory_optimize=False, iter=2) # check IR memory optimize self.check_network_convergence( transformer, - use_cuda=True, + use_device=DeviceType.CUDA, feed_data_reader=get_feed_data_reader(), use_ir_memory_optimize=True, iter=2) diff --git a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py index a3fa84c224..33393bc2fc 100644 --- a/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py +++ b/python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py @@ -24,7 +24,7 @@ import numpy as np import paddle import paddle.fluid as fluid from simple_nets import init_data -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType batch_size = 12 img_shape = [1, 28, 28] @@ -68,7 +68,7 @@ def _optimizer(learning_rate=1e-6): class TestResnet(TestParallelExecutorBase): - def check_model(self, use_cuda): + def check_model(self, use_device): img, label = init_data( batch_size=batch_size, img_shape=img_shape, label_range=9) img = np.float16(img) @@ -78,13 +78,13 @@ class TestResnet(TestParallelExecutorBase): conv_net, feed_dict=feed_dict, iter=10, - use_cuda=use_cuda, + use_device=use_device, fuse_all_reduce_ops=True, optimizer=_optimizer) def test_model(self): if core.is_compiled_with_cuda(): - self.check_model(True) + self.check_model(DeviceType.CUDA) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py index da7e30ff10..2c79670f1a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py @@ -18,9 +18,11 @@ import unittest import numpy as np import paddle.fluid.core as core +import paddle import os import paddle.fluid as fluid -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType +from parallel_executor_test_base import DeviceType def simple_fc_net(use_feed): @@ -76,10 +78,13 @@ class TestMNIST(TestParallelExecutorBase): def _compare_reduce_and_allreduce(self, model, - use_cuda, + use_device, delta1=1e-6, delta2=1e-4): - if use_cuda and not core.is_compiled_with_cuda(): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): + return + + if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): return img, label = self._init_data() @@ -88,14 +93,14 @@ class TestMNIST(TestParallelExecutorBase): model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_reduce=False) reduce_first_loss, reduce_last_loss = 
self.check_network_convergence( model, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_reduce=True) for loss in zip(all_reduce_first_loss, reduce_first_loss): @@ -104,8 +109,11 @@ class TestMNIST(TestParallelExecutorBase): self.assertAlmostEqual(loss[0], loss[1], delta=delta2) # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): - if use_cuda and not core.is_compiled_with_cuda(): + def check_simple_fc_convergence(self, use_device, use_reduce=False): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): + return + + if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): return img, label = self._init_data() @@ -114,23 +122,26 @@ class TestMNIST(TestParallelExecutorBase): simple_fc_net, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_reduce=use_reduce) def test_simple_fc(self): - # use_cuda - self.check_simple_fc_convergence(True) - self.check_simple_fc_convergence(False) + # use_device + self.check_simple_fc_convergence(DeviceType.CUDA) + self.check_simple_fc_convergence(DeviceType.CPU) + self.check_simple_fc_convergence(DeviceType.XPU) def test_simple_fc_with_new_strategy(self): - # use_cuda, use_reduce + # use_device, use_reduce # NOTE: the computation result of nccl_reduce is non-deterministic, # related issue: https://github.com/NVIDIA/nccl/issues/157 - self._compare_reduce_and_allreduce(simple_fc_net, True, 1e-5, 1e-2) - self._compare_reduce_and_allreduce(simple_fc_net, False, 1e-5, 1e-2) + self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CUDA, 1e-5, + 1e-2) + self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CPU, 1e-5, + 1e-2) - def check_simple_fc_parallel_accuracy(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): + def check_simple_fc_parallel_accuracy(self, use_device): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return img, label = self._init_data() @@ -139,13 +150,13 @@ class TestMNIST(TestParallelExecutorBase): method=simple_fc_net, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_parallel_executor=True) self.assertAlmostEquals( @@ -156,33 +167,38 @@ class TestMNIST(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True) - self.check_simple_fc_parallel_accuracy(False) + self.check_simple_fc_parallel_accuracy(DeviceType.CUDA) + self.check_simple_fc_parallel_accuracy(DeviceType.CPU) - def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor): - if use_cuda and not core.is_compiled_with_cuda(): + def check_batchnorm_fc_convergence(self, use_device, use_fast_executor): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): + return + if use_device == DeviceType.XPU and not core.is_compiled_with_xpu(): return - img, label = self._init_data() self.check_network_convergence( fc_with_batchnorm, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_fast_executor=use_fast_executor) def test_batchnorm_fc(self): - for use_cuda in (False, True): + for use_device in (DeviceType.CPU, DeviceType.CUDA): for use_fast_executor in (False, True): 
- self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor) + self.check_batchnorm_fc_convergence(use_device, + use_fast_executor) def test_batchnorm_fc_with_new_strategy(self): # NOTE: the computation result of nccl_reduce is non-deterministic, # related issue: https://github.com/NVIDIA/nccl/issues/157 - self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-2) - self._compare_reduce_and_allreduce(fc_with_batchnorm, False, 1e-5, 1e-2) + self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CUDA, + 1e-5, 1e-2) + self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CPU, + 1e-5, 1e-2) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py index 080c44143a..e07b89f7aa 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py @@ -21,7 +21,7 @@ import os os.environ['FLAGS_enable_parallel_graph'] = str(1) import paddle.fluid.core as core import os -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType from simple_nets import simple_fc_net, init_data @@ -31,8 +31,8 @@ class TestMNIST(TestParallelExecutorBase): os.environ['CPU_NUM'] = str(4) # simple_fc - def check_simple_fc_convergence(self, use_cuda, use_reduce=False): - if use_cuda and not core.is_compiled_with_cuda(): + def check_simple_fc_convergence(self, use_device, use_reduce=False): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return img, label = init_data() @@ -40,15 +40,15 @@ class TestMNIST(TestParallelExecutorBase): simple_fc_net, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_reduce=use_reduce) def test_simple_fc(self): - # use_cuda + # use_device self.check_simple_fc_convergence(True) - def check_simple_fc_parallel_accuracy(self, use_cuda): - if use_cuda and not core.is_compiled_with_cuda(): + def check_simple_fc_parallel_accuracy(self, use_device): + if use_device and not core.is_compiled_with_cuda(): return img, label = init_data() @@ -56,13 +56,13 @@ class TestMNIST(TestParallelExecutorBase): method=simple_fc_net, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_parallel_executor=False) parallel_first_loss, parallel_last_loss = self.check_network_convergence( method=simple_fc_net, feed_dict={"image": img, "label": label}, - use_cuda=use_cuda, + use_device=use_device, use_parallel_executor=True) self.assertAlmostEquals( @@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase): np.mean(parallel_last_loss), single_last_loss, delta=1e-6) def test_simple_fc_parallel_accuracy(self): - self.check_simple_fc_parallel_accuracy(True) + self.check_simple_fc_parallel_accuracy(DeviceType.CUDA) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py index 1205cfcedb..20a5fcb7af 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest import seresnext_net -from seresnext_test_base import TestResnetBase +from 
seresnext_test_base import TestResnetBase, DeviceType from functools import partial @@ -30,7 +30,10 @@ class TestResnetCPU(TestResnetBase): optimizer=seresnext_net.optimizer, use_parallel_executor=False) self._compare_result_with_origin_model( - check_func, use_cuda=False, compare_seperately=False, delta2=1e-3) + check_func, + use_device=DeviceType.CPU, + compare_seperately=False, + delta2=1e-3) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py index eb8cfdd8e6..9d1364cc59 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest import seresnext_net -from seresnext_test_base import TestResnetBase +from seresnext_test_base import TestResnetBase, DeviceType from functools import partial @@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase): optimizer=seresnext_net.optimizer, use_parallel_executor=False) self._compare_result_with_origin_model( - check_func, use_cuda=True, compare_seperately=False) + check_func, use_device=DeviceType.CUDA, compare_seperately=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py index 159686a7cf..0f1a86a83d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py @@ -19,7 +19,7 @@ fluid.core._set_fuse_parameter_memory_size(131072) import unittest import seresnext_net -from seresnext_test_base import TestResnetBase +from seresnext_test_base import TestResnetBase, DeviceType from functools import partial @@ -31,7 +31,8 @@ class TestResnetWithFuseAllReduceCPU(TestResnetBase): self.check_network_convergence, optimizer=seresnext_net.optimizer, fuse_all_reduce_ops=True) - self._compare_result_with_origin_model(check_func, use_cuda=False) + self._compare_result_with_origin_model( + check_func, use_device=DeviceType.CPU) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py index 56fcb7914f..c747591c81 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py @@ -19,7 +19,7 @@ fluid.core._set_fuse_parameter_memory_size(131072) import unittest import seresnext_net -from seresnext_test_base import TestResnetBase +from seresnext_test_base import TestResnetBase, DeviceType from functools import partial @@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase): optimizer=seresnext_net.optimizer, fuse_all_reduce_ops=True) self._compare_result_with_origin_model( - check_func, use_cuda=True, delta2=1e-2) + check_func, use_device=DeviceType.CUDA, delta2=1e-2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py 
b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py index 57ff4890f6..e67934d87f 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py @@ -14,30 +14,30 @@ from __future__ import print_function import unittest -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType import seresnext_net import paddle.fluid.core as core class TestResnetWithReduceBase(TestParallelExecutorBase): - def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5): - if use_cuda and not core.is_compiled_with_cuda(): + def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5): + if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda(): return all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_cuda), - iter=seresnext_net.iter(use_cuda), - batch_size=seresnext_net.batch_size(use_cuda), - use_cuda=use_cuda, + feed_dict=seresnext_net.feed_dict(use_device), + iter=seresnext_net.iter(use_device), + batch_size=seresnext_net.batch_size(use_device), + use_device=use_device, use_reduce=False, optimizer=seresnext_net.optimizer) reduce_first_loss, reduce_last_loss = self.check_network_convergence( seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_cuda), - iter=seresnext_net.iter(use_cuda), - batch_size=seresnext_net.batch_size(use_cuda), - use_cuda=use_cuda, + feed_dict=seresnext_net.feed_dict(use_device), + iter=seresnext_net.iter(use_device), + batch_size=seresnext_net.batch_size(use_device), + use_device=use_device, use_reduce=True, optimizer=seresnext_net.optimizer) @@ -46,25 +46,25 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): for loss in zip(all_reduce_last_loss, reduce_last_loss): self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2) - if not use_cuda: + if not use_device: return all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence( seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_cuda), - iter=seresnext_net.iter(use_cuda), - batch_size=seresnext_net.batch_size(use_cuda), - use_cuda=use_cuda, + feed_dict=seresnext_net.feed_dict(use_device), + iter=seresnext_net.iter(use_device), + batch_size=seresnext_net.batch_size(use_device), + use_device=use_device, use_reduce=False, optimizer=seresnext_net.optimizer, enable_sequential_execution=True) reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_cuda), - iter=seresnext_net.iter(use_cuda), - batch_size=seresnext_net.batch_size(use_cuda), - use_cuda=use_cuda, + feed_dict=seresnext_net.feed_dict(use_device), + iter=seresnext_net.iter(use_device), + batch_size=seresnext_net.batch_size(use_device), + use_device=use_device, use_reduce=True, optimizer=seresnext_net.optimizer, enable_sequential_execution=True) @@ -87,7 +87,8 @@ class TestResnetWithReduceBase(TestParallelExecutorBase): class TestResnetWithReduceCPU(TestResnetWithReduceBase): def test_seresnext_with_reduce(self): - self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3) + self._compare_reduce_and_allreduce( + use_device=DeviceType.CPU, delta2=1e-3) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py 
b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py index f6c868859c..4de1a6092d 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py @@ -14,12 +14,13 @@ from __future__ import print_function import unittest -from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduceBase +from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduceBase, DeviceType class TestResnetWithReduceGPU(TestResnetWithReduceBase): def test_seresnext_with_reduce(self): - self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2) + self._compare_reduce_and_allreduce( + use_device=DeviceType.CUDA, delta2=1e-2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index 2d1e0e9849..1cb39eb131 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -17,7 +17,7 @@ from __future__ import print_function import paddle.fluid as fluid import transformer_model import numpy as np -from parallel_executor_test_base import TestParallelExecutorBase +from parallel_executor_test_base import TestParallelExecutorBase, DeviceType import unittest import paddle import paddle.fluid.core as core @@ -191,16 +191,16 @@ class TestTransformer(TestParallelExecutorBase): if core.is_compiled_with_cuda(): self.check_network_convergence( transformer, - use_cuda=True, + use_device=DeviceType.CUDA, feed_data_reader=get_feed_data_reader()) self.check_network_convergence( transformer, - use_cuda=True, + use_device=DeviceType.CUDA, enable_sequential_execution=True, feed_data_reader=get_feed_data_reader()) self.check_network_convergence( transformer, - use_cuda=False, + use_device=DeviceType.CPU, iter=2, feed_data_reader=get_feed_data_reader()) diff --git a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py index bf3aa33886..b01c7cf179 100755 --- a/python/paddle/fluid/tests/unittests/test_program_prune_backward.py +++ b/python/paddle/fluid/tests/unittests/test_program_prune_backward.py @@ -22,7 +22,7 @@ import paddle.fluid as fluid import paddle.fluid.core as core from simple_nets import init_data, simple_fc_net, fc_with_batchnorm import seresnext_net -from test_parallel_executor_transformer import transformer, get_feed_data_reader +from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType from fake_reader import fake_imdb_reader @@ -219,7 +219,7 @@ class TestProgramPruneBackward(unittest.TestCase): with self.program_scope_guard(): self.check_prune_correctness( method=seresnext_net.model, - feed_dict=seresnext_net.feed_dict(use_cuda=False), + feed_dict=seresnext_net.feed_dict(use_device=DeviceType.CPU), optimizer=seresnext_net.optimizer) def test_transformer(self): diff --git a/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py b/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py new file mode 100644 index 0000000000..57d456d019 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/xpu/test_xpu_place.py @@ -0,0 +1,47 @@ +# copyright (c) 2020 paddlepaddle authors. all rights reserved. 
+# +# licensed under the apache license, version 2.0 (the "license"); +# you may not use this file except in compliance with the license. +# you may obtain a copy of the license at +# +# http://www.apache.org/licenses/license-2.0 +# +# unless required by applicable law or agreed to in writing, software +# distributed under the license is distributed on an "as is" basis, +# without warranties or conditions of any kind, either express or implied. +# see the license for the specific language governing permissions and +# limitations under the license. + +from __future__ import print_function + +import unittest +import os +import paddle +import numpy as np +import paddle.fluid as fluid +from paddle.fluid import core +import paddle.static as static + + +class Test_XPU_Places(unittest.TestCase): + def assert_places_equal(self, places0, places1): + self.assertEqual(len(places0), len(places1)) + for place0, place1 in zip(places0, places1): + self.assertEqual(type(place0), type(place1)) + self.assertEqual(place0.get_device_id(), place1.get_device_id()) + + def test_check_preset_envs(self): + if core.is_compiled_with_xpu(): + os.environ["FLAGS_selected_xpus"] = "0" + place_list = static.xpu_places() + self.assert_places_equal([fluid.XPUPlace(0)], place_list) + + def test_check_no_preset_envs(self): + if core.is_compiled_with_xpu(): + place_list = static.xpu_places(0) + self.assert_places_equal([fluid.XPUPlace(0)], place_list) + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/python/paddle/static/__init__.py b/python/paddle/static/__init__.py index 6778149e2b..d683b4772e 100644 --- a/python/paddle/static/__init__.py +++ b/python/paddle/static/__init__.py @@ -20,7 +20,7 @@ __all__ = [ 'default_main_program', 'default_startup_program', 'Program', 'data', 'InputSpec', 'save', 'load', 'save_inference_model', 'load_inference_model', 'load_program_state', 'set_program_state', 'cpu_places', 'cuda_places', - 'Variable' + 'xpu_places', 'Variable' ] from . import nn @@ -44,6 +44,7 @@ from ..fluid.framework import name_scope #DEFINE_ALIAS from ..fluid.framework import program_guard #DEFINE_ALIAS from ..fluid.framework import cpu_places #DEFINE_ALIAS from ..fluid.framework import cuda_places #DEFINE_ALIAS +from ..fluid.framework import xpu_places #DEFINE_ALIAS from ..fluid.framework import Variable #DEFINE_ALIAS from ..fluid.layers.control_flow import Print #DEFINE_ALIAS from ..fluid.layers.nn import py_func #DEFINE_ALIAS diff --git a/python/setup.py.in b/python/setup.py.in index ffd46c9d5f..f43a97bff3 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -321,6 +321,10 @@ if '${WITH_XPU}' == 'ON': package_data['paddle.libs']+=['${XPU_API_LIB_NAME}', '${XPU_RT_LIB_NAME}'] +if '${WITH_XPU_BKCL}' == 'ON': + shutil.copy('${XPU_BKCL_LIB}', libs_path) + package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}'] + # copy libfuild_framework.so to libs if os.name != 'nt' and sys.platform != 'darwin': paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}' diff --git a/tools/wlist.json b/tools/wlist.json index a51ac905e6..f907d60989 100644 --- a/tools/wlist.json +++ b/tools/wlist.json @@ -413,7 +413,8 @@ "CRFDecoding.forward", "SequenceTagging.forward", "XPUPlace", - "is_compiled_with_xpu" + "is_compiled_with_xpu", + "xpu_places" ], "gpu_not_white":[ "deformable_conv", -- GitLab
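The Python-side changes in this patch replace the boolean use_cuda switch in the ParallelExecutor test utilities with a DeviceType enum, so a single test body can cover CPU, CUDA and XPU and skip cleanly when a backend is not compiled in. The sketch below is not part of the patch; it only illustrates the intended calling pattern, assuming the test-suite helpers touched here (TestParallelExecutorBase, DeviceType, check_network_convergence, simple_fc_net, init_data) are importable from the unittests directory as in the patched files:

import unittest

import paddle
import paddle.fluid.core as core
from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
from simple_nets import init_data, simple_fc_net


class TestSimpleFCOnDevice(TestParallelExecutorBase):
    def _check(self, use_device):
        # Skip when the requested backend is not compiled in, mirroring the
        # guards used throughout the patched tests.
        if use_device == DeviceType.CUDA and not core.is_compiled_with_cuda():
            return
        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
            return
        img, label = init_data()
        self.check_network_convergence(
            simple_fc_net,
            feed_dict={"image": img,
                       "label": label},
            use_device=use_device)

    def test_simple_fc(self):
        # One loop replaces the old pair of use_cuda=True/False calls.
        for dev in (DeviceType.CPU, DeviceType.CUDA, DeviceType.XPU):
            self._check(dev)


if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()

As in the patched tests, each device is guarded by the matching is_compiled_with_* check, which is why the enum rather than a second boolean flag keeps the test bodies unchanged across backends.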
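The patch also exports xpu_places through paddle.static and covers it with the new xpu/test_xpu_place.py. A minimal usage sketch, assuming an XPU-enabled build; the FLAGS_selected_xpus value and the printed output are illustrative only:

import os

import paddle
from paddle.fluid import core

paddle.enable_static()

if core.is_compiled_with_xpu():
    # With FLAGS_selected_xpus set, xpu_places() derives the device list from
    # the flag; passing an explicit id, e.g. xpu_places(0), bypasses it.
    os.environ.setdefault("FLAGS_selected_xpus", "0")
    places = paddle.static.xpu_places()
    print([(type(p).__name__, p.get_device_id()) for p in places])
    # expected on a single-card setup: [('XPUPlace', 0)]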