From cb636a48f8a17a23fa6ab753946252bec92ce940 Mon Sep 17 00:00:00 2001
From: jianghaicheng
Date: Thu, 9 Dec 2021 13:07:27 +0800
Subject: [PATCH] add ipu device p2 (#37840)

---
 .../accumulation/gradient_accumulation.cc     |  16 ++
 paddle/fluid/framework/dlpack_tensor.cc       |   5 +
 paddle/fluid/framework/executor.cc            |   8 +
 paddle/fluid/framework/op_registry.h          |   3 +
 paddle/fluid/framework/tensor_util.cc         |  51 ++++-
 .../fluid/imperative/gradient_accumulator.cc  |   7 +
 .../allocation/naive_best_fit_allocator.cc    |  28 +++
 paddle/fluid/operators/ipu_runtime_op.cc      |  62 ++++++
 paddle/fluid/operators/ipu_runtime_op.h       |  69 ++++++
 paddle/fluid/operators/math/math_function.cc  |   7 +
 paddle/fluid/platform/device_context.cc       |  32 ++-
 paddle/fluid/platform/place.cc                |   7 +
 paddle/fluid/platform/place.h                 |  42 +++-
 paddle/fluid/pybind/pybind.cc                 | 201 +++++++++++++++++-
 paddle/fluid/pybind/tensor_py.h               |  15 ++
 python/paddle/__init__.py                     |   2 +
 python/paddle/device/__init__.py              |  49 ++++-
 python/paddle/fluid/__init__.py               |   8 +-
 python/paddle/framework/__init__.py           |   1 +
 19 files changed, 599 insertions(+), 14 deletions(-)
 create mode 100644 paddle/fluid/operators/ipu_runtime_op.cc
 create mode 100644 paddle/fluid/operators/ipu_runtime_op.h

diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
index 9d475d96e56..723bf5387c6 100644
--- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc
+++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
@@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #endif
 
+#ifdef PADDLE_WITH_IPU
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#else
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+#endif
+
   void operator()(const paddle::platform::NPUPinnedPlace& place) {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index 71b53b8a518..5e450234c40 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
     return device;
   }
 
+  inline ::DLDevice operator()(const platform::IPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::IPUPlace is not supported"));
+  }
+
   inline ::DLDevice operator()(const platform::XPUPlace &place) const {
     PADDLE_THROW(
         platform::errors::Unimplemented("platform::XPUPlace is not supported"));
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 93f4f8952fc..9e572614779 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #else
     PADDLE_THROW(
         platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
+#endif
+  } else if (platform::is_ipu_place(place_)) {
+#ifdef PADDLE_WITH_IPU
+    gc.reset(new IPUGarbageCollector(
+        BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
+#else
+    PADDLE_THROW(
+        platform::errors::Unimplemented("No IPU gc found in CPU/GPU paddle"));
 #endif
   } else if (platform::is_npu_place(place_)) {
 #ifdef PADDLE_WITH_ASCEND_CL
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 348ca5b952b..39496cb2677 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
         platform::errors::Unimplemented("Not supported on place (%s) ", npu));
     // return GetResultHelper(out, npu);
   }
+  bool GetResult(const framework::Tensor& out,
+                 const platform::IPUPlace& ipu) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
+  }
 
   bool GetResult(const framework::Tensor& out,
                  const platform::NPUPinnedPlace& cpu) const {
@@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   void VisitorImpl(const platform::XPUPlace& xpu) const {
     PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
  }
+  void VisitorImpl(const platform::IPUPlace& ipu) const {
+    PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
+  }
 
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index d95c78c5db8..6aad54fba86 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
         "is not supported in imperative mode",
         place));
   }
+  // gradient accumulation is likewise not supported on IPUPlace
+  void operator()(const platform::IPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
 
  private:
   int64_t numel_;
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 6de32335c62..41dcf277d7a 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -116,6 +116,34 @@ size_t Used(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+// For Graphcore IPU
+template <>
+void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  VLOG(10) << "IPUPlace, Allocate on cpu.";
+
+  void *p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem) {
+    memset(p, 0xEF, size);
+  }
+  VLOG(10) << "  pointer=" << p;
+  return p;
+}
+template <>
+void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
+                              size_t size) {
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetCPUBuddyAllocator()->Free(p);
+}
+template <>
+uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
+  return GetCPUBuddyAllocator()->Release();
+}
+template <>
+size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
 // For kunlun XPU
 template <>
 void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
diff --git a/paddle/fluid/operators/ipu_runtime_op.cc b/paddle/fluid/operators/ipu_runtime_op.cc
new file mode 100644
index 00000000000..4b473da00f3
--- /dev/null
+++ b/paddle/fluid/operators/ipu_runtime_op.cc
@@ -0,0 +1,62 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/ipu_runtime_op.h"
+
+namespace paddle {
+namespace operators {
+
+class IpuRuntimeOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {}
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    return framework::OpKernelType(
+        framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
+        ctx.device_context());
+  }
+};
+
+class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("FeedList", "FeedList of Graph").AsDuplicable();
+    AddOutput("FetchList", "FetchList of Graph").AsDuplicable();
+    AddAttr<int>("dtype",
+                 "(int, default 5 (FP32)) "
+                 "Output data type")
+        .SetDefault(framework::proto::VarType::FP32);
+    AddComment(R"DOC(
+Run graph by PopART runtime.
+
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker);
+
+REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel<float>,
+                       ops::IpuRuntimeKernel<double>,
+                       ops::IpuRuntimeKernel<int>,
+                       ops::IpuRuntimeKernel<int64_t>,
+                       ops::IpuRuntimeKernel<bool>,
+                       ops::IpuRuntimeKernel<int8_t>,
+                       ops::IpuRuntimeKernel<paddle::platform::float16>);
diff --git a/paddle/fluid/operators/ipu_runtime_op.h b/paddle/fluid/operators/ipu_runtime_op.h
new file mode 100644
index 00000000000..b6fc9ae9889
--- /dev/null
+++ b/paddle/fluid/operators/ipu_runtime_op.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_IPU
+#include "paddle/fluid/framework/ipu/ipu_backend.h"
+#include "paddle/fluid/framework/tensor.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class IpuRuntimeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#ifdef PADDLE_WITH_IPU
+    auto ipu_backend = framework::ipu::IpuBackend::GetInstance();
+    if (!ipu_backend->DeviceIsAttached()) {
+      const platform::IPUDeviceContext& ipu_ctx =
+          reinterpret_cast<const platform::IPUDeviceContext&>(
+              ctx.device_context());
+      ipu_backend->AttachDevice(ipu_ctx.DeviceId());
+    }
+
+    auto inputs = ctx.MultiInput<framework::Tensor>("FeedList");
+    auto outputs = ctx.MultiOutput<framework::Tensor>("FetchList");
+    auto output_names = ctx.OutputNames("FetchList");
+    VLOG(4) << "IpuRuntime Kernel, begin to run graph";
+    ipu_backend->Run(inputs, outputs, ctx);
+
+    // post-run
+    // resize tensor when tensor.dims() is empty
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      auto* out = outputs[i];
+      if (out->dims().size() == 0) {
+        auto tensor_dtype = out->type();
+        auto sizeof_dtype = framework::SizeOfType(tensor_dtype);
+        int64_t dim = out->memory_size() / sizeof_dtype;
+        out->Resize({dim});
+        VLOG(10) << "set ipu_runtime_op output: " << output_names[i]
+                 << " dims from () to: "
+                 << "(" << dim << ")";
+      }
+    }
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Please compile WITH_IPU option to enable ipu_runtime op"));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 4201af18ca7..daa4efa02ac 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -173,6 +173,13 @@ void set_constant_with_place(
       platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
 }
 
+template <>
+void set_constant_with_place<platform::IPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a0c9ff09460..206bef12aac 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -16,6 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#ifdef PADDLE_WITH_IPU
+#include "paddle/fluid/platform/ipu/ipu_backend.h"
+#endif
 #include "glog/logging.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   if (it == device_contexts_.end()) {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Place %s is not supported. Please check that your paddle compiles "
-        "with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that "
-        "your train process set the correct device id if you use Executor.",
+        "with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check "
+        "that your train process set the correct device id if you use "
+        "Executor.",
         place));
   }
   return it->second.get().get();
@@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool(
       PADDLE_THROW(
          platform::errors::Unimplemented("XPUPlace is not supported. Please "
Please " "re-compile with WITH_XPU option.")); +#endif + } else if (platform::is_ipu_place(p)) { +#ifdef PADDLE_WITH_IPU + EmplaceDeviceContext(&device_contexts_, p); +#else + PADDLE_THROW( + platform::errors::Unimplemented("IPUPlace is not supported. Please " + "re-compile with WITH_IPU option.")); #endif } else if (platform::is_npu_place(p)) { #ifdef PADDLE_WITH_ASCEND_CL @@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const { Place CPUDeviceContext::GetPlace() const { return place_; } +#ifdef PADDLE_WITH_IPU +IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) { + int id = place.GetDeviceId(); + std::shared_ptr ipu_backend = + platform::ipu::IpuBackend::GetInstance(); + device_ = ipu_backend->GetDevice(id); +} + +Place IPUDeviceContext::GetPlace() const { return place_; } +void IPUDeviceContext::Wait() const { + /*! \brief Wait for all operations completion in the stream. */ +} + +IPUDeviceContext::~IPUDeviceContext() {} + +#endif #ifdef PADDLE_WITH_XPU XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc index 415babc9cb8..ec49134b654 100644 --- a/paddle/fluid/platform/place.cc +++ b/paddle/fluid/platform/place.cc @@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> { void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; } void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; } void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; } + void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; } void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; } private: @@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) { return boost::apply_visitor(IsNPUPlace(), p); } +bool is_ipu_place(const Place &p) { + return boost::apply_visitor(IsIPUPlace(), p); +} + bool is_cpu_place(const Place &p) { return boost::apply_visitor(IsCPUPlace(), p); } @@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) { return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2); } else if (is_npu_place(p1)) { return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2); + } else if (is_ipu_place(p1)) { + return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2); } else { return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2); } diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h index 1ab2a623911..fadc1e27e8a 100644 --- a/paddle/fluid/platform/place.h +++ b/paddle/fluid/platform/place.h @@ -95,12 +95,25 @@ struct NPUPinnedPlace { inline bool operator!=(const NPUPinnedPlace &) const { return false; } inline bool operator<(const NPUPinnedPlace &) const { return false; } }; +struct IPUPlace { + IPUPlace() : IPUPlace(0) {} + explicit IPUPlace(int d) : device(d) {} + + inline int GetDeviceId() const { return device; } + // needed for variant equality comparison + inline bool operator==(const IPUPlace &o) const { return device == o.device; } + inline bool operator!=(const IPUPlace &o) const { return !(*this == o); } + inline bool operator<(const IPUPlace &o) const { return device < o.device; } + + int device; +}; struct IsCUDAPlace : public boost::static_visitor { bool operator()(const CPUPlace &) const { return false; } bool operator()(const XPUPlace &) const { return false; } bool operator()(const NPUPlace &) const { return false; } bool operator()(const NPUPinnedPlace &) const { return false; 
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
 
@@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return true; }
   bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
@@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return true; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
 
@@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return true; }
+};
+struct IsIPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return true; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
 };
 
 class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
-                                    CUDAPinnedPlace, NPUPinnedPlace> {
+                                    CUDAPinnedPlace, NPUPinnedPlace, IPUPlace> {
  private:
   using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
-                                   CUDAPinnedPlace, NPUPinnedPlace>;
+                                   CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>;
 
  public:
   Place() = default;
   Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {}    // NOLINT
   Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {}    // NOLINT
   Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {}    // NOLINT
+  Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {}    // NOLINT
   Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {}  // NOLINT
   Place(const CUDAPinnedPlace &cuda_pinned_place)  //  NOLINT
       : PlaceBase(cuda_pinned_place) {}
 
@@ -180,6 +208,7 @@ using PlaceList = std::vector<Place>;
 bool is_gpu_place(const Place &);
 bool is_xpu_place(const Place &);
 bool is_npu_place(const Place &);
+bool is_ipu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);
 bool is_npu_pinned_place(const Place &);
@@ -228,6 +257,15 @@ struct PlaceVisitorWrapper
     return typename Visitor::result_type();
 #endif
   }
+  typename Visitor::result_type operator()(const IPUPlace &ipu) const {
+#ifdef PADDLE_WITH_IPU
+    return visitor_(ipu);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Paddle is not compiled with IPU. Cannot visit ipu device"));
+    return typename Visitor::result_type();
+#endif
+  }
 
   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a93ddb1a22f..c5277a42103 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -132,6 +132,10 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 
+#ifdef PADDLE_WITH_IPU
+#include "paddle/fluid/platform/ipu/ipu_backend.h"
+#include "paddle/fluid/platform/ipu_info.h"
+#endif
 
 #ifdef PADDLE_WITH_CRYPTO
 #include "paddle/fluid/pybind/crypto.h"
@@ -201,6 +205,14 @@ bool IsCompiledWithNPU() {
 #endif
 }
 
+bool IsCompiledWithIPU() {
+#ifndef PADDLE_WITH_IPU
+  return false;
+#else
+  return true;
+#endif
+}
+
 bool IsCompiledWithMKLDNN() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
@@ -816,6 +828,8 @@ PYBIND11_MODULE(core_noavx, m) {
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<platform::NPUPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
+      .def("set", SetTensorFromPyArray<platform::IPUPlace>,
+           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<platform::CUDAPinnedPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
            R"DOC(
@@ -823,7 +837,7 @@ PYBIND11_MODULE(core_noavx, m) {
 
         Args:
           lod (numpy.ndarray): The data to set.
-          place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
+          place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
           LoDTensor is to be set.
           zero_copy (bool, optional): Whether to share memory with the input numpy array.
           This parameter only works with CPUPlace. Default: False.
@@ -1913,6 +1927,58 @@ All parameter, weight, gradient are variables in Paddle.
            [](const platform::NPUPlace &self) { return self.GetDeviceId(); })
       .def("__str__", string::to_string<const platform::NPUPlace &>);
 
+  // IPUPlace
+  py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
+    IPUPlace is a descriptor of a device.
+    It represents an IPU device on which a tensor will be allocated and a model will run.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+
+          # required: ipu
+
+          ipu_place = paddle.IPUPlace()
+
+        )DOC")
+      .def("__init__",
+           [](platform::IPUPlace &self) {
+#ifdef PADDLE_WITH_IPU
+             if (platform::GetIPUDeviceCount() == 0) {
+               LOG(ERROR) << "Cannot use IPU because there is no IPU "
+                             "detected on your "
+                             "machine.";
+               std::exit(-1);
+             }
+             // compile with ipu(0); at run time, use the number of IPUs the
+             // user configures for sharding and pipelining.
+             new (&self) platform::IPUPlace(0);
+#else
+             LOG(ERROR) << string::Sprintf(
+                 "Cannot use IPU because you didn't install the IPU version "
+                 "of PaddlePaddle.\n"
+                 "If you want to use IPU, please try to install the IPU "
+                 "version of PaddlePaddle by: pip install paddlepaddle*\n"
+                 "If you only have CPU, please change IPUPlace to be "
+                 "CPUPlace().\n");
+             std::exit(-1);
+#endif
+           })
+      .def("_type", &PlaceIndex<platform::IPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
+#ifdef PADDLE_WITH_IPU
+      .def("get_device_id",
+           [](const platform::IPUPlace &self) { return self.GetDeviceId(); })
+#endif
+      .def("__str__", string::to_string<const platform::IPUPlace &>);
+
   py::class_<platform::Place> platformplace(m, "Place");
   g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
   platformplace.def(py::init<>())
@@ -1922,6 +1988,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
      .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
@@ -1931,6 +1998,8 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) { return platform::is_xpu_place(self); })
       .def("is_npu_place",
            [](platform::Place &self) { return platform::is_npu_place(self); })
+      .def("is_ipu_place",
+           [](platform::Place &self) { return platform::is_ipu_place(self); })
       .def("is_cuda_pinned_place",
            [](platform::Place &self) {
              return platform::is_cuda_pinned_place(self);
@@ -1947,6 +2016,10 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) {
              return BOOST_GET_CONST(platform::NPUPlace, self).device;
            })
+      .def("ipu_device_id",
+           [](platform::Place &self) {
+             return BOOST_GET_CONST(platform::IPUPlace, self).device;
+           })
       .def("set_place", [](platform::Place &self,
                            const platform::Place &other) { self = other; })
       .def("set_place",
@@ -1970,6 +2043,10 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self, const platform::NPUPlace &npu_place) {
              self = npu_place;
            })
+      .def("set_place",
+           [](platform::Place &self, const platform::IPUPlace &ipu_place) {
+             self = ipu_place;
+           })
       .def("__repr__", string::to_string<const platform::Place &>)
       .def("__str__", string::to_string<const platform::Place &>);
 
@@ -2201,6 +2278,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
   m.def("is_compiled_with_npu", IsCompiledWithNPU);
+  m.def("is_compiled_with_ipu", IsCompiledWithIPU);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("is_compiled_with_cinn", IsCompiledWithCINN);
@@ -2520,6 +2598,10 @@ All parameter, weight, gradient are variables in Paddle.
       });
 #endif
 
+#ifdef PADDLE_WITH_IPU
+  m.def("get_ipu_device_count", platform::GetIPUDeviceCount);
+#endif
+
   py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
       .value("kDefault", platform::TracerOption::kDefault)
       .value("kOpDetail", platform::TracerOption::kOpDetail)
@@ -2597,6 +2679,11 @@ All parameter, weight, gradient are variables in Paddle.
                          bool val) { self.Set(name, new bool(val)); })
       .def("set", [](ir::Pass &self, const std::string &name,
                      int val) { self.Set(name, new int(val)); })
+      .def("set",
+           [](ir::Pass &self, const std::string &name,
+              std::vector<std::string> set) {
+             self.Set(name, new std::vector<std::string>(set));
+           })
       .def("set",
            [](ir::Pass &self, const std::string &name,
               std::unordered_set<std::string> set) {
@@ -3429,6 +3516,118 @@ All parameter, weight, gradient are variables in Paddle.
            })
       .def("device_count", &ParallelExecutor::DeviceCount);
 
+#ifdef PADDLE_WITH_IPU
+  py::class_<platform::ipu::IpuBackend,
+             std::shared_ptr<platform::ipu::IpuBackend>>(m, "IpuBackend")
+      .def(py::init(&platform::ipu::IpuBackend::GetNewInstance))
+      .def("clear", &platform::ipu::IpuBackend::Clear)
+      .def("set_scope", &platform::ipu::IpuBackend::SetScope)
+      .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy);
+
+  py::class_<platform::ipu::IpuStrategy>(m, "IpuStrategy")
+      .def(py::init<>())
+      .def_property(
+          "num_ipus",
+          [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; },
+          [](platform::ipu::IpuStrategy &self, int num_ipus) {
+            self.num_ipus = num_ipus;
+          },
+          R"DOC(
+            Int type, set the number of IPUs we need. Default 1.
+            )DOC")
+      .def_property(
+          "accumulationFactor",
+          [](const platform::ipu::IpuStrategy &self) {
+            return self.popart_options_.accumulationFactor;
+          },
+          [](platform::ipu::IpuStrategy &self, int accumulationFactor) {
+            self.popart_options_.accumulationFactor = accumulationFactor;
+          },
+          R"DOC(
+            Specify the number of micro-batches to accumulate before
+            applying the varUpdate. Default 1.
+            )DOC")
+      .def_property("batches_per_step",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.batches_per_step;
+                    },
+                    [](platform::ipu::IpuStrategy &self, int batches_per_step) {
+                      self.batches_per_step = batches_per_step;
+                    },
+                    R"DOC(
+                      Int type, set batches_per_step. Default 1.
+                      )DOC")
+      .def_property("is_training",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.is_training;
+                    },
+                    [](platform::ipu::IpuStrategy &self, bool is_training) {
+                      self.is_training = is_training;
+                    },
+                    R"DOC(
+                      Bool type, True for training, False for inference. Default True.
+                      )DOC")
+      .def_property(
+          "enable_pipelining",
+          [](const platform::ipu::IpuStrategy &self) {
+            return self.popart_options_.enablePipelining;
+          },
+          [](platform::ipu::IpuStrategy &self, bool enable_pipelining) {
+            self.popart_options_.enablePipelining = enable_pipelining;
+          },
+          R"DOC(
+            Bool type, True to enable pipelining, otherwise disable. Default False.
+            )DOC")
+      .def_property(
+          "enable_manual_shard",
+          [](const platform::ipu::IpuStrategy &self) {
+            return self.popart_options_.virtualGraphMode ==
+                   platform::ipu::VirtualGraphMode::Manual;
+          },
+          [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) {
+            if (enable_ipu_shard) {
+              self.popart_options_.virtualGraphMode =
+                  platform::ipu::VirtualGraphMode::Manual;
+            } else {
+              self.popart_options_.virtualGraphMode =
+                  platform::ipu::VirtualGraphMode::Off;
+            }
+          },
+          R"DOC(
+            Bool type, True to enable model sharding, otherwise disable. Default False.
+            )DOC")
+      .def_property("need_avg_shard",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.need_avg_shard;
+                    },
+                    [](platform::ipu::IpuStrategy &self, bool need_avg_shard) {
+                      self.need_avg_shard = need_avg_shard;
+                    },
+                    R"DOC(
+                      Bool type, True to enable avg shard, otherwise disable. Default False.
+ )DOC") + .def_property("batch_size", + [](const platform::ipu::IpuStrategy &self) { + return self.batch_size; + }, + [](platform::ipu::IpuStrategy &self, int batch_size) { + self.batch_size = batch_size; + }, + R"DOC( + Int type, used to make batch size fixed. Default 1. + )DOC") + .def_property("enable_fp16", + [](const platform::ipu::IpuStrategy &self) { + return self.enable_fp16; + }, + [](platform::ipu::IpuStrategy &self, bool enable_fp16) { + self.enable_fp16 = enable_fp16; + }, + R"DOC( + Bool type, True enable float16 mode, otherwise disable. Default False.)DOC"); +#endif + BindFleetWrapper(&m); BindIO(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c5d0afb9a17..935a6437338 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -313,6 +313,21 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (paddle::platform::is_ipu_place(place)) { +#ifdef PADDLE_WITH_IPU + if (zero_copy) { + auto holder = std::make_shared>(array); + auto type = framework::ToDataType(std::type_index(typeid(T))); + self->ResetHolderWithType(holder, type); + } else { + auto dst = self->mutable_data(place); + std::memcpy(dst, array.data(), array.nbytes()); + } +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 44afeecec32..da32aab839c 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -260,6 +260,7 @@ from .framework.random import set_cuda_rng_state # noqa: F401 from .framework import ParamAttr # noqa: F401 from .framework import create_parameter # noqa: F401 from .framework import CPUPlace # noqa: F401 +from .framework import IPUPlace # noqa: F401 from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 @@ -291,6 +292,7 @@ from .fluid.framework import get_flags # noqa: F401 from .fluid.framework import set_flags # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 +from .device import is_compiled_with_ipu # noqa: F401 from .device import XPUPlace # noqa: F401 from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 95402898589..0a11d59d69c 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -28,7 +28,9 @@ __all__ = [ # noqa 'set_device', 'get_device', 'XPUPlace', + 'IPUPlace', 'is_compiled_with_xpu', + 'is_compiled_with_ipu', 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', @@ -55,6 +57,36 @@ def is_compiled_with_npu(): return core.is_compiled_with_npu() +def is_compiled_with_ipu(): + """ + Whether paddle was built with WITH_IPU=ON to support Graphcore IPU. + + Returns (bool): `True` if IPU is supported, otherwise `False`. + + Examples: + .. code-block:: python + + import paddle + support_ipu = paddle.is_compiled_with_ipu() + """ + return core.is_compiled_with_ipu() + + +def IPUPlace(): + """ + Return a Graphcore IPU Place + + Examples: + .. 
+        .. code-block:: python
+
+            # required: ipu
+
+            import paddle
+            place = paddle.device.IPUPlace()
+    """
+    return core.IPUPlace()
+
+
 def is_compiled_with_xpu():
     """
     Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
@@ -143,13 +175,19 @@ def _convert_to_place(device):
         selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
         device_id = int(selected_npus[0])
         place = core.NPUPlace(device_id)
+    elif lower_device == 'ipu':
+        if not core.is_compiled_with_ipu():
+            raise ValueError(
+                "The device should not be 'ipu', " \
+                "since PaddlePaddle is not compiled with IPU")
+        place = core.IPUPlace()
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
         avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device:
             raise ValueError(
-                "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'"
+                "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or 'ipu'"
             )
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
@@ -183,13 +221,13 @@ def _convert_to_place(device):
 
 def set_device(device):
     """
-    Paddle supports running calculations on various types of devices, including CPU, GPU, XPU and NPU.
+    Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU.
     They are represented by string identifiers. This function can specify the global device
     which the OP will run.
 
     Parameters:
         device(str): This parameter determines the specific running device.
-            It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x`` and ``npu:x``,
+            It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``,
             where ``x`` is the index of the GPUs, XPUs or NPUs.
 
     Examples:
@@ -236,5 +274,10 @@ def get_device():
     elif isinstance(place, core.NPUPlace):
         device_id = place.get_device_id()
         device = 'npu:' + str(device_id)
+    elif isinstance(place, core.IPUPlace):
+        num_devices = core.get_ipu_device_count()
+        device = "ipus:{{0-{}}}".format(num_devices - 1)
+    else:
+        raise ValueError("The device specification {} is invalid".format(place))
 
     return device
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5482413dbbc..d8ee875e768 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -71,7 +71,7 @@ from . import distribute_lookup_table
 from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
 from .core import LoDTensor, LoDTensorArray, Scope, _Scope
-from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
+from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace
 from .incubate import fleet
 from .transpiler import DistributeTranspiler, \
     memory_optimize, release_memory, DistributeTranspilerConfig
@@ -132,6 +132,7 @@ __all__ = framework.__all__ + executor.__all__ + \
     'CUDAPlace',
     'CUDAPinnedPlace',
     'NPUPlace',
+    'IPUPlace',
     'Tensor',
     'ParamAttr',
     'WeightNormParamAttr',
@@ -197,6 +198,11 @@ def __bootstrap__():
         if os.name == 'nt':
             remove_flag_if_exists('cpu_deterministic')
 
+        if core.is_compiled_with_ipu():
+            # Currently we request all available IPUs for training and testing;
+            # finer-grained control over IPU pods will be added later.
+            read_env_flags += []
+
         core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
         # Note(zhouwei25): sys may not have argv in some cases,
         # Such as: use Python/C API to call Python from C++
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index ce84fb739c0..722003c0340 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -23,6 +23,7 @@ from .framework import set_grad_enabled  # noqa: F401
 from ..fluid.param_attr import ParamAttr  # noqa: F401
 from ..fluid.layers.tensor import create_parameter  # noqa: F401
 from ..fluid.core import CPUPlace  # noqa: F401
+from ..fluid.core import IPUPlace  # noqa: F401
 from ..fluid.core import CUDAPlace  # noqa: F401
 from ..fluid.core import CUDAPinnedPlace  # noqa: F401
 from ..fluid.core import NPUPlace  # noqa: F401
--
GitLab
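
Usage sketch for the user-facing device API wired up by this patch (paddle.is_compiled_with_ipu, paddle.device.set_device / get_device, and paddle.IPUPlace). This is illustrative only and assumes an IPU build of Paddle (WITH_IPU=ON) with at least one IPU attached; on other builds is_compiled_with_ipu() returns False and the IPU branch is skipped:

    import paddle

    if paddle.is_compiled_with_ipu():
        # 'ipu' is now a valid identifier in _convert_to_place()
        paddle.device.set_device('ipu')
        # get_device() reports the visible IPU range, e.g. "ipus:{0-7}"
        print(paddle.device.get_device())
        # IPUPlace() always wraps ipu(0); the number of IPUs actually used
        # comes from the sharding/pipelining configuration
        place = paddle.IPUPlace()
    else:
        place = paddle.CPUPlace()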
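
A sketch of driving the IpuStrategy and IpuBackend bindings added in pybind.cc. It assumes an IPU build, where both classes are exposed on paddle.fluid.core; the attribute names follow the def_property bindings above:

    from paddle.fluid import core

    strategy = core.IpuStrategy()
    strategy.num_ipus = 2                # number of IPUs to request
    strategy.is_training = False         # True for training, False for inference
    strategy.batches_per_step = 4
    strategy.enable_manual_shard = True  # popart VirtualGraphMode::Manual
    strategy.enable_pipelining = True    # popart enablePipelining

    backend = core.IpuBackend()          # py::init routes to GetNewInstance
    backend.set_ipu_strategy(strategy)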
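
Finally, a sketch of feeding data through the new LoDTensor.set overload for IPUPlace (tensor_py.h); again this assumes an IPU build. Note that without zero_copy the numpy array is memcpy'd into CPU-side memory, since naive_best_fit_allocator.cc backs IPUPlace with the CPU buddy allocator:

    import numpy as np
    import paddle.fluid as fluid

    t = fluid.LoDTensor()
    t.set(np.ones([2, 3], dtype=np.float32), fluid.IPUPlace())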