diff --git a/paddle/fluid/eager/accumulation/gradient_accumulation.cc b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
index 9d475d96e56ce0d06768568f159a4c7630b5bea4..723bf5387c60a9552f74aecc378b0cafed957d96 100644
--- a/paddle/fluid/eager/accumulation/gradient_accumulation.cc
+++ b/paddle/fluid/eager/accumulation/gradient_accumulation.cc
@@ -116,6 +116,16 @@ class TensorAddFunctor : public boost::static_visitor<> {
   }
 #endif
 
+  // Gradient accumulation on IPUPlace is not supported in imperative mode,
+  // whether or not Paddle is built with IPU support, so one unconditional
+  // overload is enough here.
+  void operator()(const paddle::platform::IPUPlace& place) {
+    PADDLE_THROW(paddle::platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
+
   void operator()(const paddle::platform::NPUPinnedPlace& place) {
     PADDLE_THROW(paddle::platform::errors::PermissionDenied(
         "Gradient accumulation on place (%s) "
diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc
index 71b53b8a51882fbb3a130737e5b80a5460bad2cb..5e450234c405cd9a9ade2e89978ce9566e4d8d67 100644
--- a/paddle/fluid/framework/dlpack_tensor.cc
+++ b/paddle/fluid/framework/dlpack_tensor.cc
@@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
     return device;
   }
 
+  inline ::DLDevice operator()(const platform::IPUPlace &place) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("platform::IPUPlace is not supported"));
+  }
+
   inline ::DLDevice operator()(const platform::XPUPlace &place) const {
     PADDLE_THROW(
         platform::errors::Unimplemented("platform::XPUPlace is not supported"));
diff --git a/paddle/fluid/framework/executor.cc b/paddle/fluid/framework/executor.cc
index 93f4f8952fc675022ce6e142ac10b194958bd238..9e572614779916bba54dd354c6039c7741dab8bc 100644
--- a/paddle/fluid/framework/executor.cc
+++ b/paddle/fluid/framework/executor.cc
@@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
 #else
     PADDLE_THROW(
         platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
+#endif
+  } else if (platform::is_ipu_place(place_)) {
+#ifdef PADDLE_WITH_IPU
+    gc.reset(new IPUGarbageCollector(
+        BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
+#else
+    PADDLE_THROW(
+        platform::errors::Unimplemented("No IPU gc found in CPU/GPU paddle"));
 #endif
   } else if (platform::is_npu_place(place_)) {
 #ifdef PADDLE_WITH_ASCEND_CL
diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h
index 348ca5b952bfeab364a5b01ec99e4d0381ab4e84..39496cb26776e765f9675e5041cd13104d374c6a 100644
--- a/paddle/fluid/framework/op_registry.h
+++ b/paddle/fluid/framework/op_registry.h
@@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx {
+#define REGISTER_OP_IPU_KERNEL(op_type, ...)                      \
+  REGISTER_OP_KERNEL(op_type, IPU, ::paddle::platform::IPUPlace,  \
+                     __VA_ARGS__)
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ ... @@
         platform::errors::Unimplemented("Not supported on place (%s) ", npu));
     // return GetResultHelper(out, npu);
   }
+  bool GetResult(const framework::Tensor& out,
+                 const platform::IPUPlace& ipu) const {
+    PADDLE_THROW(
+        platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
+  }
 
   bool GetResult(const framework::Tensor& out,
                  const platform::NPUPinnedPlace& cpu) const {
@@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> {
   void VisitorImpl(const platform::XPUPlace& xpu) const {
     PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
   }
+  void VisitorImpl(const platform::IPUPlace& ipu) const {
+    PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
+  }
 
   void VisitorImpl(const platform::CUDAPlace& gpu) const {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
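A note on what the visitors above serve: TensorAddFunctor backs imperative-mode gradient accumulation, and the GetResult/BothFalseVisitor overloads in framework/tensor_util.cc back the tensor NaN/Inf checks, so on IPUPlace each of these now fails loudly instead of silently doing nothing. The user-facing API these checks ultimately feed can be exercised on CPU; a small illustrative Python snippet (assuming paddle.isfinite is the public entry point for these checks):

    import paddle

    x = paddle.to_tensor([1.0, float('inf'), float('nan')])
    print(paddle.isfinite(x))  # [True, False, False]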
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index d95c78c5db8a722e42ac63443f24b887afefcb31..6aad54fba86e481937f0462aef4cbbc35932f023 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
         "is not supported in imperative mode",
         place));
   }
+  // Gradient accumulation is not supported on IPUPlace
+  void operator()(const platform::IPUPlace& place) {
+    PADDLE_THROW(platform::errors::PermissionDenied(
+        "Gradient accumulation on place (%s) "
+        "is not supported in imperative mode",
+        place));
+  }
 
  private:
   int64_t numel_;
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 6de32335c62b220592d8aebc2e9f6051e741fa4c..41dcf277d7a11e6fa1b90e103b148abffa28704a 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -116,6 +116,34 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
   return GetCPUBuddyAllocator()->Used();
 }
 
+// For Graphcore IPU: host-side IPU buffers are backed by the CPU allocator
+template <>
+void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
+  VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
+  VLOG(10) << "IPUPlace, Allocate on cpu.";
+
+  void *p = GetCPUBuddyAllocator()->Alloc(size);
+  if (FLAGS_init_allocated_mem) {
+    memset(p, 0xEF, size);
+  }
+  VLOG(10) << " pointer=" << p;
+  return p;
+}
+template <>
+void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
+                              size_t size) {
+  VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
+  GetCPUBuddyAllocator()->Free(p);
+}
+template <>
+uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
+  return GetCPUBuddyAllocator()->Release();
+}
+template <>
+size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
+  return GetCPUBuddyAllocator()->Used();
+}
+
 // For kunlun XPU
 template <>
 void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
+ +#include "paddle/fluid/operators/ipu_runtime_op.h" + +namespace paddle { +namespace operators { + +class IpuRuntimeOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::proto::VarType::Type(ctx.Attr("dtype")), + ctx.device_context()); + } +}; + +class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("FeedList", "FeedList of Graph").AsDuplicable(); + AddOutput("FetchList", "FetchList of Graph").AsDuplicable(); + AddAttr("dtype", + "(int, default 5 (FP32)) " + "Output data type") + .SetDefault(framework::proto::VarType::FP32); + AddComment(R"DOC( +Run graph by PopART runtime. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker); + +REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel, + ops::IpuRuntimeKernel); diff --git a/paddle/fluid/operators/ipu_runtime_op.h b/paddle/fluid/operators/ipu_runtime_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b6fc9ae98895d40d2e2d1c9eb02a63d200b0b1f8 --- /dev/null +++ b/paddle/fluid/operators/ipu_runtime_op.h @@ -0,0 +1,69 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
diff --git a/paddle/fluid/operators/ipu_runtime_op.h b/paddle/fluid/operators/ipu_runtime_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..b6fc9ae98895d40d2e2d1c9eb02a63d200b0b1f8
--- /dev/null
+++ b/paddle/fluid/operators/ipu_runtime_op.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <vector>
+
+#include "paddle/fluid/framework/op_registry.h"
+#ifdef PADDLE_WITH_IPU
+#include "paddle/fluid/framework/ipu/ipu_backend.h"
+#include "paddle/fluid/framework/tensor.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class IpuRuntimeKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+#ifdef PADDLE_WITH_IPU
+    auto ipu_backend = framework::ipu::IpuBackend::GetInstance();
+    if (!ipu_backend->DeviceIsAttached()) {
+      const platform::IPUDeviceContext& ipu_ctx =
+          reinterpret_cast<const platform::IPUDeviceContext&>(
+              ctx.device_context());
+      ipu_backend->AttachDevice(ipu_ctx.DeviceId());
+    }
+
+    auto inputs = ctx.MultiInput<framework::Tensor>("FeedList");
+    auto outputs = ctx.MultiOutput<framework::Tensor>("FetchList");
+    auto output_names = ctx.OutputNames("FetchList");
+    VLOG(4) << "IpuRuntime Kernel, begin to run graph";
+    ipu_backend->Run(inputs, outputs, ctx);
+
+    // post-run: resize a tensor when its dims() came back empty
+    for (size_t i = 0; i < outputs.size(); ++i) {
+      auto* out = outputs[i];
+      if (out->dims().size() == 0) {
+        auto tensor_dtype = out->type();
+        auto sizeof_dtype = framework::SizeOfType(tensor_dtype);
+        int64_t dim = out->memory_size() / sizeof_dtype;
+        out->Resize({dim});
+        VLOG(10) << "set ipu_runtime_op output: " << output_names[i]
+                 << " dims from () to: "
+                 << "(" << dim << ")";
+      }
+    }
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Please compile WITH_IPU option to enable ipu_runtime op"));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc
index 4201af18ca7547172017c04d660d7bcfa5b668b1..daa4efa02ac5081e6ddcd0ed45f6e98c826557ba 100644
--- a/paddle/fluid/operators/math/math_function.cc
+++ b/paddle/fluid/operators/math/math_function.cc
@@ -173,6 +173,13 @@ void set_constant_with_place<platform::NPUPinnedPlace>(
   platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
 }
 
+template <>
+void set_constant_with_place<platform::IPUPlace>(
+    const platform::DeviceContext& context, framework::Tensor* tensor,
+    float value) {
+  PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
+}
+
 template <>
 void set_constant_with_place<platform::CPUPlace>(
     const platform::DeviceContext& context, framework::Tensor* tensor,
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index a0c9ff09460aff63d4b41142b7b37635ac17def5..206bef12aac95e0a111e15afbd1a0533e913e7e9 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -16,6 +16,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
+#ifdef PADDLE_WITH_IPU
+#include "paddle/fluid/platform/ipu/ipu_backend.h"
+#endif
 
 #include "glog/logging.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
   if (it == device_contexts_.end()) {
     PADDLE_THROW(platform::errors::Unimplemented(
         "Place %s is not supported. Please check that your paddle compiles "
-        "with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that "
-        "your train process set the correct device id if you use Executor.",
+        "with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option, or check "
+        "that your training process sets the correct device id if you use "
+        "Executor.",
         place));
   }
   return it->second.get().get();
@@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool(
       PADDLE_THROW(
           platform::errors::Unimplemented("XPUPlace is not supported. Please "
                                           "re-compile with WITH_XPU option."));
+#endif
+    } else if (platform::is_ipu_place(p)) {
+#ifdef PADDLE_WITH_IPU
+      EmplaceDeviceContext<IPUDeviceContext, IPUPlace>(&device_contexts_, p);
+#else
+      PADDLE_THROW(
+          platform::errors::Unimplemented("IPUPlace is not supported. Please "
+                                          "re-compile with WITH_IPU option."));
 #endif
     } else if (platform::is_npu_place(p)) {
 #ifdef PADDLE_WITH_ASCEND_CL
@@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
 
 Place CPUDeviceContext::GetPlace() const { return place_; }
 
+#ifdef PADDLE_WITH_IPU
+IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {
+  int id = place.GetDeviceId();
+  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
+      platform::ipu::IpuBackend::GetInstance();
+  device_ = ipu_backend->GetDevice(id);
+}
+
+Place IPUDeviceContext::GetPlace() const { return place_; }
+void IPUDeviceContext::Wait() const {
+  /*! \brief Wait for the completion of all operations in the stream. */
+}
+
+IPUDeviceContext::~IPUDeviceContext() {}
+
+#endif
 #ifdef PADDLE_WITH_XPU
 XPUDeviceContext::XPUDeviceContext() {
   context_ = xpu::create_context();
diff --git a/paddle/fluid/platform/place.cc b/paddle/fluid/platform/place.cc
index 415babc9cb85e6c93c17dd9fdbf7ef61fc424d4c..ec49134b654e93199c3ef522e5afda6337ff3db8 100644
--- a/paddle/fluid/platform/place.cc
+++ b/paddle/fluid/platform/place.cc
@@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> {
   void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
   void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
   void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
+  void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; }
   void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
 
  private:
@@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) {
   return boost::apply_visitor(IsNPUPlace(), p);
 }
 
+bool is_ipu_place(const Place &p) {
+  return boost::apply_visitor(IsIPUPlace(), p);
+}
+
 bool is_cpu_place(const Place &p) {
   return boost::apply_visitor(IsCPUPlace(), p);
 }
@@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
     return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
   } else if (is_npu_place(p1)) {
     return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
+  } else if (is_ipu_place(p1)) {
+    return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2);
   } else {
     return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
   }
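With the PlacePrinter and is_ipu_place additions, an IPU place prints and compares like every other place. A quick illustrative check in Python (assumes an IPU build, since IPUPlace() exits otherwise):

    import paddle
    from paddle.fluid import core

    place = paddle.IPUPlace()
    print(place)                   # expected: IPUPlace(0)
    generic = core.Place()
    generic.set_place(place)
    print(generic.is_ipu_place())  # True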
diff --git a/paddle/fluid/platform/place.h b/paddle/fluid/platform/place.h
index 1ab2a62391157b663120d579e64ed1c9f75c43c5..fadc1e27e8a0ac6116b4d99cc6bc4adbdfbd3907 100644
--- a/paddle/fluid/platform/place.h
+++ b/paddle/fluid/platform/place.h
@@ -95,12 +95,25 @@ struct NPUPinnedPlace {
   inline bool operator!=(const NPUPinnedPlace &) const { return false; }
   inline bool operator<(const NPUPinnedPlace &) const { return false; }
 };
+struct IPUPlace {
+  IPUPlace() : IPUPlace(0) {}
+  explicit IPUPlace(int d) : device(d) {}
+
+  inline int GetDeviceId() const { return device; }
+  // needed for variant equality comparison
+  inline bool operator==(const IPUPlace &o) const { return device == o.device; }
+  inline bool operator!=(const IPUPlace &o) const { return !(*this == o); }
+  inline bool operator<(const IPUPlace &o) const { return device < o.device; }
+
+  int device;
+};
 
 struct IsCUDAPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return true; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
 };
@@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return true; }
   bool operator()(const NPUPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return true; }
   bool operator()(const NPUPinnedPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
 };
@@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
   bool operator()(const CPUPlace &) const { return false; }
   bool operator()(const XPUPlace &) const { return false; }
   bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return false; }
+  bool operator()(const CUDAPlace &) const { return false; }
+  bool operator()(const CUDAPinnedPlace &) const { return false; }
   bool operator()(const NPUPinnedPlace &) const { return true; }
+};
+struct IsIPUPlace : public boost::static_visitor<bool> {
+  bool operator()(const CPUPlace &) const { return false; }
+  bool operator()(const XPUPlace &) const { return false; }
+  bool operator()(const NPUPlace &) const { return false; }
+  bool operator()(const IPUPlace &) const { return true; }
   bool operator()(const CUDAPlace &) const { return false; }
   bool operator()(const CUDAPinnedPlace &) const { return false; }
+  bool operator()(const NPUPinnedPlace &) const { return false; }
 };
 
 class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
-                                    CUDAPinnedPlace, NPUPinnedPlace> {
+                                    CUDAPinnedPlace, NPUPinnedPlace, IPUPlace> {
  private:
   using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
-                                   CUDAPinnedPlace, NPUPinnedPlace>;
+                                   CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>;
 
  public:
   Place() = default;
   Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {}    // NOLINT
   Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {}    // NOLINT
   Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {}    // NOLINT
+  Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {}    // NOLINT
   Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
   Place(const CUDAPinnedPlace &cuda_pinned_place)               // NOLINT
       : PlaceBase(cuda_pinned_place) {}
@@ -180,6 +208,7 @@ using PlaceList = std::vector<Place>;
 bool is_gpu_place(const Place &);
 bool is_xpu_place(const Place &);
 bool is_npu_place(const Place &);
+bool is_ipu_place(const Place &);
 bool is_cpu_place(const Place &);
 bool is_cuda_pinned_place(const Place &);
 bool is_npu_pinned_place(const Place &);
@@ -228,6 +257,15 @@ struct PlaceVisitorWrapper
     return typename Visitor::result_type();
 #endif
   }
+  typename Visitor::result_type operator()(const IPUPlace &ipu) const {
+#ifdef PADDLE_WITH_IPU
+    return visitor_(ipu);
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Paddle is not compiled with IPU. Cannot visit ipu device"));
+    return typename Visitor::result_type();
+#endif
+  }
 
   typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index a93ddb1a22f9cba8a2694769285a7d734394cd38..c5277a42103958dbedd5a4f51a4b606fa6e6a172 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -132,6 +132,10 @@ limitations under the License. */
 #endif
 #include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
 
+#ifdef PADDLE_WITH_IPU
+#include "paddle/fluid/platform/ipu/ipu_backend.h"
+#include "paddle/fluid/platform/ipu_info.h"
+#endif
 
 #ifdef PADDLE_WITH_CRYPTO
 #include "paddle/fluid/pybind/crypto.h"
@@ -201,6 +205,14 @@ bool IsCompiledWithNPU() {
 #endif
 }
 
+bool IsCompiledWithIPU() {
+#ifndef PADDLE_WITH_IPU
+  return false;
+#else
+  return true;
+#endif
+}
+
 bool IsCompiledWithMKLDNN() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
@@ -816,6 +828,8 @@ PYBIND11_MODULE(core_noavx, m) {
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
+      .def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>,
+           py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
       .def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
            py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
            R"DOC(
@@ -823,7 +837,7 @@ PYBIND11_MODULE(core_noavx, m) {
 
         Args:
             lod (numpy.ndarray): The data to set.
-            place (CPUPlace|CUDAPlace|XPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
+            place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
             LoDTensor is to be set.
             zero_copy (bool, optional): Whether to share memory with the input numpy array.
             This parameter only works with CPUPlace. Default: False.
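The new SetTensorFromPyArray<IPUPlace> overload means LoDTensor.set accepts an IPUPlace like any other place. Illustrative usage (assumes an IPU build):

    import numpy as np
    import paddle.fluid as fluid

    t = fluid.LoDTensor()
    t.set(np.random.rand(2, 3).astype('float32'), fluid.IPUPlace())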
@@ -1913,6 +1927,58 @@ All parameter, weight, gradient are variables in Paddle.
            [](const platform::NPUPlace &self) { return self.GetDeviceId(); })
       .def("__str__", string::to_string<const platform::NPUPlace &>);
 
+  // IPUPlace
+  py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
+    IPUPlace is a descriptor of a device.
+    It represents an IPU device on which a tensor will be allocated and a model will run.
+
+    Examples:
+        .. code-block:: python
+
+          import paddle
+
+          # required: ipu
+
+          ipu_place = paddle.IPUPlace()
+
+        )DOC")
+      .def("__init__",
+           [](platform::IPUPlace &self) {
+#ifdef PADDLE_WITH_IPU
+             if (platform::GetIPUDeviceCount() == 0) {
+               LOG(ERROR) << "Cannot use IPU because there is no IPU "
+                             "detected on your machine.";
+               std::exit(-1);
+             }
+             // Compile with ipu(0); the number of IPUs actually used is
+             // configured by the user through sharding and pipelining.
+             new (&self) platform::IPUPlace(0);
+#else
+             LOG(ERROR) << string::Sprintf(
+                 "Cannot use IPU because you didn't install IPU version "
+                 "PaddlePaddle.\n"
+                 "If you want to use IPU, please try to install IPU version "
+                 "PaddlePaddle by: pip install paddlepaddle*\n"
+                 "If you only have CPU, please change IPUPlace to be "
+                 "CPUPlace().\n");
+             std::exit(-1);
+#endif
+           })
+      .def("_type_", &PlaceIndex<platform::IPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
+      .def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
+      .def("_equals",
+           &IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
+#ifdef PADDLE_WITH_IPU
+      .def("get_device_id",
+           [](const platform::IPUPlace &self) { return self.GetDeviceId(); })
+#endif
+      .def("__str__", string::to_string<const platform::IPUPlace &>);
+
   py::class_<platform::Place> platformplace(m, "Place");
   g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
   platformplace.def(py::init<>())
@@ -1922,6 +1988,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
+      .def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
       .def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
       .def("is_gpu_place",
            [](platform::Place &self) { return platform::is_gpu_place(self); })
@@ -1931,6 +1998,8 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) { return platform::is_xpu_place(self); })
       .def("is_npu_place",
            [](platform::Place &self) { return platform::is_npu_place(self); })
+      .def("is_ipu_place",
+           [](platform::Place &self) { return platform::is_ipu_place(self); })
       .def("is_cuda_pinned_place",
            [](platform::Place &self) {
              return platform::is_cuda_pinned_place(self);
@@ -1947,6 +2016,10 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self) {
              return BOOST_GET_CONST(platform::NPUPlace, self).device;
            })
+      .def("ipu_device_id",
+           [](platform::Place &self) {
+             return BOOST_GET_CONST(platform::IPUPlace, self).device;
+           })
       .def("set_place", [](platform::Place &self,
                            const platform::Place &other) { self = other; })
       .def("set_place",
@@ -1970,6 +2043,10 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self, const platform::NPUPlace &npu_place) {
              self = npu_place;
            })
+      .def("set_place",
+           [](platform::Place &self, const platform::IPUPlace &ipu_place) {
+             self = ipu_place;
+           })
       .def("__repr__", string::to_string<const platform::Place &>)
       .def("__str__", string::to_string<const platform::Place &>);
 
@@ -2201,6 +2278,7 @@ All parameter, weight, gradient are variables in Paddle.
   m.def("is_compiled_with_ascend", IsCompiledWithAscend);
   m.def("is_compiled_with_rocm", IsCompiledWithROCM);
   m.def("is_compiled_with_npu", IsCompiledWithNPU);
+  m.def("is_compiled_with_ipu", IsCompiledWithIPU);
   m.def("is_compiled_with_xpu", IsCompiledWithXPU);
   m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
   m.def("is_compiled_with_cinn", IsCompiledWithCINN);
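is_compiled_with_ipu mirrors the existing compile-time probes, so Python code can guard IPU-only paths before constructing a place:

    import paddle

    place = paddle.IPUPlace() if paddle.is_compiled_with_ipu() \
        else paddle.CPUPlace()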
@@ -2520,6 +2598,10 @@ All parameter, weight, gradient are variables in Paddle.
   });
 #endif
 
+#ifdef PADDLE_WITH_IPU
+  m.def("get_ipu_device_count", platform::GetIPUDeviceCount);
+#endif
+
   py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
       .value("kDefault", platform::TracerOption::kDefault)
       .value("kOpDetail", platform::TracerOption::kOpDetail)
@@ -2597,6 +2679,11 @@ All parameter, weight, gradient are variables in Paddle.
                      bool val) { self.Set(name, new bool(val)); })
       .def("set", [](ir::Pass &self, const std::string &name,
                      int val) { self.Set(name, new int(val)); })
+      .def("set",
+           [](ir::Pass &self, const std::string &name,
+              std::vector<std::string> set) {
+             self.Set(name, new std::vector<std::string>(set));
+           })
       .def("set",
            [](ir::Pass &self, const std::string &name,
               std::unordered_set<std::string> set) {
@@ -3429,6 +3516,118 @@ All parameter, weight, gradient are variables in Paddle.
       })
       .def("device_count", &ParallelExecutor::DeviceCount);
 
+#ifdef PADDLE_WITH_IPU
+  py::class_<platform::ipu::IpuBackend,
+             std::shared_ptr<platform::ipu::IpuBackend>>(m, "IpuBackend")
+      .def(py::init(&platform::ipu::IpuBackend::GetNewInstance))
+      .def("clear", &platform::ipu::IpuBackend::Clear)
+      .def("set_scope", &platform::ipu::IpuBackend::SetScope)
+      .def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy);
+
+  py::class_<platform::ipu::IpuStrategy>(m, "IpuStrategy")
+      .def(py::init<>())
+      .def_property(
+          "num_ipus",
+          [](const platform::ipu::IpuStrategy &self) { return self.num_ipus; },
+          [](platform::ipu::IpuStrategy &self, int num_ipus) {
+            self.num_ipus = num_ipus;
+          },
+          R"DOC(
+            Int type, the number of IPUs to use. Default 1.
+            )DOC")
+      .def_property(
+          "accumulationFactor",
+          [](const platform::ipu::IpuStrategy &self) {
+            return self.popart_options_.accumulationFactor;
+          },
+          [](platform::ipu::IpuStrategy &self, int accumulationFactor) {
+            self.popart_options_.accumulationFactor = accumulationFactor;
+          },
+          R"DOC(
+            Int type, the number of micro-batches to accumulate before
+            applying the varUpdate. Default 1.
+            )DOC")
+      .def_property("batches_per_step",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.batches_per_step;
+                    },
+                    [](platform::ipu::IpuStrategy &self, int batches_per_step) {
+                      self.batches_per_step = batches_per_step;
+                    },
+                    R"DOC(
+                      Int type, the number of batches to run per step. Default 1.
+                      )DOC")
+      .def_property("is_training",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.is_training;
+                    },
+                    [](platform::ipu::IpuStrategy &self, bool is_training) {
+                      self.is_training = is_training;
+                    },
+                    R"DOC(
+                      Bool type, True for training, False for inference. Default True.
+                      )DOC")
+      .def_property(
+          "enable_pipelining",
+          [](const platform::ipu::IpuStrategy &self) {
+            return self.popart_options_.enablePipelining;
+          },
+          [](platform::ipu::IpuStrategy &self, bool enable_pipelining) {
+            self.popart_options_.enablePipelining = enable_pipelining;
+          },
+          R"DOC(
+            Bool type, True to enable pipelining, otherwise disabled. Default False.
+            )DOC")
+      .def_property(
+          "enable_manual_shard",
+          [](const platform::ipu::IpuStrategy &self) {
+            return self.popart_options_.virtualGraphMode ==
+                   platform::ipu::VirtualGraphMode::Manual;
+          },
+          [](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) {
+            if (enable_ipu_shard) {
+              self.popart_options_.virtualGraphMode =
+                  platform::ipu::VirtualGraphMode::Manual;
+            } else {
+              self.popart_options_.virtualGraphMode =
+                  platform::ipu::VirtualGraphMode::Off;
+            }
+          },
+          R"DOC(
+            Bool type, True to enable manual model sharding, otherwise disabled.
+            Default False.
+            )DOC")
+      .def_property("need_avg_shard",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.need_avg_shard;
+                    },
+                    [](platform::ipu::IpuStrategy &self, bool need_avg_shard) {
+                      self.need_avg_shard = need_avg_shard;
+                    },
+                    R"DOC(
+                      Bool type, True to enable average sharding, otherwise disabled.
+                      Default False.
+                      )DOC")
+      .def_property("batch_size",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.batch_size;
+                    },
+                    [](platform::ipu::IpuStrategy &self, int batch_size) {
+                      self.batch_size = batch_size;
+                    },
+                    R"DOC(
+                      Int type, used to make the batch size fixed. Default 1.
+                      )DOC")
+      .def_property("enable_fp16",
+                    [](const platform::ipu::IpuStrategy &self) {
+                      return self.enable_fp16;
+                    },
+                    [](platform::ipu::IpuStrategy &self, bool enable_fp16) {
+                      self.enable_fp16 = enable_fp16;
+                    },
+                    R"DOC(
+                      Bool type, True to enable float16 mode, otherwise disabled.
+                      Default False.
+                      )DOC");
+#endif
+
   BindFleetWrapper(&m);
   BindIO(&m);
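IpuBackend and IpuStrategy are bound on the core extension module, so a script configures the IPU run before graph compilation. A hedged sketch (assumes an IPU build; reaching the classes through paddle.fluid.core is an assumption about how core is re-exported):

    import paddle.fluid as fluid

    ipu_strategy = fluid.core.IpuStrategy()
    ipu_strategy.num_ipus = 2
    ipu_strategy.is_training = True
    ipu_strategy.batches_per_step = 4
    ipu_strategy.enable_pipelining = True

    ipu_backend = fluid.core.IpuBackend()
    ipu_backend.set_ipu_strategy(ipu_strategy)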
+ )DOC") + .def_property("need_avg_shard", + [](const platform::ipu::IpuStrategy &self) { + return self.need_avg_shard; + }, + [](platform::ipu::IpuStrategy &self, bool need_avg_shard) { + self.need_avg_shard = need_avg_shard; + }, + R"DOC( + Bool type, True enable avg shard, otherwise disable. Default False. + )DOC") + .def_property("batch_size", + [](const platform::ipu::IpuStrategy &self) { + return self.batch_size; + }, + [](platform::ipu::IpuStrategy &self, int batch_size) { + self.batch_size = batch_size; + }, + R"DOC( + Int type, used to make batch size fixed. Default 1. + )DOC") + .def_property("enable_fp16", + [](const platform::ipu::IpuStrategy &self) { + return self.enable_fp16; + }, + [](platform::ipu::IpuStrategy &self, bool enable_fp16) { + self.enable_fp16 = enable_fp16; + }, + R"DOC( + Bool type, True enable float16 mode, otherwise disable. Default False.)DOC"); +#endif + BindFleetWrapper(&m); BindIO(&m); diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index c5d0afb9a1716ab711c952140f7fa6e0c5c8d62a..935a6437338a751f8966fdb8c804333d8083cac8 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -313,6 +313,21 @@ void SetTensorFromPyArrayT( PADDLE_THROW(platform::errors::PermissionDenied( "Cannot use XPUPlace in CPU/GPU version, " "Please recompile or reinstall Paddle with XPU support.")); +#endif + } else if (paddle::platform::is_ipu_place(place)) { +#ifdef PADDLE_WITH_IPU + if (zero_copy) { + auto holder = std::make_shared>(array); + auto type = framework::ToDataType(std::type_index(typeid(T))); + self->ResetHolderWithType(holder, type); + } else { + auto dst = self->mutable_data(place); + std::memcpy(dst, array.data(), array.nbytes()); + } +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "Cannot use IPUPlace in CPU/GPU/XPU/NPU version, " + "Please recompile or reinstall Paddle with IPU support.")); #endif } else if (paddle::platform::is_npu_place(place)) { #ifdef PADDLE_WITH_ASCEND_CL diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 44afeecec32da4d136705fead505c833c273ee09..da32aab839cb777cf6fb4d49790fff6e367ebea9 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -260,6 +260,7 @@ from .framework.random import set_cuda_rng_state # noqa: F401 from .framework import ParamAttr # noqa: F401 from .framework import create_parameter # noqa: F401 from .framework import CPUPlace # noqa: F401 +from .framework import IPUPlace # noqa: F401 from .framework import CUDAPlace # noqa: F401 from .framework import NPUPlace # noqa: F401 from .framework import CUDAPinnedPlace # noqa: F401 @@ -291,6 +292,7 @@ from .fluid.framework import get_flags # noqa: F401 from .fluid.framework import set_flags # noqa: F401 from .device import is_compiled_with_xpu # noqa: F401 from .device import is_compiled_with_npu # noqa: F401 +from .device import is_compiled_with_ipu # noqa: F401 from .device import XPUPlace # noqa: F401 from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401 diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py index 95402898589f6ed1095ae4d54674c27d487c3fed..0a11d59d69c94c0345802b0f9d070aa23f4c0a24 100644 --- a/python/paddle/device/__init__.py +++ b/python/paddle/device/__init__.py @@ -28,7 +28,9 @@ __all__ = [ # noqa 'set_device', 'get_device', 'XPUPlace', + 'IPUPlace', 'is_compiled_with_xpu', + 'is_compiled_with_ipu', 'is_compiled_with_cinn', 'is_compiled_with_cuda', 'is_compiled_with_rocm', @@ -55,6 +57,36 @@ def 
diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index 44afeecec32da4d136705fead505c833c273ee09..da32aab839cb777cf6fb4d49790fff6e367ebea9 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -260,6 +260,7 @@ from .framework.random import set_cuda_rng_state  # noqa: F401
 from .framework import ParamAttr  # noqa: F401
 from .framework import create_parameter  # noqa: F401
 from .framework import CPUPlace  # noqa: F401
+from .framework import IPUPlace  # noqa: F401
 from .framework import CUDAPlace  # noqa: F401
 from .framework import NPUPlace  # noqa: F401
 from .framework import CUDAPinnedPlace  # noqa: F401
@@ -291,6 +292,7 @@ from .fluid.framework import get_flags  # noqa: F401
 from .fluid.framework import set_flags  # noqa: F401
 from .device import is_compiled_with_xpu  # noqa: F401
 from .device import is_compiled_with_npu  # noqa: F401
+from .device import is_compiled_with_ipu  # noqa: F401
 from .device import XPUPlace  # noqa: F401
 from .fluid.dygraph.base import enable_dygraph as disable_static  # noqa: F401
diff --git a/python/paddle/device/__init__.py b/python/paddle/device/__init__.py
index 95402898589f6ed1095ae4d54674c27d487c3fed..0a11d59d69c94c0345802b0f9d070aa23f4c0a24 100644
--- a/python/paddle/device/__init__.py
+++ b/python/paddle/device/__init__.py
@@ -28,7 +28,9 @@ __all__ = [  # noqa
     'set_device',
     'get_device',
     'XPUPlace',
+    'IPUPlace',
     'is_compiled_with_xpu',
+    'is_compiled_with_ipu',
     'is_compiled_with_cinn',
     'is_compiled_with_cuda',
     'is_compiled_with_rocm',
@@ -55,6 +57,36 @@ def is_compiled_with_npu():
     return core.is_compiled_with_npu()
 
 
+def is_compiled_with_ipu():
+    """
+    Whether paddle was built with WITH_IPU=ON to support Graphcore IPU.
+
+    Returns (bool): `True` if IPU is supported, otherwise `False`.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            support_ipu = paddle.is_compiled_with_ipu()
+    """
+    return core.is_compiled_with_ipu()
+
+
+def IPUPlace():
+    """
+    Return a Graphcore IPUPlace.
+
+    Examples:
+        .. code-block:: python
+
+            # required: ipu
+
+            import paddle
+            place = paddle.device.IPUPlace()
+    """
+    return core.IPUPlace()
+
+
 def is_compiled_with_xpu():
     """
     Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
@@ -143,13 +175,19 @@ def _convert_to_place(device):
         selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
         device_id = int(selected_npus[0])
         place = core.NPUPlace(device_id)
+    elif lower_device == 'ipu':
+        if not core.is_compiled_with_ipu():
+            raise ValueError(
+                "The device should not be 'ipu', " \
+                "since PaddlePaddle is not compiled with IPU")
+        place = core.IPUPlace()
     else:
         avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
         avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
         avaliable_npu_device = re.match(r'npu:\d+', lower_device)
         if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device:
             raise ValueError(
-                "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'"
+                "The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or 'ipu'"
             )
         if avaliable_gpu_device:
             if not core.is_compiled_with_cuda():
@@ -183,13 +221,13 @@ def _convert_to_place(device):
 
 def set_device(device):
     """
-    Paddle supports running calculations on various types of devices, including CPU, GPU, XPU and NPU.
+    Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU.
     They are represented by string identifiers. This function can specify the global device
     which the OP will run.
 
     Parameters:
         device(str): This parameter determines the specific running device.
-            It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x`` and ``npu:x``,
+            It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``,
             where ``x`` is the index of the GPUs, XPUs or NPUs.
 
     Examples:
@@ -236,5 +274,10 @@ def get_device():
     elif isinstance(place, core.NPUPlace):
         device_id = place.get_device_id()
         device = 'npu:' + str(device_id)
+    elif isinstance(place, core.IPUPlace):
+        num_devices = core.get_ipu_device_count()
+        device = "ipus:{{0-{}}}".format(num_devices - 1)
+    else:
+        raise ValueError("The device specification {} is invalid".format(place))
 
     return device
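With these hooks, 'ipu' becomes a valid device string end to end:

    import paddle

    paddle.set_device('ipu')    # raises ValueError on non-IPU builds
    print(paddle.get_device())  # e.g. 'ipus:{0-3}' on a 4-IPU machine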
diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py
index 5482413dbbc5d10c1829514cf2d76c2236ba51a1..d8ee875e768e524525a67042eb4952df53901f05 100644
--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -71,7 +71,7 @@ from . import distribute_lookup_table
 from .param_attr import ParamAttr, WeightNormParamAttr
 from .data_feeder import DataFeeder
 from .core import LoDTensor, LoDTensorArray, Scope, _Scope
-from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace
+from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace
 from .incubate import fleet
 from .transpiler import DistributeTranspiler, \
     memory_optimize, release_memory, DistributeTranspilerConfig
@@ -132,6 +132,7 @@ __all__ = framework.__all__ + executor.__all__ + \
     'CUDAPlace',
     'CUDAPinnedPlace',
     'NPUPlace',
+    'IPUPlace',
     'Tensor',
     'ParamAttr',
     'WeightNormParamAttr',
@@ -197,6 +198,11 @@ def __bootstrap__():
         if os.name == 'nt':
             remove_flag_if_exists('cpu_deterministic')
 
+        if core.is_compiled_with_ipu():
+            # Currently we request all available IPUs for training and testing;
+            # finer-grained control of IPU pods will be added later.
+            read_env_flags += []
+
         core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
         # Note(zhouwei25): sys may not have argv in some cases,
         # Such as: use Python/C API to call Python from C++
diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py
index ce84fb739c0009c62bfd2c9c9d9fd74255c96312..722003c034091957ef66cd2202139a8e2f00cd8c 100644
--- a/python/paddle/framework/__init__.py
+++ b/python/paddle/framework/__init__.py
@@ -23,6 +23,7 @@ from .framework import set_grad_enabled  # noqa: F401
 from ..fluid.param_attr import ParamAttr  # noqa: F401
 from ..fluid.layers.tensor import create_parameter  # noqa: F401
 from ..fluid.core import CPUPlace  # noqa: F401
+from ..fluid.core import IPUPlace  # noqa: F401
 from ..fluid.core import CUDAPlace  # noqa: F401
 from ..fluid.core import CUDAPinnedPlace  # noqa: F401
 from ..fluid.core import NPUPlace  # noqa: F401
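Taken together, the Python surface this patch adds is paddle.IPUPlace / paddle.device.IPUPlace, paddle.is_compiled_with_ipu, core.get_ipu_device_count, and the fluid/framework re-exports above. A final smoke test (assumes an IPU build):

    import paddle
    from paddle.fluid import core

    assert paddle.is_compiled_with_ipu()
    print(core.get_ipu_device_count())
    print(paddle.framework.IPUPlace())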