Unverified · Commit cb636a48 authored by jianghaicheng, committed by GitHub

add ipu device p2 (#37840)

Parent 890638cf
......@@ -116,6 +116,22 @@ class TensorAddFunctor : public boost::static_visitor<> {
}
#endif
// IPUPlace is always a member of the Place variant, so this visitor must
// provide the overload whether or not Paddle was built with IPU support;
// gradient accumulation is unimplemented on IPU in either case.
void operator()(const paddle::platform::IPUPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
void operator()(const paddle::platform::NPUPinnedPlace& place) {
PADDLE_THROW(paddle::platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
......
......@@ -81,6 +81,11 @@ struct DLDeviceVisitor : public boost::static_visitor<::DLDevice> {
return device;
}
inline ::DLDevice operator()(const platform::IPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::IPUPlace is not supported"));
}
inline ::DLDevice operator()(const platform::XPUPlace &place) const {
PADDLE_THROW(
platform::errors::Unimplemented("platform::XPUPlace is not supported"));
......
......@@ -463,6 +463,14 @@ void Executor::RunPartialPreparedContext(ExecutorPrepareContext* ctx,
#else
PADDLE_THROW(
platform::errors::Unimplemented("No XPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_ipu_place(place_)) {
#ifdef PADDLE_WITH_IPU
gc.reset(new IPUGarbageCollector(
BOOST_GET_CONST(platform::IPUPlace, place_), max_memory_size));
#else
PADDLE_THROW(
platform::errors::Unimplemented("No IPU gc found in CPU/GPU paddle"));
#endif
} else if (platform::is_npu_place(place_)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -327,6 +327,9 @@ struct OpKernelRegistrarFunctorEx<PlaceType, false, I,
#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
#define REGISTER_OP_IPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, IPU, ::paddle::platform::IPUPlace, __VA_ARGS__)
#define REGISTER_OP_XPU_KERNEL(op_type, ...) \
REGISTER_OP_KERNEL(op_type, XPU, ::paddle::platform::XPUPlace, __VA_ARGS__)
......
......@@ -76,6 +76,22 @@ void TensorCopy(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else if (platform::is_ipu_place(src_place) &&
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
......@@ -386,16 +402,32 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
#ifdef PADDLE_WITH_IPU
else if (platform::is_ipu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::IPUPlace, src_place), src_ptr, size);
} else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_ipu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::IPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
} else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
#endif
#ifdef PADDLE_WITH_XPU
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_cpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::CPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
}
else if (platform::is_cpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::CPUPlace, src_place), src_ptr, size);
}
else if (platform::is_xpu_place(src_place) && // NOLINT
platform::is_xpu_place(dst_place)) {
if (src_ptr == dst_ptr) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
......@@ -404,7 +436,8 @@ void TensorCopySync(const Tensor& src, const platform::Place& dst_place,
}
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, dst_place), dst_ptr,
BOOST_GET_CONST(platform::XPUPlace, src_place), src_ptr, size);
}
else { // NOLINT
PADDLE_THROW(platform::errors::Unimplemented(
"Copy from %s to %s is not supported.", src_place, dst_place));
}
......@@ -571,6 +604,11 @@ class AnyVisitor : public boost::static_visitor<bool> {
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::IPUPlace& ipu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPinnedPlace& cpu) const {
......@@ -762,6 +800,9 @@ struct BothFalseVisitor : public boost::static_visitor<> {
void VisitorImpl(const platform::XPUPlace& xpu) const {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
void VisitorImpl(const platform::IPUPlace& ipu) const {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -155,6 +155,13 @@ class TensorAddFunctor : public boost::static_visitor<> {
"is not supported in imperative mode",
place));
}
// Gradient accumulation is not supported on IPUPlace.
void operator()(const platform::IPUPlace& place) {
PADDLE_THROW(platform::errors::PermissionDenied(
"Gradient accumulation on place (%s) "
"is not supported in imperative mode",
place));
}
private:
int64_t numel_;
......
......@@ -116,6 +116,34 @@ size_t Used<platform::CPUPlace>(const platform::CPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For Graphcore IPU
template <>
void *Alloc<platform::IPUPlace>(const platform::IPUPlace &place, size_t size) {
VLOG(10) << "Allocate " << size << " bytes on " << platform::Place(place);
VLOG(10) << "IPUPlace, Allocate on cpu.";
void *p = GetCPUBuddyAllocator()->Alloc(size);
if (FLAGS_init_allocated_mem) {
memset(p, 0xEF, size);
}
VLOG(10) << " pointer=" << p;
return p;
}
template <>
void Free<platform::IPUPlace>(const platform::IPUPlace &place, void *p,
size_t size) {
VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place);
GetCPUBuddyAllocator()->Free(p);
}
template <>
uint64_t Release<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Release();
}
template <>
size_t Used<platform::IPUPlace>(const platform::IPUPlace &place) {
return GetCPUBuddyAllocator()->Used();
}
// For kunlun XPU
template <>
void *Alloc<platform::XPUPlace>(const platform::XPUPlace &place, size_t size) {
......
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/ipu_runtime_op.h"
namespace paddle {
namespace operators {
class IpuRuntimeOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
framework::proto::VarType::Type(ctx.Attr<int>("dtype")),
ctx.device_context());
}
};
class IpuRuntimeOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("FeedList", "FeedList of Graph").AsDuplicable();
AddOutput("FetchList", "FetchList of Graph").AsDuplicable();
AddAttr<int>("dtype",
"(int, default 5 (FP32)) "
"Output data type")
.SetDefault(framework::proto::VarType::FP32);
AddComment(R"DOC(
Run the graph using the PopART runtime.
)DOC");
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(ipu_runtime, ops::IpuRuntimeOp, ops::IpuRuntimeOpMaker);
REGISTER_OP_IPU_KERNEL(ipu_runtime, ops::IpuRuntimeKernel<float>,
ops::IpuRuntimeKernel<double>,
ops::IpuRuntimeKernel<int>,
ops::IpuRuntimeKernel<int64_t>,
ops::IpuRuntimeKernel<bool>,
ops::IpuRuntimeKernel<int8_t>,
ops::IpuRuntimeKernel<paddle::platform::float16>);
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <memory>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/framework/ipu/ipu_backend.h"
#include "paddle/fluid/framework/tensor.h"
#endif
namespace paddle {
namespace operators {
template <typename T>
class IpuRuntimeKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
#ifdef PADDLE_WITH_IPU
auto ipu_backend = framework::ipu::IpuBackend::GetInstance();
if (!ipu_backend->DeviceIsAttached()) {
const platform::IPUDeviceContext& ipu_ctx =
reinterpret_cast<const platform::IPUDeviceContext&>(
ctx.device_context());
ipu_backend->AttachDevice(ipu_ctx.DeviceId());
}
auto inputs = ctx.MultiInput<framework::Tensor>("FeedList");
auto outputs = ctx.MultiOutput<framework::Tensor>("FetchList");
auto output_names = ctx.OutputNames("FetchList");
VLOG(4) << "IpuRuntime Kernel, begin to run graph";
ipu_backend->Run(inputs, outputs, ctx);
// post-run
// resize tensor when tensor.dims() is empty
for (size_t i = 0; i < outputs.size(); ++i) {
auto* out = outputs[i];
if (out->dims().size() == 0) {
auto tensor_dtype = out->type();
auto sizeof_dtype = framework::SizeOfType(tensor_dtype);
int64_t dim = out->memory_size() / sizeof_dtype;
out->Resize({dim});
VLOG(10) << "set ipu_runtime_op output: " << output_names[i]
<< " dims from () to: "
<< "(" << dim << ")";
}
}
#else
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Please compile WITH_IPU option to enable ipu_runtime op"));
#endif
}
};
} // namespace operators
} // namespace paddle
......@@ -173,6 +173,13 @@ void set_constant_with_place<platform::NPUPinnedPlace>(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context, framework::Tensor* tensor,
......
......@@ -16,6 +16,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/cuda_device_context_allocator.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#endif
#include "glog/logging.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -96,8 +99,9 @@ platform::DeviceContext* DeviceContextPool::Get(const platform::Place& place) {
if (it == device_contexts_.end()) {
PADDLE_THROW(platform::errors::Unimplemented(
"Place %s is not supported. Please check that your paddle compiles "
"with WITH_GPU, WITH_XPU or WITH_ASCEND_CL option or check that "
"your train process set the correct device id if you use Executor.",
"with WITH_GPU, WITH_XPU, WITH_IPU or WITH_ASCEND_CL option or check "
"that your train process set the correct device id if you use "
"Executor.",
place));
}
return it->second.get().get();
......@@ -158,6 +162,14 @@ DeviceContextPool::DeviceContextPool(
PADDLE_THROW(
platform::errors::Unimplemented("XPUPlace is not supported. Please "
"re-compile with WITH_XPU option."));
#endif
} else if (platform::is_ipu_place(p)) {
#ifdef PADDLE_WITH_IPU
EmplaceDeviceContext<IPUDeviceContext, IPUPlace>(&device_contexts_, p);
#else
PADDLE_THROW(
platform::errors::Unimplemented("IPUPlace is not supported. Please "
"re-compile with WITH_IPU option."));
#endif
} else if (platform::is_npu_place(p)) {
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -195,6 +207,22 @@ Eigen::DefaultDevice* CPUDeviceContext::eigen_device() const {
Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {
int id = place.GetDeviceId();
std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
platform::ipu::IpuBackend::GetInstance();
device_ = ipu_backend->GetDevice(id);
}
Place IPUDeviceContext::GetPlace() const { return place_; }
void IPUDeviceContext::Wait() const {
/*! \brief Wait for all operations in the stream to complete. */
}
IPUDeviceContext::~IPUDeviceContext() {}
#endif
#ifdef PADDLE_WITH_XPU
XPUDeviceContext::XPUDeviceContext() {
context_ = xpu::create_context();
......
......@@ -36,6 +36,7 @@ class PlacePrinter : public boost::static_visitor<> {
void operator()(const XPUPlace &p) { os_ << "XPUPlace(" << p.device << ")"; }
void operator()(const NPUPlace &p) { os_ << "NPUPlace(" << p.device << ")"; }
void operator()(const NPUPinnedPlace &p) { os_ << "NPUPinnedPlace"; }
void operator()(const IPUPlace &p) { os_ << "IPUPlace(" << p.device << ")"; }
void operator()(const CUDAPinnedPlace &p) { os_ << "CUDAPinnedPlace"; }
private:
......@@ -56,6 +57,10 @@ bool is_npu_place(const Place &p) {
return boost::apply_visitor(IsNPUPlace(), p);
}
bool is_ipu_place(const Place &p) {
return boost::apply_visitor(IsIPUPlace(), p);
}
bool is_cpu_place(const Place &p) {
return boost::apply_visitor(IsCPUPlace(), p);
}
......@@ -80,6 +85,8 @@ bool is_same_place(const Place &p1, const Place &p2) {
return BOOST_GET_CONST(XPUPlace, p1) == BOOST_GET_CONST(XPUPlace, p2);
} else if (is_npu_place(p1)) {
return BOOST_GET_CONST(NPUPlace, p1) == BOOST_GET_CONST(NPUPlace, p2);
} else if (is_ipu_place(p1)) {
return BOOST_GET_CONST(IPUPlace, p1) == BOOST_GET_CONST(IPUPlace, p2);
} else {
return BOOST_GET_CONST(CUDAPlace, p1) == BOOST_GET_CONST(CUDAPlace, p2);
}
......
......@@ -95,12 +95,25 @@ struct NPUPinnedPlace {
inline bool operator!=(const NPUPinnedPlace &) const { return false; }
inline bool operator<(const NPUPinnedPlace &) const { return false; }
};
struct IPUPlace {
IPUPlace() : IPUPlace(0) {}
explicit IPUPlace(int d) : device(d) {}
inline int GetDeviceId() const { return device; }
// needed for variant equality comparison
inline bool operator==(const IPUPlace &o) const { return device == o.device; }
inline bool operator!=(const IPUPlace &o) const { return !(*this == o); }
inline bool operator<(const IPUPlace &o) const { return device < o.device; }
int device;
};
struct IsCUDAPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return true; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -110,6 +123,7 @@ struct IsCPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -119,6 +133,7 @@ struct IsCUDAPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &cuda_pinned) const { return true; }
};
......@@ -128,6 +143,7 @@ struct IsXPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return true; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -137,6 +153,7 @@ struct IsNPUPlace : public boost::static_visitor<bool> {
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return true; }
bool operator()(const NPUPinnedPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
};
......@@ -145,22 +162,33 @@ struct IsNPUPinnedPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return false; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return true; }
};
struct IsIPUPlace : public boost::static_visitor<bool> {
bool operator()(const CPUPlace &) const { return false; }
bool operator()(const XPUPlace &) const { return false; }
bool operator()(const NPUPlace &) const { return false; }
bool operator()(const IPUPlace &) const { return true; }
bool operator()(const CUDAPlace &) const { return false; }
bool operator()(const CUDAPinnedPlace &) const { return false; }
bool operator()(const NPUPinnedPlace &) const { return false; }
};
class Place : public boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace> {
private:
using PlaceBase = boost::variant<CUDAPlace, XPUPlace, NPUPlace, CPUPlace,
CUDAPinnedPlace, NPUPinnedPlace, IPUPlace>;
public:
Place() = default;
Place(const CPUPlace &cpu_place) : PlaceBase(cpu_place) {} // NOLINT
Place(const XPUPlace &xpu_place) : PlaceBase(xpu_place) {} // NOLINT
Place(const NPUPlace &npu_place) : PlaceBase(npu_place) {} // NOLINT
Place(const IPUPlace &ipu_place) : PlaceBase(ipu_place) {} // NOLINT
Place(const CUDAPlace &cuda_place) : PlaceBase(cuda_place) {} // NOLINT
Place(const CUDAPinnedPlace &cuda_pinned_place) // NOLINT
: PlaceBase(cuda_pinned_place) {}
......@@ -180,6 +208,7 @@ using PlaceList = std::vector<Place>;
bool is_gpu_place(const Place &);
bool is_xpu_place(const Place &);
bool is_npu_place(const Place &);
bool is_ipu_place(const Place &);
bool is_cpu_place(const Place &);
bool is_cuda_pinned_place(const Place &);
bool is_npu_pinned_place(const Place &);
......@@ -228,6 +257,15 @@ struct PlaceVisitorWrapper
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const IPUPlace &ipu) const {
#ifdef PADDLE_WITH_IPU
return visitor_(ipu);
#else
PADDLE_THROW(platform::errors::Unavailable(
"Paddle is not compiled with IPU. Cannot visit ipu device"));
return typename Visitor::result_type();
#endif
}
typename Visitor::result_type operator()(const CUDAPlace &cuda) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
......
......@@ -132,6 +132,10 @@ limitations under the License. */
#endif
#include "paddle/fluid/platform/cuda_graph_with_memory_pool.h"
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/ipu/ipu_backend.h"
#include "paddle/fluid/platform/ipu_info.h"
#endif
#ifdef PADDLE_WITH_CRYPTO
#include "paddle/fluid/pybind/crypto.h"
......@@ -201,6 +205,14 @@ bool IsCompiledWithNPU() {
#endif
}
bool IsCompiledWithIPU() {
#ifndef PADDLE_WITH_IPU
return false;
#else
return true;
#endif
}
bool IsCompiledWithMKLDNN() {
#ifndef PADDLE_WITH_MKLDNN
return false;
......@@ -816,6 +828,8 @@ PYBIND11_MODULE(core_noavx, m) {
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::NPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::IPUPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false)
.def("set", SetTensorFromPyArray<paddle::platform::CUDAPinnedPlace>,
py::arg("array"), py::arg("place"), py::arg("zero_copy") = false,
R"DOC(
......@@ -823,7 +837,7 @@ PYBIND11_MODULE(core_noavx, m) {
Args:
lod (numpy.ndarray): The data to set.
place (CPUPlace|CUDAPlace|XPUPlace|IPUPlace|CUDAPinnedPlace|NPUPlace): The place where the
LoDTensor is to be set.
zero_copy (bool, optional): Whether to share memory with the input numpy array.
This parameter only works with CPUPlace. Default: False.
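A minimal usage sketch of the new `set` binding for IPU, not part of the patch itself: it assumes a WITH_IPU build, and the numpy buffer is staged through host memory as the `SetTensorFromPyArrayT` hunk further below shows:

    import numpy as np
    import paddle.fluid.core as core

    t = core.LoDTensor()
    # Dispatches to SetTensorFromPyArray<platform::IPUPlace>; the numpy
    # buffer is copied into the tensor via host memory.
    t.set(np.ones((2, 3), dtype=np.float32), core.IPUPlace())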
......@@ -1913,6 +1927,58 @@ All parameter, weight, gradient are variables in Paddle.
[](const platform::NPUPlace &self) { return self.GetDeviceId(); })
.def("__str__", string::to_string<const platform::NPUPlace &>);
// IPUPlace
py::class_<platform::IPUPlace>(m, "IPUPlace", R"DOC(
IPUPlace is a descriptor of a device.
It represents an IPU device on which a tensor will be allocated and a model will run.
Examples:
.. code-block:: python
import paddle
# required: ipu
ipu_place = paddle.IPUPlace()
)DOC")
.def("__init__",
[](platform::IPUPlace &self) {
#ifdef PADDLE_WITH_IPU
if (platform::GetIPUDeviceCount() == 0) {
LOG(ERROR) << "Cannot use IPU because there is no IPU "
"detected on your "
"machine.";
std::exit(-1);
}
// Use IPU(0) for compilation; at runtime the number of IPUs used is
// whatever the user configures for sharding and pipelining.
new (&self) platform::IPUPlace(0);
#else
LOG(ERROR) << string::Sprintf(
"Cannot use IPU because you didn't install IPU version "
"PaddlePaddle.\n"
"If you want to use IPU, please try to install IPU version "
"PaddlePaddle by: pip install paddlepaddle*\n"
"If you only have CPU, please change IPUPlace to be "
"CPUPlace().\n");
std::exit(-1);
#endif
})
.def("_type", &PlaceIndex<platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::Place>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CUDAPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::IPUPlace, platform::IPUPlace>)
.def("_equals",
&IsSamePlace<platform::IPUPlace, platform::CUDAPinnedPlace>)
#ifdef PADDLE_WITH_IPU
.def("get_device_id",
[](const platform::IPUPlace &self) { return self.GetDeviceId(); })
#endif
.def("__str__", string::to_string<const platform::IPUPlace &>);
py::class_<platform::Place> platformplace(m, "Place");
g_place_pytype = reinterpret_cast<PyTypeObject *>(platformplace.ptr());
platformplace.def(py::init<>())
......@@ -1922,6 +1988,7 @@ All parameter, weight, gradient are variables in Paddle.
.def("_equals", &IsSamePlace<platform::Place, platform::CPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::XPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::NPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::IPUPlace>)
.def("_equals", &IsSamePlace<platform::Place, platform::CUDAPinnedPlace>)
.def("is_gpu_place",
[](platform::Place &self) { return platform::is_gpu_place(self); })
......@@ -1931,6 +1998,8 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) { return platform::is_xpu_place(self); })
.def("is_npu_place",
[](platform::Place &self) { return platform::is_npu_place(self); })
.def("is_ipu_place",
[](platform::Place &self) { return platform::is_ipu_place(self); })
.def("is_cuda_pinned_place",
[](platform::Place &self) {
return platform::is_cuda_pinned_place(self);
......@@ -1947,6 +2016,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self) {
return BOOST_GET_CONST(platform::NPUPlace, self).device;
})
.def("ipu_device_id",
[](platform::Place &self) {
return BOOST_GET_CONST(platform::IPUPlace, self).device;
})
.def("set_place", [](platform::Place &self,
const platform::Place &other) { self = other; })
.def("set_place",
......@@ -1970,6 +2043,10 @@ All parameter, weight, gradient are variables in Paddle.
[](platform::Place &self, const platform::NPUPlace &npu_place) {
self = npu_place;
})
.def("set_place",
[](platform::Place &self, const platform::IPUPlace &ipu_place) {
self = ipu_place;
})
.def("__repr__", string::to_string<const platform::Place &>)
.def("__str__", string::to_string<const platform::Place &>);
......@@ -2201,6 +2278,7 @@ All parameter, weight, gradient are variables in Paddle.
m.def("is_compiled_with_ascend", IsCompiledWithAscend);
m.def("is_compiled_with_rocm", IsCompiledWithROCM);
m.def("is_compiled_with_npu", IsCompiledWithNPU);
m.def("is_compiled_with_ipu", IsCompiledWithIPU);
m.def("is_compiled_with_xpu", IsCompiledWithXPU);
m.def("is_compiled_with_mkldnn", IsCompiledWithMKLDNN);
m.def("is_compiled_with_cinn", IsCompiledWithCINN);
......@@ -2520,6 +2598,10 @@ All parameter, weight, gradient are variables in Paddle.
});
#endif
#ifdef PADDLE_WITH_IPU
m.def("get_ipu_device_count", platform::GetIPUDeviceCount);
#endif
py::enum_<platform::TracerOption>(m, "TracerOption", py::arithmetic())
.value("kDefault", platform::TracerOption::kDefault)
.value("kOpDetail", platform::TracerOption::kOpDetail)
......@@ -2597,6 +2679,11 @@ All parameter, weight, gradient are variables in Paddle.
bool val) { self.Set<bool>(name, new bool(val)); })
.def("set", [](ir::Pass &self, const std::string &name,
int val) { self.Set<const int>(name, new int(val)); })
.def("set",
[](ir::Pass &self, const std::string &name,
std::vector<std::string> set) {
self.Set(name, new std::vector<std::string>(set));
})
.def("set",
[](ir::Pass &self, const std::string &name,
std::unordered_set<std::string> set) {
......@@ -3429,6 +3516,118 @@ All parameter, weight, gradient are variables in Paddle.
})
.def("device_count", &ParallelExecutor::DeviceCount);
#ifdef PADDLE_WITH_IPU
py::class_<platform::ipu::IpuBackend,
std::shared_ptr<platform::ipu::IpuBackend>>(m, "IpuBackend")
.def(py::init(&platform::ipu::IpuBackend::GetNewInstance))
.def("clear", &platform::ipu::IpuBackend::Clear)
.def("set_scope", &platform::ipu::IpuBackend::SetScope)
.def("set_ipu_strategy", &platform::ipu::IpuBackend::SetIpuStrategy);
py::class_<platform::ipu::IpuStrategy>(m, "IpuStrategy")
.def(py::init())
.def_property(
"num_ipus",
[](const platform::ipu::IpuStrategy &self) { return self.num_ipus; },
[](platform::ipu::IpuStrategy &self, int num_ipus) {
self.num_ipus = num_ipus;
},
R"DOC(
Int type, set the number of IPUs to use. Default 1.
)DOC")
.def_property(
"accumulationFactor",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.accumulationFactor;
},
[](platform::ipu::IpuStrategy &self, int accumulationFactor) {
self.popart_options_.accumulationFactor = accumulationFactor;
},
R"DOC(
Specify the number of micro-batches to accumulate before
applying the varUpdate. Default 1.
)DOC")
.def_property("batches_per_step",
[](const platform::ipu::IpuStrategy &self) {
return self.batches_per_step;
},
[](platform::ipu::IpuStrategy &self, int batches_per_step) {
self.batches_per_step = batches_per_step;
},
R"DOC(
Int type, the number of batches to run in each step. Default 1.
)DOC")
.def_property("is_training",
[](const platform::ipu::IpuStrategy &self) {
return self.is_training;
},
[](platform::ipu::IpuStrategy &self, bool is_training) {
self.is_training = is_training;
},
R"DOC(
Bool type, True for training, False for inference. Default True.
)DOC")
.def_property(
"enable_pipelining",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.enablePipelining;
},
[](platform::ipu::IpuStrategy &self, bool enable_pipelining) {
self.popart_options_.enablePipelining = enable_pipelining;
},
R"DOC(
Bool type, True to enable pipelining, False to disable. Default False.
)DOC")
.def_property(
"enable_manual_shard",
[](const platform::ipu::IpuStrategy &self) {
return self.popart_options_.virtualGraphMode ==
platform::ipu::VirtualGraphMode::Manual;
},
[](platform::ipu::IpuStrategy &self, bool enable_ipu_shard) {
if (enable_ipu_shard) {
self.popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Manual;
} else {
self.popart_options_.virtualGraphMode =
platform::ipu::VirtualGraphMode::Off;
}
},
R"DOC(
Bool type, True to enable model sharding, False to disable. Default False.
)DOC")
.def_property("need_avg_shard",
[](const platform::ipu::IpuStrategy &self) {
return self.need_avg_shard;
},
[](platform::ipu::IpuStrategy &self, bool need_avg_shard) {
self.need_avg_shard = need_avg_shard;
},
R"DOC(
Bool type, True to enable avg shard, False to disable. Default False.
)DOC")
.def_property("batch_size",
[](const platform::ipu::IpuStrategy &self) {
return self.batch_size;
},
[](platform::ipu::IpuStrategy &self, int batch_size) {
self.batch_size = batch_size;
},
R"DOC(
Int type, used to make batch size fixed. Default 1.
)DOC")
.def_property("enable_fp16",
[](const platform::ipu::IpuStrategy &self) {
return self.enable_fp16;
},
[](platform::ipu::IpuStrategy &self, bool enable_fp16) {
self.enable_fp16 = enable_fp16;
},
R"DOC(
Bool type, True to enable float16 mode, False to disable. Default False.)DOC");
#endif
BindFleetWrapper(&m);
BindIO(&m);
......
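For reference, a hedged sketch of driving the IpuBackend/IpuStrategy bindings above from Python (attribute names as bound in the hunk; assumes an IPU build):

    import paddle.fluid.core as core

    strategy = core.IpuStrategy()
    strategy.num_ipus = 2                # request two IPUs
    strategy.is_training = False         # inference mode
    strategy.batches_per_step = 4
    strategy.enable_pipelining = True
    strategy.enable_manual_shard = True  # VirtualGraphMode::Manual under the hood

    backend = core.IpuBackend()          # constructed via IpuBackend::GetNewInstance
    backend.set_ipu_strategy(strategy)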
......@@ -313,6 +313,21 @@ void SetTensorFromPyArrayT(
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (paddle::platform::is_ipu_place(place)) {
#ifdef PADDLE_WITH_IPU
if (zero_copy) {
auto holder = std::make_shared<details::NumpyAllocation<T>>(array);
auto type = framework::ToDataType(std::type_index(typeid(T)));
self->ResetHolderWithType(holder, type);
} else {
auto dst = self->mutable_data<T>(place);
std::memcpy(dst, array.data(), array.nbytes());
}
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use IPUPlace in CPU/GPU/XPU/NPU version, "
"Please recompile or reinstall Paddle with IPU support."));
#endif
} else if (paddle::platform::is_npu_place(place)) {
#ifdef PADDLE_WITH_ASCEND_CL
......
......@@ -260,6 +260,7 @@ from .framework.random import set_cuda_rng_state # noqa: F401
from .framework import ParamAttr # noqa: F401
from .framework import create_parameter # noqa: F401
from .framework import CPUPlace # noqa: F401
from .framework import IPUPlace # noqa: F401
from .framework import CUDAPlace # noqa: F401
from .framework import NPUPlace # noqa: F401
from .framework import CUDAPinnedPlace # noqa: F401
......@@ -291,6 +292,7 @@ from .fluid.framework import get_flags # noqa: F401
from .fluid.framework import set_flags # noqa: F401
from .device import is_compiled_with_xpu # noqa: F401
from .device import is_compiled_with_npu # noqa: F401
from .device import is_compiled_with_ipu # noqa: F401
from .device import XPUPlace # noqa: F401
from .fluid.dygraph.base import enable_dygraph as disable_static # noqa: F401
......
......@@ -28,7 +28,9 @@ __all__ = [ # noqa
'set_device',
'get_device',
'XPUPlace',
'IPUPlace',
'is_compiled_with_xpu',
'is_compiled_with_ipu',
'is_compiled_with_cinn',
'is_compiled_with_cuda',
'is_compiled_with_rocm',
......@@ -55,6 +57,36 @@ def is_compiled_with_npu():
return core.is_compiled_with_npu()
def is_compiled_with_ipu():
"""
Whether paddle was built with WITH_IPU=ON to support Graphcore IPU.
Returns (bool): `True` if IPU is supported, otherwise `False`.
Examples:
.. code-block:: python
import paddle
support_ipu = paddle.is_compiled_with_ipu()
"""
return core.is_compiled_with_ipu()
def IPUPlace():
"""
Return a Graphcore IPU Place
Examples:
.. code-block:: python
# required: ipu
import paddle
place = paddle.device.IPUPlace()
"""
return core.IPUPlace()
def is_compiled_with_xpu():
"""
Whether paddle was built with WITH_XPU=ON to support Baidu Kunlun
......@@ -143,13 +175,19 @@ def _convert_to_place(device):
selected_npus = os.getenv("FLAGS_selected_npus", "0").split(",")
device_id = int(selected_npus[0])
place = core.NPUPlace(device_id)
elif lower_device == 'ipu':
if not core.is_compiled_with_ipu():
raise ValueError(
"The device should not be 'ipu', " \
"since PaddlePaddle is not compiled with IPU")
place = core.IPUPlace()
else:
avaliable_gpu_device = re.match(r'gpu:\d+', lower_device)
avaliable_xpu_device = re.match(r'xpu:\d+', lower_device)
avaliable_npu_device = re.match(r'npu:\d+', lower_device)
if not avaliable_gpu_device and not avaliable_xpu_device and not avaliable_npu_device:
raise ValueError(
"The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu' or 'npu:x'"
"The device must be a string which is like 'cpu', 'gpu', 'gpu:x', 'xpu', 'xpu:x', 'npu', 'npu:x' or ipu"
)
if avaliable_gpu_device:
if not core.is_compiled_with_cuda():
......@@ -183,13 +221,13 @@ def _convert_to_place(device):
def set_device(device):
"""
Paddle supports running calculations on various types of devices, including CPU, GPU, XPU, NPU and IPU.
They are represented by string identifiers. This function can specify the global device
which the OP will run.
Parameters:
device(str): This parameter determines the specific running device.
It can be ``cpu``, ``gpu``, ``xpu``, ``npu``, ``gpu:x``, ``xpu:x``, ``npu:x`` and ``ipu``,
where ``x`` is the index of the GPUs, XPUs or NPUs.
Examples:
......@@ -236,5 +274,10 @@ def get_device():
elif isinstance(place, core.NPUPlace):
device_id = place.get_device_id()
device = 'npu:' + str(device_id)
elif isinstance(place, core.IPUPlace):
num_devices = core.get_ipu_device_count()
device = "ipus:{{0-{}}}".format(num_devices - 1)
else:
raise ValueError("The device specification {} is invalid".format(place))
return device
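Putting the Python device API together, a hedged end-to-end sketch (requires an IPU build with visible devices; the `ipus:{0-N}` string is exactly what `get_device` formats above):

    import paddle

    if paddle.is_compiled_with_ipu():
        paddle.device.set_device('ipu')    # IPU takes no per-device index
        print(paddle.device.get_device())  # e.g. 'ipus:{0-3}' with 4 IPUs visible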
......@@ -71,7 +71,7 @@ from . import distribute_lookup_table
from .param_attr import ParamAttr, WeightNormParamAttr
from .data_feeder import DataFeeder
from .core import LoDTensor, LoDTensorArray, Scope, _Scope
from .core import CPUPlace, XPUPlace, CUDAPlace, CUDAPinnedPlace, NPUPlace, IPUPlace
from .incubate import fleet
from .transpiler import DistributeTranspiler, \
memory_optimize, release_memory, DistributeTranspilerConfig
......@@ -132,6 +132,7 @@ __all__ = framework.__all__ + executor.__all__ + \
'CUDAPlace',
'CUDAPinnedPlace',
'NPUPlace',
'IPUPlace',
'Tensor',
'ParamAttr',
'WeightNormParamAttr',
......@@ -197,6 +198,11 @@ def __bootstrap__():
if os.name == 'nt':
remove_flag_if_exists('cpu_deterministic')
if core.is_compiled_with_ipu():
# Currently we request all available IPUs for training and testing;
# finer-grained control over IPU pods will be added later.
read_env_flags += []
core.init_gflags(["--tryfromenv=" + ",".join(read_env_flags)])
# Note(zhouwei25): sys may not have argv in some cases,
# Such as: use Python/C API to call Python from C++
......
......@@ -23,6 +23,7 @@ from .framework import set_grad_enabled # noqa: F401
from ..fluid.param_attr import ParamAttr # noqa: F401
from ..fluid.layers.tensor import create_parameter # noqa: F401
from ..fluid.core import CPUPlace # noqa: F401
from ..fluid.core import IPUPlace # noqa: F401
from ..fluid.core import CUDAPlace # noqa: F401
from ..fluid.core import CUDAPinnedPlace # noqa: F401
from ..fluid.core import NPUPlace # noqa: F401
......