机器未来 / Paddle (forked from PaddlePaddle / Paddle)

Commit c9a3c669 (unverified)
add ipu device p1 (#37841)

Authored on Dec 07, 2021 by jianghaicheng; committed via GitHub on Dec 07, 2021.
Parent: de874cdd
Showing 15 changed files with 214 additions and 14 deletions (+214 −14)
paddle/fluid/framework/garbage_collector.cc           +9   -0
paddle/fluid/framework/garbage_collector.h            +10  -0
paddle/fluid/framework/library_type.h                 +3   -1
paddle/fluid/memory/allocation/CMakeLists.txt         +2   -0
paddle/fluid/memory/allocation/allocator_facade.cc    +38  -0
paddle/fluid/memory/memcpy.cc                         +26  -0
paddle/fluid/platform/CMakeLists.txt                  +7   -1
paddle/fluid/platform/device/ipu/CMakeLists.txt       +12  -8
paddle/fluid/platform/device/ipu/device.cc            +1   -1
paddle/fluid/platform/device/ipu/ipu_info.cc          +32  -0
paddle/fluid/platform/device/ipu/ipu_info.h           +24  -0
paddle/fluid/platform/device/ipu/ipu_optimizer.cc     +1   -1
paddle/fluid/platform/device_context.h                +30  -2
paddle/fluid/platform/init.cc                         +16  -0
paddle/fluid/pybind/CMakeLists.txt                    +3   -0
paddle/fluid/framework/garbage_collector.cc
@@ -53,6 +53,15 @@ void XPUGarbageCollector::ClearCallback(const std::function<void()> &callback) {
}
#endif

#ifdef PADDLE_WITH_IPU
IPUGarbageCollector::IPUGarbageCollector(const platform::IPUPlace &place,
                                         size_t max_memory_size)
    : GarbageCollector(place, max_memory_size) {}

void IPUGarbageCollector::ClearCallback(
    const std::function<void()> &callback) {
  callback();
}
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
UnsafeFastGPUGarbageCollector::UnsafeFastGPUGarbageCollector(
    const platform::CUDAPlace &place, size_t max_memory_size)
paddle/fluid/framework/garbage_collector.h
@@ -80,6 +80,16 @@ class XPUGarbageCollector : public GarbageCollector {
};
#endif

#ifdef PADDLE_WITH_IPU
class IPUGarbageCollector : public GarbageCollector {
 public:
  IPUGarbageCollector(const platform::IPUPlace &place, size_t max_memory_size);

 protected:
  void ClearCallback(const std::function<void()> &callback) override;
};
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
class UnsafeFastGPUGarbageCollector : public GarbageCollector {
 public:
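Note: the IPU collector follows the same shape as the existing CPU/XPU collectors, and its ClearCallback simply invokes the callback in place since there is no device stream to defer to. Below is a standalone sketch of that pattern using illustrative Demo* names, not Paddle's real GarbageCollector API.

#include <cstddef>
#include <functional>
#include <iostream>

// Illustrative stand-ins for the pattern shown in garbage_collector.h above:
// a base collector that delegates scheduling of a deallocation callback to
// ClearCallback, and an IPU-style subclass that runs it synchronously.
struct DemoIPUPlace {
  int device_id;
};

class DemoGarbageCollector {
 public:
  DemoGarbageCollector(const DemoIPUPlace &place, std::size_t max_memory_size)
      : place_(place), max_memory_size_(max_memory_size) {}
  virtual ~DemoGarbageCollector() = default;

  // Hand a pending deallocation to the collector.
  void Collect(const std::function<void()> &cb) { ClearCallback(cb); }

 protected:
  virtual void ClearCallback(const std::function<void()> &callback) = 0;

  DemoIPUPlace place_;
  std::size_t max_memory_size_;
};

class DemoIPUGarbageCollector : public DemoGarbageCollector {
 public:
  using DemoGarbageCollector::DemoGarbageCollector;

 protected:
  void ClearCallback(const std::function<void()> &callback) override {
    callback();  // mirrors IPUGarbageCollector: free immediately
  }
};

int main() {
  DemoIPUGarbageCollector gc(DemoIPUPlace{0}, /*max_memory_size=*/0);
  gc.Collect([] { std::cout << "buffer released\n"; });
  return 0;
}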
paddle/fluid/framework/library_type.h
@@ -61,6 +61,8 @@ inline LibraryType StringToLibraryType(const char* ctype) {
    return LibraryType::kPlain;
  } else if (s == std::string("XPU")) {
    return LibraryType::kPlain;
  } else if (s == std::string("IPU")) {
    return LibraryType::kPlain;
  } else if (s == std::string("NPU")) {
    return LibraryType::kPlain;
  } else if (s == std::string("CUDA")) {
@@ -68,7 +70,7 @@ inline LibraryType StringToLibraryType(const char* ctype) {
  } else {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Unknown LibraryType string (%s), only support library type string "
-       "include PLAIN, MKLDNN, CUDNN, CPU and CUDA.",
+       "include PLAIN, MKLDNN, CUDNN, CPU, CUDA and IPU.",
        s.c_str()));
  }
}
paddle/fluid/memory/allocation/CMakeLists.txt
@@ -48,6 +48,8 @@ if (WITH_GPU OR WITH_ROCM)
  endif()
elseif(WITH_XPU)
  set(AllocatorFacadeDeps xpu_info)
elseif(WITH_IPU)
  set(AllocatorFacadeDeps ipu_info)
elseif(WITH_ASCEND)
  set(AllocatorFacadeDeps ascend_npu_info)
else()
paddle/fluid/memory/allocation/allocator_facade.cc
@@ -51,6 +51,10 @@
#include "paddle/fluid/memory/allocation/npu_pinned_allocator.h"
#endif

#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif

PADDLE_DEFINE_EXPORTED_int64(
    gpu_allocator_retry_time, 10000,
    "The retry time (milliseconds) when allocator fails "
@@ -136,6 +140,11 @@ class AllocatorFacadePrivate {
    switch (strategy_) {
      case AllocatorStrategy::kNaiveBestFit: {
        InitNaiveBestFitCPUAllocator();
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        if (FLAGS_use_stream_safe_cuda_allocator) {
          LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for "
@@ -186,6 +195,11 @@ class AllocatorFacadePrivate {
        for (int dev_id = 0; dev_id < platform::GetXPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
        break;
      }
@@ -197,6 +211,11 @@ class AllocatorFacadePrivate {
          InitNaiveBestFitXPUAllocator(platform::XPUPlace(dev_id));
        }
#endif
#ifdef PADDLE_WITH_IPU
        for (int dev_id = 0; dev_id < platform::GetIPUDeviceCount(); ++dev_id) {
          InitNaiveBestFitIPUAllocator(platform::IPUPlace(dev_id));
        }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
        if (FLAGS_use_stream_safe_cuda_allocator) {
          LOG(WARNING) << "FLAGS_use_stream_safe_cuda_allocator is invalid for "
@@ -570,6 +589,12 @@ class AllocatorFacadePrivate {
  }
#endif

#ifdef PADDLE_WITH_IPU
  void InitNaiveBestFitIPUAllocator(platform::IPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
  }
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  void InitNaiveBestFitNPUAllocator(platform::NPUPlace p) {
    allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
@@ -591,6 +616,13 @@ class AllocatorFacadePrivate {
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int i = 0; i < device_count; ++i) {
      platform::IPUPlace p(i);
      system_allocators_[p] = std::make_shared<NaiveBestFitAllocator>(p);
    }
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
    system_allocators_[platform::CUDAPinnedPlace()] =
        std::make_shared<CPUPinnedAllocator>();
@@ -625,6 +657,12 @@ class AllocatorFacadePrivate {
      places.emplace_back(platform::NPUPlace(dev_id));
    }
#endif
#ifdef PADDLE_WITH_IPU
    int device_count = platform::GetIPUDeviceCount();
    for (int dev_id = 0; dev_id < device_count; ++dev_id) {
      places.emplace_back(platform::IPUPlace(dev_id));
    }
#endif
    for (auto &p : places) {
      zero_size_allocators_[p] = std::make_shared<ZeroSizeAllocator>(p);
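With these hunks, an allocation request against an IPUPlace resolves to a per-device NaiveBestFitAllocator under the naive-best-fit strategy. A hedged usage sketch follows; AllocatorFacade::Instance().Alloc is the facade's existing entry point and is assumed, not added, by this commit.

#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/fluid/platform/place.h"

// Sketch only: needs a Paddle build configured with PADDLE_WITH_IPU.
void AllocOnIpuExample() {
  paddle::platform::IPUPlace place(0);  // first IPU registered by the patch
  // The facade resolves allocators_[place], which the hunks above populate
  // with a NaiveBestFitAllocator for every visible IPU device.
  auto allocation =
      paddle::memory::allocation::AllocatorFacade::Instance().Alloc(
          place, /*size=*/1024);
  (void)allocation;
}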
paddle/fluid/memory/memcpy.cc
@@ -33,6 +33,32 @@ void Copy<platform::CPUPlace, platform::CPUPlace>(platform::CPUPlace, void* dst,
  VLOG(4) << "src: " << src << ", dst: " << dst << ", num: " << num;
  std::memcpy(dst, src, num);
}

#ifdef PADDLE_WITH_IPU
template <>
void Copy<platform::IPUPlace, platform::CPUPlace>(platform::IPUPlace dst_place,
                                                  void* dst,
                                                  platform::CPUPlace src_place,
                                                  const void* src, size_t num) {
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}
template <>
void Copy<platform::CPUPlace, platform::IPUPlace>(platform::CPUPlace dst_place,
                                                  void* dst,
                                                  platform::IPUPlace src_place,
                                                  const void* src, size_t num) {
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}
template <>
void Copy<platform::IPUPlace, platform::IPUPlace>(platform::IPUPlace dst_place,
                                                  void* dst,
                                                  platform::IPUPlace src_place,
                                                  const void* src, size_t num) {
  if (UNLIKELY(num == 0)) return;
  std::memcpy(dst, src, num);
}
#endif

#ifdef PADDLE_WITH_XPU
template <>
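These specializations keep host/IPU transfers as plain std::memcpy at this stage; device-side data movement is left to the IPU backend. A hedged sketch of calling the templated Copy entry point, with the signature taken from the hunk above (the buffers are illustrative):

#include <vector>

#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/platform/place.h"

// Sketch only: needs a Paddle build configured with PADDLE_WITH_IPU.
void CopyHostToIpuExample() {
  std::vector<float> src(256, 1.0f);
  std::vector<float> dst(256, 0.0f);  // stands in for an IPU-side buffer

  // Resolves to Copy<platform::IPUPlace, platform::CPUPlace> above, which is
  // a plain std::memcpy for num > 0 and a no-op for num == 0.
  paddle::memory::Copy(paddle::platform::IPUPlace(0), dst.data(),
                       paddle::platform::CPUPlace(), src.data(),
                       src.size() * sizeof(float));
}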
paddle/fluid/platform/CMakeLists.txt
@@ -71,6 +71,12 @@ IF(WITH_GPU OR WITH_ROCM)
  set(GPU_CTX_DEPS dynload_cuda dynamic_loader cuda_stream)
ENDIF()

IF(WITH_IPU)
  set(IPU_CTX_DEPS ipu_backend)
ELSE()
  set(IPU_CTX_DEPS)
ENDIF(WITH_IPU)

IF(WITH_ASCEND_CL)
  set(NPU_CTX_DEPS npu_stream npu_info)
ENDIF()
@@ -109,7 +115,7 @@ cc_library(cudnn_workspace_helper SRCS cudnn_workspace_helper.cc DEPS boost)
# memcpy depends on device_context, here add deps individually for
# avoiding cycle dependencies
cc_library(device_context SRCS device_context.cc init.cc DEPS simple_threadpool malloc xxhash ${STREAM_CALLBACK_DEPS}
-    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
+    place eigen3 stringpiece cpu_helper cpu_info framework_proto ${IPU_CTX_DEPS} ${GPU_CTX_DEPS} ${NPU_CTX_DEPS} ${MKLDNN_CTX_DEPS}
    ${dgc_deps} dlpack cudnn_workspace_helper ${XPU_CTX_DEPS})

cc_library(collective_helper SRCS collective_helper.cc gen_comm_id_helper.cc DEPS framework_proto device_context enforce)
paddle/fluid/platform/device/ipu/CMakeLists.txt
-cc_library(ipu_device SRCS device.cc DEPS enforce popart)
-cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
-cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
-cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce)
-cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto)
-cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils)
-cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper)
-cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper)
+# IPU
+IF(WITH_IPU)
+  cc_library(ipu_device SRCS device.cc DEPS enforce popart)
+  cc_library(ipu_utils SRCS ipu_utils.cc DEPS memory framework_proto popart)
+  cc_library(ipu_strategy SRCS ipu_strategy.cc DEPS popart graph framework_proto enforce)
+  cc_library(ipu_optimizer SRCS ipu_optimizer.cc DEPS popart enforce)
+  cc_library(ipu_executor SRCS ipu_executor.cc DEPS ipu_optimizer ipu_utils popart graph framework_proto)
+  cc_library(popart_canonicalization_utils SRCS ${POPART_CANONICALIZATION_SRC} DEPS framework_proto enforce ipu_utils)
+  cc_library(ipu_compiler SRCS ipu_compiler.cc DEPS popart graph ipu_utils graph_helper)
+  cc_library(ipu_backend SRCS ipu_backend.cc DEPS popart ipu_compiler graph framework_proto enforce ipu_utils ipu_strategy ipu_device ipu_executor graph_helper)
+  cc_library(ipu_info SRCS ipu_info.cc DEPS ipu_backend)
+ENDIF()
paddle/fluid/platform/device/ipu/device.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

-#include "paddle/fluid/platform/ipu/device.h"
+#include "paddle/fluid/platform/device/ipu/device.h"

namespace paddle {
namespace platform {
paddle/fluid/platform/device/ipu/ipu_info.cc (new file, mode 100644)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#include "paddle/fluid/platform/device/ipu/ipu_backend.h"

namespace paddle {
namespace platform {

//! Get a list of device ids from environment variable or use all.
std::vector<int> GetSelectedIPUDevices() {
  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
      platform::ipu::IpuBackend::GetInstance();
  return ipu_backend->GetDeviceIds();
}

//! Get the total number of IPU devices in system.
int GetIPUDeviceCount() {
  std::shared_ptr<platform::ipu::IpuBackend> ipu_backend =
      platform::ipu::IpuBackend::GetInstance();
  return ipu_backend->GetNumDevices();
}

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/device/ipu/ipu_info.h (new file, mode 100644)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#ifdef PADDLE_WITH_IPU
#include <memory>
#include <vector>

#include "glog/logging.h"

namespace paddle {
namespace platform {

std::vector<int> GetSelectedIPUDevices();

int GetIPUDeviceCount();

}  // namespace platform
}  // namespace paddle
#endif
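Together, ipu_info.{h,cc} give the rest of the framework an enumeration API analogous to the existing XPU/NPU helpers. A hedged sketch of a caller, essentially what init.cc does further down; the logging line is illustrative:

#include <vector>

#include "glog/logging.h"
#include "paddle/fluid/platform/device/ipu/ipu_info.h"

// Sketch only: needs PADDLE_WITH_IPU and a reachable IPU backend at runtime.
std::vector<int> EnumerateIpusExample() {
  int count = paddle::platform::GetIPUDeviceCount();
  LOG(INFO) << "visible IPU devices: " << count;

  // Either the user-selected subset or all devices, per the comment in
  // ipu_info.cc above.
  return paddle::platform::GetSelectedIPUDevices();
}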
paddle/fluid/platform/device/ipu/ipu_optimizer.cc
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

-#include "paddle/fluid/platform/ipu/ipu_optimizer.h"
+#include "paddle/fluid/platform/device/ipu/ipu_optimizer.h"

namespace paddle {
namespace platform {
paddle/fluid/platform/device_context.h
@@ -62,6 +62,9 @@ limitations under the License. */
#include "paddle/fluid/platform/device/npu/enforce_npu.h"
#include "paddle/fluid/platform/device/npu/npu_stream.h"
#endif
#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/device.h"
#endif

#include "unsupported/Eigen/CXX11/Tensor"

namespace Eigen {
@@ -99,8 +102,8 @@ enum DeviceType {
  CUDA = 1,
  XPU = 2,
  NPU = 3,
-  MAX_DEVICE_TYPES = 4,
+  IPU = 4,
+  MAX_DEVICE_TYPES = 5,
};

DeviceType Place2DeviceType(const platform::Place& place);
@@ -109,6 +112,7 @@ constexpr DeviceType kCPU = DeviceType::CPU;
constexpr DeviceType kCUDA = DeviceType::CUDA;
constexpr DeviceType kXPU = DeviceType::XPU;
constexpr DeviceType kNPU = DeviceType::NPU;
constexpr DeviceType kIPU = DeviceType::IPU;

class DeviceContext {
 public:
@@ -140,6 +144,30 @@ struct DefaultDeviceContextType<platform::CPUPlace> {
  using TYPE = CPUDeviceContext;
};

// Graphcore IPU
#ifdef PADDLE_WITH_IPU
class IPUDeviceContext : public DeviceContext {
 public:
  IPUDeviceContext() = delete;
  explicit IPUDeviceContext(IPUPlace place);
  virtual ~IPUDeviceContext();
  Eigen::DefaultDevice* eigen_device() const { return nullptr; }
  Place GetPlace() const override;
  /*! \brief Wait for all operations completion in the stream. */
  void Wait() const override;
  int DeviceId() const { return device_.getId(); }

 private:
  IPUPlace place_;
  platform::ipu::Device device_;
};

template <>
struct DefaultDeviceContextType<platform::IPUPlace> {
  using TYPE = IPUDeviceContext;
};
#endif

#ifdef PADDLE_WITH_XPU
namespace xpu = baidu::xpu::api;
class XPUDeviceContext : public DeviceContext {
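The DefaultDeviceContextType specialization maps IPUPlace to IPUDeviceContext at compile time. A hedged sketch of that lookup follows; DeviceContextPool is Paddle's pre-existing per-place context registry, and whether it already constructs IPU contexts in this first patch is not shown here.

#include <type_traits>

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"

// Sketch only: needs a Paddle build configured with PADDLE_WITH_IPU.
void IpuContextExample() {
  // Compile-time mapping introduced by the specialization above.
  using IpuCtx = paddle::platform::DefaultDeviceContextType<
      paddle::platform::IPUPlace>::TYPE;
  static_assert(
      std::is_same<IpuCtx, paddle::platform::IPUDeviceContext>::value,
      "IPUPlace should default to IPUDeviceContext");

  // Runtime lookup through the existing per-place context pool (assumed).
  paddle::platform::IPUPlace place(0);
  paddle::platform::DeviceContext* ctx =
      paddle::platform::DeviceContextPool::Instance().Get(place);
  ctx->Wait();  // IPUDeviceContext::Wait() per the declaration above
}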
paddle/fluid/platform/init.cc
@@ -45,6 +45,10 @@ limitations under the License. */
#include "DbgHelp.h"
#endif

#ifdef PADDLE_WITH_IPU
#include "paddle/fluid/platform/device/ipu/ipu_info.h"
#endif

DECLARE_int32(paddle_num_threads);
PADDLE_DEFINE_EXPORTED_int32(multiple_of_cupti_buffer_size, 1,
@@ -164,6 +168,15 @@ void InitDevices() {
    LOG(WARNING) << "Compiled with PADDLE_WITH_ASCEND_CL, but no NPU found in runtime.";
  }
#endif
#ifdef PADDLE_WITH_IPU
  try {
    // use user specified IPUs.
    devices = platform::GetSelectedIPUDevices();
  } catch (const std::exception &exp) {
    LOG(WARNING) << "Compiled with PADDLE_WITH_IPU, but no IPU found in runtime.";
  }
#endif
  InitDevices(devices);
}
@@ -185,6 +198,9 @@ void InitDevices(const std::vector<int> devices) {
#ifdef PADDLE_WITH_XPU
    places.emplace_back(platform::XPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_IPU
    places.emplace_back(platform::IPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
    places.emplace_back(platform::NPUPlace(devices[i]));
#endif
paddle/fluid/pybind/CMakeLists.txt
@@ -16,6 +16,9 @@ endif()
if(WITH_GPU)
  set(PYBIND_DEPS ${PYBIND_DEPS} cuda_profiler)
endif()
if(WITH_IPU)
  set(PYBIND_DEPS ${PYBIND_DEPS} ipu_info)
endif()
if(WITH_NCCL OR WITH_RCCL)
  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)