fix xpu dygraph place (#30868)

6e3856d3 · WangXi · GitHub · 35c5b23f · 6e3856d3 · 6e3856d3
5 changed files
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -305,10 +305,30 @@ TEST(test_tracer, test_expected_place) {
  // default expected place is CPUPlace
  imperative::Tracer tracer;
  ASSERT_EQ(platform::is_cpu_place(tracer.ExpectedPlace()), true);
+  {
+#ifdef PADDLE_WITH_CUDA
    // set to CUDAPlace
    platform::CUDAPlace gpu_place(0);
    tracer.SetExpectedPlace(gpu_place);
    ASSERT_EQ(platform::is_gpu_place(tracer.ExpectedPlace()), true);
+    // assert throw
+    platform::XPUPlace xpu_place(0);
+    ASSERT_THROW(tracer.SetExpectedPlace(xpu_place), platform::EnforceNotMet);
+#endif
+  }
+  {
+#ifdef PADDLE_WITH_XPU
+    // set to XPUPlace
+    platform::XPUPlace xpu_place(0);
+    tracer.SetExpectedPlace(xpu_place);
+    ASSERT_EQ(platform::is_xpu_place(tracer.ExpectedPlace()), true);
+    // assert throw
+    platform::CUDAPlace cuda_place(0);
+    ASSERT_THROW(tracer.SetExpectedPlace(cuda_place), platform::EnforceNotMet);
+#endif
+  }
 }
 TEST(test_tracer, test_var_without_grad_var) {

--- a/paddle/fluid/imperative/tracer.cc
+++ b/paddle/fluid/imperative/tracer.cc
@@ -198,6 +198,26 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
          inplace_map);
 }
+void Tracer::SetExpectedPlace(platform::Place place) {
+  // NOTE(wangxi): set the device id before launching any device kernel.
+  if (platform::is_gpu_place(place)) {
+#ifdef PADDLE_WITH_CUDA
+    platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device);
+#else  // !PADDLE_WITH_CUDA: a CUDAPlace is rejected in this build
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with GPU if use CUDAPlace."));
+#endif  // PADDLE_WITH_CUDA
+  } else if (platform::is_xpu_place(place)) {
+#ifdef PADDLE_WITH_XPU
+    platform::SetXPUDeviceId(BOOST_GET_CONST(platform::XPUPlace, place).device);
+#else  // !PADDLE_WITH_XPU: an XPUPlace is rejected in this build
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddlePaddle should compile with XPU if use XPUPlace."));
+#endif  // PADDLE_WITH_XPU
+  }  // any other place falls through without a device-id call
+  expected_place_ = place;  // stored only when no branch above has thrown
+}
 bool Tracer::ComputeRequiredGrad(const NameVarBaseMap& ins,
                                 const NameVarBaseMap& outs,
                                 bool trace_backward) {

--- a/paddle/fluid/imperative/tracer.h
+++ b/paddle/fluid/imperative/tracer.h
@@ -99,7 +99,7 @@ class Tracer {
  platform::Place ExpectedPlace() const { return expected_place_; }
-  void SetExpectedPlace(platform::Place place) { expected_place_ = place; }
+  void SetExpectedPlace(platform::Place place);
  bool HasGrad() const { return has_grad_; }

--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -1207,15 +1207,6 @@ void BindImperative(py::module *m_ptr) {
            if (py::isinstance<platform::CUDAPlace>(obj)) {
              auto p = obj.cast<platform::CUDAPlace *>();
              self.SetExpectedPlace(*p);
-// NOTE(zhiqiu): When switching cuda place, we need to set the
-// cuda device id.
-// Otherwise, some cuda API may be launched at other cuda place,
-// which may cost hundreds of MB of GPU memory due to the cuda
-// lib.
-#ifdef PADDLE_WITH_CUDA
-              platform::SetDeviceId(p->device);
-#endif
              VLOG(4) << "Tracer(" << &self << ")"
                      << " set expected place " << *p;
            } else if (py::isinstance<platform::XPUPlace>(obj)) {
@@ -1236,13 +1227,6 @@ void BindImperative(py::module *m_ptr) {
            } else if (py::isinstance<platform::Place>(obj)) {
              auto p = obj.cast<platform::Place *>();
              self.SetExpectedPlace(*p);
-              if (platform::is_gpu_place(*p)) {
-// NOTE(zhiqu): same as obj is CUDAPlace.
-#ifdef PADDLE_WITH_CUDA
-                platform::SetDeviceId(
-                    BOOST_GET_CONST(platform::CUDAPlace, *p).device);
-#endif
-              }
              VLOG(4) << "Tracer(" << &self << ")"
                      << " set expected place " << *p;
            } else {

--- a/paddle/fluid/pybind/tensor_py.h
+++ b/paddle/fluid/pybind/tensor_py.h
@@ -259,38 +259,6 @@ void TensorSetElement(framework::Tensor *self, size_t offset, T elem) {
  }
 }
-// NOTE(wangxi): When copying data to the accelerator card,
-// we need set_device(dev_id) first.
-template <typename P>
-static int GetDeviceId(const P &place) {
-  // for CPUPlace and CUDAPinnedPlace.
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "Paddle can't Get CPUPlace or CUDAPinnedPlace Device Id."));
-}
-template <>
-int GetDeviceId<platform::CUDAPlace>(const platform::CUDAPlace &place) {
-  return place.GetDeviceId();
-}
-template <>
-int GetDeviceId<platform::XPUPlace>(const platform::XPUPlace &place) {
-  return place.GetDeviceId();
-}
-// NOTE(wangxi16): Used by VarBase __setitem__
-template <>
-int GetDeviceId<platform::Place>(const platform::Place &place) {
-  if (paddle::platform::is_gpu_place(place)) {
-    return GetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place));
-  } else if (paddle::platform::is_xpu_place(place)) {
-    return GetDeviceId(BOOST_GET_CONST(platform::XPUPlace, place));
-  }
-  // for CPUPlace and CUDAPinnedPlace.
-  PADDLE_THROW(platform::errors::PermissionDenied(
-      "Paddle can't Get CPUPlace or CUDAPinnedPlace Device Id."));
-}
 template <typename T, typename P>
 void SetTensorFromPyArrayT(
    framework::Tensor *self,
@@ -314,7 +282,11 @@ void SetTensorFromPyArrayT(
    }
  } else if (paddle::platform::is_xpu_place(place)) {
 #ifdef PADDLE_WITH_XPU
-    platform::XPUDeviceGuard guard(GetDeviceId(place));
+    // NOTE(wangxi): When copying data to the accelerator card,
+    // we need set_device(dev_id) first.
+    platform::Place tmp_place = place;
+    platform::XPUDeviceGuard guard(
+        BOOST_GET_CONST(platform::XPUPlace, tmp_place).device);
    auto dst = self->mutable_data<T>(place);
    xpu_memcpy(dst, array.data(), array.nbytes(),
               XPUMemcpyKind::XPU_HOST_TO_DEVICE);
@@ -326,7 +298,11 @@ void SetTensorFromPyArrayT(
  } else {
 #ifdef PADDLE_WITH_CUDA
    if (paddle::platform::is_gpu_place(place)) {
-      platform::CUDADeviceGuard guard(GetDeviceId(place));
+      // NOTE(wangxi): When copying data to the accelerator card,
+      // we need set_device(dev_id) first.
+      platform::Place tmp_place = place;
+      platform::CUDADeviceGuard guard(
+          BOOST_GET_CONST(platform::CUDAPlace, tmp_place).device);
      auto dst = self->mutable_data<T>(place);
      paddle::platform::GpuMemcpySync(dst, array.data(), array.nbytes(),
                                      cudaMemcpyHostToDevice);