Commit 50de8a4f (unverified)
Authored by zhouweiwei2014 on Jul 26, 2022; committed via GitHub on Jul 26, 2022.

fix behavior of device_id=None in Tensor.cuda (#44515)

* fix behavior of device_id=None in Tensor.cuda
* fix CI

Parent: 98f8fa4c
Showing 4 changed files with 23 additions and 17 deletions:
  paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu   (+1, -1)
  paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu        (+1, -1)
  python/paddle/fluid/dygraph/varbase_patch_methods.py           (+11, -6)
  python/paddle/fluid/tests/unittests/test_var_base.py           (+10, -9)
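As a hedged illustration of the change this commit describes (not part of the diff itself): after the fix, Tensor.cuda() called with the default device_id=None moves the tensor to the currently expected CUDA place instead of always copying to GPU 0. The sketch below assumes a CUDA build of Paddle with at least two visible GPUs; the device indices are only illustrative.

import paddle

paddle.set_device("gpu:1")                        # current expected place becomes GPU 1
x = paddle.to_tensor([1.0, 2.0], place=paddle.CPUPlace())

y = x.cuda()                                      # device_id defaults to None
print(y.place)                                    # after this fix: Place(gpu:1); previously: Place(gpu:0)

z = x.cuda(device_id=0)                           # an explicit integer still selects that GPU
print(z.place)                                    # Place(gpu:0)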
paddle/phi/kernels/sparse/gpu/fused_attention_grad_kernel.cu

@@ -75,7 +75,7 @@ void FusedAttentionCsrGradKernel(const Context& dev_ctx,
 #if CUDA_VERSION >= 11070
   /* Step1: Forward: softmax{CSR} * value{Dense} -> out{Dense}, reuse */
   SparseCsrTensor dsoftmax;
-  CsrDenseMatmulGradKernel<T, Context>(
+  MatmulCsrDenseGradKernel<T, Context>(
       dev_ctx, softmax, value, dout, &dsoftmax, dvalue);
   /* Step2: Calculate grad of sdd_result, manualy not reuse */
paddle/phi/kernels/sparse/gpu/fused_attention_kernel.cu

@@ -263,7 +263,7 @@ void FusedAttentionCsrKernel(
   /* Step3: DSD Matmul, reuse */
   softmax->set_dims(phi::make_ddim({q_dim[0], q_dim[1], q_dim[2], q_dim[2]}));
-  CsrDenseMatmulKernel<T, Context>(dev_ctx, *softmax, value, out);
+  MatmulCsrDenseKernel<T, Context>(dev_ctx, *softmax, value, out);
 #else
   PADDLE_THROW(
       phi::errors::Unimplemented(
           "forward of 'sparse.nn.functional.attention' "
python/paddle/fluid/dygraph/varbase_patch_methods.py

@@ -866,15 +866,20 @@ def monkey_patch_varbase():
             return res
 
     @framework.dygraph_only
-    def cuda(self, device_id=0, blocking=True):
+    def cuda(self, device_id=None, blocking=True):
         if device_id is None:
-            device_id = 0
-        if not isinstance(device_id, int):
-            raise ValueError("\'device_id\' must be a positive integer")
-        if self.place.is_gpu_place():
+            res_place = framework._current_expected_place()
+            if not isinstance(res_place, core.CUDAPlace):
+                res_place = core.CUDAPlace(0)
+        elif isinstance(device_id, int):
+            res_place = core.CUDAPlace(device_id)
+        else:
+            raise ValueError("device_id must be int|None")
+
+        if self.place._equals(res_place):
             return self
         else:
-            res = self._copy_to(core.CUDAPlace(device_id), True)
+            res = self._copy_to(res_place, True)
             res.stop_gradient = self.stop_gradient
             res.persistable = self.persistable
             return res
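For readability, here is a minimal standalone sketch of the place-resolution logic introduced above; the helper name resolve_cuda_place is illustrative and not part of Paddle's API, and core.CUDAPlace is only available in a CUDA build.

from paddle.fluid import core, framework

def resolve_cuda_place(device_id):
    # Mirrors the new branching in cuda():
    #   None          -> reuse the current expected place if it is a CUDAPlace,
    #                    otherwise fall back to GPU 0
    #   int           -> that specific GPU
    #   anything else -> reject
    if device_id is None:
        place = framework._current_expected_place()
        if not isinstance(place, core.CUDAPlace):
            place = core.CUDAPlace(0)
        return place
    if isinstance(device_id, int):
        return core.CUDAPlace(device_id)
    raise ValueError("device_id must be int|None")

The added self.place._equals(res_place) check then short-circuits the copy: a tensor that already lives on the resolved place is returned as-is rather than duplicated.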
python/paddle/fluid/tests/unittests/test_var_base.py

@@ -34,7 +34,7 @@ class TestVarBase(unittest.TestCase):
     def func_test_to_tensor(self):
-        def _test_place(place):
+        def check_with_place(place):
             with fluid.dygraph.guard():
                 paddle.set_default_dtype('float32')
                 # set_default_dtype should not take effect on int
@@ -79,6 +79,7 @@ class TestVarBase(unittest.TestCase):
                 y = x.pin_memory()
                 self.assertEqual(y.place.__repr__(), "Place(gpu_pinned)")
                 y = x.cuda()
+                self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                 y = x.cuda(None)
                 self.assertEqual(y.place.__repr__(), "Place(gpu:0)")
                 y = x.cuda(device_id=0)
@@ -266,16 +267,16 @@ class TestVarBase(unittest.TestCase):
             with self.assertRaises(ValueError):
                 paddle.to_tensor([[1], [2, 3]], place=1)
 
-        _test_place(core.CPUPlace())
-        _test_place("cpu")
+        check_with_place(core.CPUPlace())
+        check_with_place("cpu")
         if core.is_compiled_with_cuda():
-            _test_place(core.CUDAPinnedPlace())
-            _test_place("gpu_pinned")
-            _test_place(core.CUDAPlace(0))
-            _test_place("gpu:0")
+            check_with_place(core.CUDAPinnedPlace())
+            check_with_place("gpu_pinned")
+            check_with_place(core.CUDAPlace(0))
+            check_with_place("gpu:0")
         if core.is_compiled_with_npu():
-            _test_place(core.NPUPlace(0))
-            _test_place("npu:0")
+            check_with_place(core.NPUPlace(0))
+            check_with_place("npu:0")
 
     def test_to_tensor(self):
         with _test_eager_guard():
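As a quick standalone check mirroring the assertions added above, a sketch that assumes a CUDA build whose current device is GPU 0:

import paddle

if paddle.is_compiled_with_cuda():
    x = paddle.to_tensor([1.0, 2.0, 3.0], place=paddle.CPUPlace())
    assert x.cuda().place.__repr__() == "Place(gpu:0)"             # device_id omitted
    assert x.cuda(None).place.__repr__() == "Place(gpu:0)"         # explicit None, same result
    assert x.cuda(device_id=0).place.__repr__() == "Place(gpu:0)"  # explicit id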