From 4da467370f3be2e6336d51760fba9debb0304318 Mon Sep 17 00:00:00 2001
From: wanghuancoder
Date: Sun, 3 Apr 2022 15:39:41 +0800
Subject: [PATCH] [Eager] do not mutabledata when init (#41331)

* do not mutabledata when init, test=develop

* refine, test=develop

* fix copy_, test=develop

* refine, test=develop
---
 paddle/fluid/pybind/eager.cc                  |  7 ++---
 paddle/fluid/pybind/eager_method.cc           | 11 ++++++--
 .../test_cuda_max_memory_allocated.py         | 28 +++++++++++++++----
 .../unittests/test_cuda_memory_reserved.py    | 28 +++++++++++++++----
 4 files changed, 54 insertions(+), 20 deletions(-)

diff --git a/paddle/fluid/pybind/eager.cc b/paddle/fluid/pybind/eager.cc
index 5278f371dd4..657c79e7bd3 100644
--- a/paddle/fluid/pybind/eager.cc
+++ b/paddle/fluid/pybind/eager.cc
@@ -77,9 +77,6 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name,
           phi::make_intrusive(place),
           phi::DenseTensorMeta(paddle::framework::TransToPhiDataType(dtype),
                                ddims));
-  if (phi::product(ddims) > 0) {
-    dense_tensor->mutable_data(place);
-  }
   self->tensor.set_impl(dense_tensor);
 }
 
@@ -92,6 +89,7 @@ void EmptyTensorInitializer(TensorObject* self, const std::string& name,
 }
 
 void InitTensorWithNumpyValue(TensorObject* self, const py::object& array,
+                              const paddle::platform::Place& place,
                               bool zero_copy = false) {
   PADDLE_ENFORCE_EQ(
       self->tensor.defined(), true,
@@ -102,7 +100,6 @@ void InitTensorWithNumpyValue(TensorObject* self, const py::object& array,
           "eager tensor before init it with NumPy."));
   phi::DenseTensor* impl_ptr =
       static_cast<phi::DenseTensor*>(self->tensor.impl().get());
-  paddle::platform::Place place = impl_ptr->place();
   if (platform::is_cpu_place(place)) {
     SetTensorFromPyArray(impl_ptr, array, place, zero_copy);
   } else if (platform::is_xpu_place(place)) {
@@ -289,7 +286,7 @@ void AutoInitTensorByPyArray(TensorObject* py_tensor_ptr,
 
   EmptyTensorInitializer(py_tensor_ptr, act_name, place, persistable,
                          stop_gradient);
-  InitTensorWithNumpyValue(py_tensor_ptr, numpy_value, zero_copy);
+  InitTensorWithNumpyValue(py_tensor_ptr, numpy_value, place, zero_copy);
 }
 
 // initialize Tensor by Tensor or framework::Tensor (mix args and
diff --git a/paddle/fluid/pybind/eager_method.cc b/paddle/fluid/pybind/eager_method.cc
index d9face124bd..814243e0a57 100644
--- a/paddle/fluid/pybind/eager_method.cc
+++ b/paddle/fluid/pybind/eager_method.cc
@@ -330,17 +330,22 @@ static PyObject* tensor_method_copy_(TensorObject* self, PyObject* args,
   bool blocking = CastPyArg2AttrBoolean(PyTuple_GET_ITEM(args, 1), 1);
   VLOG(6) << "Start Copy Tensor " << src_tensor.name() << " to "
           << self->tensor.name();
-  if (!self->tensor.defined()) {
+  if (!self->tensor.initialized()) {
     egr::EagerUtils::autograd_meta(&(self->tensor))
         ->SetStopGradient(
             egr::EagerUtils::autograd_meta(&(src_tensor))->StopGradient());
     egr::EagerUtils::autograd_meta(&(self->tensor))
         ->SetPersistable(
             egr::EagerUtils::autograd_meta(&(src_tensor))->Persistable());
+    if (src_tensor.initialized()) {
+      self->tensor.copy_(src_tensor, src_tensor.inner_place(), blocking);
+    }
+  } else {
+    if (src_tensor.initialized()) {
+      self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking);
+    }
   }
 
-  self->tensor.copy_(src_tensor, self->tensor.inner_place(), blocking);
-
   VLOG(6) << "Finish Copy Tensor " << src_tensor.name() << " to "
           << self->tensor.name();
   Py_INCREF(Py_None);
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
index 51c9ba182ab..ae8bdeed1ef 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_max_memory_allocated.py
@@ -16,10 +16,11 @@ import paddle
 import unittest
 from paddle.fluid import core
 from paddle.device.cuda import device_count, memory_allocated, max_memory_allocated
+from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph
 
 
 class TestMaxMemoryAllocated(unittest.TestCase):
-    def test_max_memory_allocated(self, device=None):
+    def func_test_max_memory_allocated(self, device=None):
         if core.is_compiled_with_cuda():
             alloc_time = 100
             max_alloc_size = 10000
@@ -35,16 +36,26 @@ class TestMaxMemoryAllocated(unittest.TestCase):
             self.assertEqual(peak_memory_allocated_size,
                              max_memory_allocated(device))
 
-    def test_max_memory_allocated_for_all_places(self):
+    def test_max_memory_allocated(self):
+        with _test_eager_guard():
+            self.func_test_max_memory_allocated()
+        self.func_test_max_memory_allocated()
+
+    def func_test_max_memory_allocated_for_all_places(self):
         if core.is_compiled_with_cuda():
             gpu_num = device_count()
             for i in range(gpu_num):
                 paddle.device.set_device("gpu:" + str(i))
-                self.test_max_memory_allocated(core.CUDAPlace(i))
-                self.test_max_memory_allocated(i)
-                self.test_max_memory_allocated("gpu:" + str(i))
+                self.func_test_max_memory_allocated(core.CUDAPlace(i))
+                self.func_test_max_memory_allocated(i)
+                self.func_test_max_memory_allocated("gpu:" + str(i))
 
-    def test_max_memory_allocated_exception(self):
+    def test_max_memory_allocated_for_all_places(self):
+        with _test_eager_guard():
+            self.func_test_max_memory_allocated_for_all_places()
+        self.func_test_max_memory_allocated_for_all_places()
+
+    def func_test_max_memory_allocated_exception(self):
         if core.is_compiled_with_cuda():
             wrong_device = [
                 core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
@@ -56,6 +67,11 @@ class TestMaxMemoryAllocated(unittest.TestCase):
             with self.assertRaises(BaseException):
                 max_memory_allocated()
 
+    def test_max_memory_allocated_exception(self):
+        with _test_eager_guard():
+            self.func_test_max_memory_allocated_exception()
+        self.func_test_max_memory_allocated_exception()
+
 
 if __name__ == "__main__":
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py
index 149760de8b2..ca551ab4a3f 100644
--- a/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py
+++ b/python/paddle/fluid/tests/unittests/test_cuda_memory_reserved.py
@@ -17,26 +17,37 @@ import unittest
 import numpy as np
 from paddle.fluid import core
 from paddle.device.cuda import device_count, memory_reserved
+from paddle.fluid.framework import _test_eager_guard, _in_legacy_dygraph
 
 
 class TestMemoryreserved(unittest.TestCase):
-    def test_memory_reserved(self, device=None):
+    def func_test_memory_reserved(self, device=None):
         if core.is_compiled_with_cuda():
             tensor = paddle.zeros(shape=[256])
             alloc_size = 4 * 256  # 256 float32 data, with 4 bytes for each one
             memory_reserved_size = memory_reserved(device)
             self.assertEqual(memory_reserved_size, alloc_size)
 
-    def test_memory_reserved_for_all_places(self):
+    def test_memory_reserved(self):
+        with _test_eager_guard():
+            self.func_test_memory_reserved()
+        self.func_test_memory_reserved()
+
+    def func_test_memory_reserved_for_all_places(self):
         if core.is_compiled_with_cuda():
             gpu_num = device_count()
             for i in range(gpu_num):
                 paddle.device.set_device("gpu:" + str(i))
-                self.test_memory_reserved(core.CUDAPlace(i))
-                self.test_memory_reserved(i)
-                self.test_memory_reserved("gpu:" + str(i))
+                self.func_test_memory_reserved(core.CUDAPlace(i))
+                self.func_test_memory_reserved(i)
+                self.func_test_memory_reserved("gpu:" + str(i))
 
-    def test_memory_reserved_exception(self):
+    def test_memory_reserved_for_all_places(self):
+        with _test_eager_guard():
+            self.func_test_memory_reserved_for_all_places()
+        self.func_test_memory_reserved_for_all_places()
+
+    def func_test_memory_reserved_exception(self):
         if core.is_compiled_with_cuda():
             wrong_device = [
                 core.CPUPlace(), device_count() + 1, -2, 0.5, "gpu1", "npu"
@@ -48,6 +59,11 @@ class TestMemoryreserved(unittest.TestCase):
             with self.assertRaises(BaseException):
                 memory_reserved()
 
+    def test_memory_reserved_exception(self):
+        with _test_eager_guard():
+            self.func_test_memory_reserved_exception()
+        self.func_test_memory_reserved_exception()
+
 
 if __name__ == "__main__":
     unittest.main()
--
GitLab
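
A minimal sketch, not part of the patch, of the dual-mode test pattern the patch applies: each `func_test_*` body runs once under `_test_eager_guard()` (eager mode) and once more in legacy dygraph mode. The test class name `TestZerosAllocates` and its assertion are illustrative assumptions (they presume a CUDA build and a fresh process); only APIs already used in the patch appear here (`paddle.zeros`, `memory_allocated`, `paddle.device.set_device`, `_test_eager_guard`).

    import unittest

    import paddle
    from paddle.fluid import core
    from paddle.device.cuda import memory_allocated
    from paddle.fluid.framework import _test_eager_guard


    class TestZerosAllocates(unittest.TestCase):
        def func_test_zeros_allocates(self):
            # After this patch, constructing an empty eager tensor no longer
            # calls mutable_data, so device memory should only be allocated
            # once real data is written (e.g. by paddle.zeros).
            if core.is_compiled_with_cuda():
                paddle.device.set_device("gpu:0")
                before = memory_allocated()
                tensor = paddle.zeros(shape=[256])  # materializes storage
                self.assertGreater(memory_allocated(), before)

        def test_zeros_allocates(self):
            # Dual-mode pattern used throughout the patched tests:
            # run once under the eager guard, then once in legacy dygraph.
            with _test_eager_guard():
                self.func_test_zeros_allocates()
            self.func_test_zeros_allocates()


    if __name__ == "__main__":
        unittest.main()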