Unverified commit 1f97d61c, authored by Leo Chen, committed by GitHub

Add callback after TensorCopy (#30123)

* change to tensor copy sync

* change to tensor copy sync

* make copy_to safe when using TensorCopy

* refine code

* add ut

* add cudapinned garbagecollector

* add testcase: cpu place -> cuda pinned place
Parent b2483d78
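The change makes the dygraph copy path safe when TensorCopy runs asynchronously. A minimal, hypothetical Python sketch of the guarantee (not part of this commit; it assumes a CUDA build and uses the internal _copy_to binding shown further below): with blocking=False the copy is only enqueued on a stream, so the source tensor must stay referenced until that copy finishes, which is exactly what the new callback provides.

# Hypothetical sketch, not part of the commit; assumes a CUDA build.
import numpy as np
import paddle
import paddle.fluid.core as core

def copy_then_drop_source():
    src = paddle.to_tensor(np.random.rand(1024, 1024).astype("float32"))
    # Non-blocking copy: only enqueues the transfer. Before this commit, the
    # buffer of src could be freed before the async copy finished; now a
    # callback keeps src referenced until the copy stream is done with it.
    dst = src._copy_to(paddle.CPUPlace(), False)
    return dst  # src loses its last Python reference here, which is now safe

if core.is_compiled_with_cuda():
    with paddle.fluid.dygraph.guard(core.CUDAPlace(0)):
        host_copy = copy_then_drop_source()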
@@ -107,6 +107,15 @@ void StreamGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback_manager_->AddCallback(callback);
}
CUDAPinnedGarbageCollector::CUDAPinnedGarbageCollector(
const platform::CUDAPinnedPlace &place, size_t max_memory_size)
: GarbageCollector(place, max_memory_size) {}
void CUDAPinnedGarbageCollector::ClearCallback(
const std::function<void()> &callback) {
callback();
}
#endif
int64_t GetEagerDeletionThreshold() {
......
@@ -119,6 +119,15 @@ class StreamGarbageCollector : public GarbageCollector {
cudaStream_t stream_;
std::unique_ptr<platform::StreamCallbackManager> callback_manager_;
};
class CUDAPinnedGarbageCollector : public GarbageCollector {
public:
CUDAPinnedGarbageCollector(const platform::CUDAPinnedPlace &place,
size_t max_memory_size);
protected:
void ClearCallback(const std::function<void()> &callback) override;
};
#endif
template <typename Container>
......
@@ -25,6 +25,7 @@
#include "paddle/fluid/imperative/infer_var_type_context.h"
#include "paddle/fluid/imperative/op_base.h"
#include "paddle/fluid/imperative/prepared_operator.h"
#include "paddle/fluid/imperative/tracer.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
@@ -231,9 +232,9 @@ std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
true, platform::errors::InvalidArgument(
"Variable is not initialized or Variable's type is not "
"LoDTensor or SelectedRows when getting numpy tensor"));
if (Var().IsType<framework::LoDTensor>()) {
auto& src_tensor = Var().Get<framework::LoDTensor>();
// TODO(Jiabin): change this after move unique_name generator to CXX
auto new_var = std::make_shared<VarBase>(
true, Name() + std::to_string(copied_counter_++));
@@ -252,10 +253,8 @@ std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
platform::DeviceContextPool::Instance().Get(src_place)->Wait();
}
}
if (platform::is_gpu_place(dst_place)) {
VLOG(3) << "copy tensor " << Name() << " from gpu";
}
VLOG(4) << "copy tensor " << Name() << " from " << Place() << " to "
<< dst_place;
return new_var;
} else {
auto& src_selected_rows = Var().Get<framework::SelectedRows>();
@@ -276,9 +275,8 @@ std::shared_ptr<VarBase> VarBase::NewVarBase(const platform::Place& dst_place,
}
dst_selected_rows->set_height(src_selected_rows.height());
dst_selected_rows->set_rows(src_selected_rows.rows());
if (platform::is_gpu_place(dst_place)) {
VLOG(3) << "copy selected rows " << Name() << " from gpu";
}
VLOG(4) << "copy tensor " << Name() << " from " << Place() << " to "
<< dst_place;
return new_var;
}
}
......
@@ -56,6 +56,78 @@ static void PassStopGradient(const NameVarBaseMap& outs, bool generate_grad) {
}
}
void IncreaseVarbaseReferenceCountUntilCopyComplete(
const std::shared_ptr<imperative::VarBase>& var,
const platform::Place& place) {
// Note(zhiqiu): Follow the logic of TensorCopy to determine the place where we
// need to add the callback; see tensor_utils.cc:245.
auto place_ = platform::is_gpu_place(place) ? place : var->Place();
auto tracer = imperative::GetCurrentTracer();
auto gc = tracer->MutableGarbageCollectorIfNotExists(place_);
// Note(zhiqiu): This callback is intentionally empty; its only purpose is to
// "reference" var, so var will not be destructed until the kernels launched
// on the current stream of the given place have finished.
auto callback = [var, place_]() {
VLOG(4) << "Run callback of var:" << var->Name() << " at place " << place_;
};
gc->DirectClearCallback(callback);
}
paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
const platform::Place& place) {
// If one does not exist yet, create a new GarbageCollector at the given place
if (gcs_.count(place) == 0) {
std::unique_ptr<framework::GarbageCollector> gc;
if (platform::is_gpu_place(place)) {
#ifdef PADDLE_WITH_CUDA
gc.reset(new framework::DefaultStreamGarbageCollector(
BOOST_GET_CONST(platform::CUDAPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CUDA device since it's not compiled with CUDA,"
"Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_cuda_pinned_place(place)) {
#ifdef PADDLE_WITH_CUDA
gc.reset(new framework::CUDAPinnedGarbageCollector(
BOOST_GET_CONST(platform::CUDAPinnedPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use CUDAPinned device since it's not compiled with "
"CUDA,"
"Please recompile or reinstall Paddle with GPU support."));
#endif
} else if (platform::is_xpu_place(place)) {
#if defined(PADDLE_WITH_XPU)
gc.reset(new framework::XPUGarbageCollector(
BOOST_GET_CONST(platform::XPUPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place;
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Paddle can't use XPU device since it's not compiled with XPU,"
"Please recompile or reinstall Paddle with XPU support."));
#endif
} else if (platform::is_cpu_place(place)) {
gc.reset(new framework::CPUGarbageCollector(
BOOST_GET_CONST(platform::CPUPlace, place), 0));
VLOG(10) << "Created GarbageCollector at " << place;
} else {
PADDLE_THROW(platform::errors::PreconditionNotMet(
"Unsupported place for garbage collection"));
}
gcs_.emplace(place, std::move(gc));
}
return gcs_.at(place).get();
}
void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins,
const NameVarBaseMap& outs, framework::AttributeMap attrs,
const platform::Place& place, bool trace_backward) {
......
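The two functions above carry the core mechanism: IncreaseVarbaseReferenceCountUntilCopyComplete selects the place whose stream performs the copy and registers an empty callback that captures the shared_ptr, while MutableGarbageCollectorIfNotExists lazily creates one GarbageCollector per place. A rough, hypothetical Python rendition of that control flow (the dict-based collector and the function name are illustrative only, not Paddle APIs):

# Hypothetical rendition of the control flow above; the dict stands in for the
# tracer's per-place GarbageCollector map and is not a real Paddle API.
_collectors = {}

def keep_alive_until_copy_complete(var, dst_place, src_place):
    # Follow TensorCopy's dispatch: a GPU destination copies on the destination
    # stream; otherwise the source place's stream performs the copy.
    place = dst_place if type(dst_place).__name__ == "CUDAPlace" else src_place
    pending = _collectors.setdefault(str(place), [])
    # The callback body is empty on purpose: capturing var is what keeps it
    # alive until the collector eventually drops the callback.
    pending.append(lambda keep=var: None)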
@@ -16,12 +16,14 @@
#include <atomic>
#include <future> // NOLINT
#include <map>
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "ThreadPool.h"
#include "paddle/fluid/framework/garbage_collector.h"
#include "paddle/fluid/imperative/basic_engine.h"
#include "paddle/fluid/imperative/jit/program_desc_tracer.h"
#include "paddle/fluid/imperative/layer.h"
@@ -30,6 +32,10 @@
namespace paddle {
namespace imperative {
using GarbageCollectorMap =
std::map<platform::Place,
std::unique_ptr<paddle::framework::GarbageCollector>>;
class UniqueNameGenerator {
public:
explicit UniqueNameGenerator(std::string prefix = "") : prefix_(prefix) {}
@@ -102,6 +108,9 @@ class Tracer {
bool IsAutoCastEnabled() const { return enable_autocast_; }
paddle::framework::GarbageCollector* MutableGarbageCollectorIfNotExists(
const platform::Place& place);
private:
std::unique_ptr<BasicEngine> basic_engine_;
std::unique_ptr<jit::ProgramDescTracer> program_desc_tracer_;
@@ -110,11 +119,15 @@ class Tracer {
platform::Place expected_place_;
bool has_grad_{true};
bool enable_autocast_{false};
GarbageCollectorMap gcs_;
};
// To access static variable current_tracer
const std::shared_ptr<Tracer>& GetCurrentTracer();
void SetCurrentTracer(const std::shared_ptr<Tracer>& tracer_);
void IncreaseVarbaseReferenceCountUntilCopyComplete(
const std::shared_ptr<imperative::VarBase>& var,
const platform::Place& place);
} // namespace imperative
} // namespace paddle
@@ -1060,21 +1060,52 @@ void BindImperative(py::module *m_ptr) {
)DOC")
.def("copy_", &imperative::VarBase::CopyFrom)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CPUPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
[](const std::shared_ptr<imperative::VarBase> &self,
const platform::CPUPlace &place, bool blocking) {
auto new_var = self->NewVarBase(place, blocking);
// Note(zhiqiu): Since NewVarBase may use GpuCopyAsync to
// copy data from the tensor of self to the tensor of the new varbase,
// we need to ensure that self is not destructed until the
// GpuCopyAsync is completed. Otherwise, the memory may be freed
// before the async copy finishes.
// To do that, we increase the reference count of self by 1 and
// register a callback through the place's GarbageCollector that runs
// after the GpuCopyAsync completes.
if (!blocking) {
IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
}
return new_var;
},
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self,
const platform::CUDAPinnedPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
[](const std::shared_ptr<imperative::VarBase> &self,
const platform::CUDAPinnedPlace &place, bool blocking) {
auto new_var = self->NewVarBase(place, blocking);
if (!blocking) {
IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
}
return new_var;
},
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::XPUPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
[](const std::shared_ptr<imperative::VarBase> &self,
const platform::XPUPlace &place, bool blocking) {
auto new_var = self->NewVarBase(place, blocking);
if (!blocking) {
IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
}
return new_var;
},
py::return_value_policy::copy)
.def("_copy_to",
[](const imperative::VarBase &self, const platform::CUDAPlace &place,
bool blocking) { return self.NewVarBase(place, blocking); },
[](const std::shared_ptr<imperative::VarBase> &self,
const platform::CUDAPlace &place, bool blocking) {
auto new_var = self->NewVarBase(place, blocking);
if (!blocking) {
IncreaseVarbaseReferenceCountUntilCopyComplete(self, place);
}
return new_var;
},
py::return_value_policy::copy)
.def("value", [](imperative::VarBase &self) { return self.MutableVar(); },
py::return_value_policy::reference)
......
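With self now passed as a shared_ptr, each non-blocking _copy_to registers the lifetime-extending callback before returning. A short, hypothetical usage of the CUDAPinnedPlace overload (not part of this commit; assumes a CUDA build), staging a GPU tensor into page-locked host memory without blocking the Python thread:

# Hypothetical usage, not part of the commit; assumes a CUDA build.
import numpy as np
import paddle
import paddle.fluid.core as core

if core.is_compiled_with_cuda():
    with paddle.fluid.dygraph.guard(core.CUDAPlace(0)):
        gpu_t = paddle.to_tensor(np.random.rand(256, 256).astype("float32"))
        # Enqueue the device-to-pinned-host copy and return immediately; the
        # registered callback keeps the source referenced until the copy
        # completes, so this stays safe even if gpu_t is dropped right away.
        pinned_t = gpu_t._copy_to(paddle.CUDAPinnedPlace(), False)
        print(pinned_t.place)  # CUDAPinnedPlace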
@@ -156,6 +156,24 @@ class TestVarBase(unittest.TestCase):
            _test_place(core.CUDAPlace(0))
            _test_place("gpu:0")

    def test_to_tensor_change_place(self):
        if core.is_compiled_with_cuda():
            a_np = np.random.rand(1024, 1024)
            with paddle.fluid.dygraph.guard(core.CPUPlace()):
                a = paddle.to_tensor(a_np, place=paddle.CUDAPinnedPlace())
                a = paddle.to_tensor(a)
                self.assertEqual(a.place.__repr__(), "CPUPlace")

            with paddle.fluid.dygraph.guard(core.CUDAPlace(0)):
                a = paddle.to_tensor(a_np, place=paddle.CUDAPinnedPlace())
                a = paddle.to_tensor(a)
                self.assertEqual(a.place.__repr__(), "CUDAPlace(0)")

            with paddle.fluid.dygraph.guard(core.CUDAPlace(0)):
                a = paddle.to_tensor(a_np, place=paddle.CPUPlace())
                a = paddle.to_tensor(a, place=paddle.CUDAPinnedPlace())
                self.assertEqual(a.place.__repr__(), "CUDAPinnedPlace")

    def test_to_variable(self):
        with fluid.dygraph.guard():
            var = fluid.dygraph.to_variable(self.array, name="abc")
......