Sync computation when Python invokes `run`

* Since the GPU is an asynchronous device by default, we should synchronize the computation when Python invokes `run`, so that Python gets the correct computation result.

Sync computation when Python invokes `run`
* Since the GPU is an asynchronous device by default, we should synchronize the computation when Python invokes `run`, so that Python gets the correct computation result.
ba1f5b5c · Yu Yang · 7d33447d · ba1f5b5c · ba1f5b5c
隐藏空白更改
内联并排

Showing 10 additions and 3 deletions

paddle/platform/device_context.h paddle/platform/device_context.h +3 -2

paddle/pybind/pybind.cc paddle/pybind/pybind.cc +7 -1

未找到文件。
--- a/paddle/platform/device_context.h
+++ b/paddle/platform/device_context.h
@@ -34,13 +34,14 @@ class DeviceContext {

  template <typename DeviceType>
  DeviceType* get_eigen_device() const;
+
+  virtual void Wait() const {}
 };

 class CPUDeviceContext : public DeviceContext {
 public:
  CPUDeviceContext();
  explicit CPUDeviceContext(CPUPlace place);
-  virtual ~CPUDeviceContext() {}

  Eigen::DefaultDevice* eigen_device() const;

@@ -59,7 +60,7 @@ class CUDADeviceContext : public DeviceContext {
  virtual ~CUDADeviceContext();

  /*! \brief  Wait for all operations completion in the stream. */
-  void Wait() const;
+  void Wait() const override;

  /*! \brief  Return place in the device context. */
  Place GetPlace() const override;

--- a/paddle/pybind/pybind.cc
+++ b/paddle/pybind/pybind.cc
@@ -238,7 +238,13 @@ All parameter, weight, gradient are variables in Paddle.
             return Backward(forwardOp, no_grad_vars).release();
           })
      .def("infer_shape", &OperatorBase::InferShape)
-      .def("run", &OperatorBase::Run)
+      .def("run",
+           [](OperatorBase &self,
+              const Scope &scope,
+              const platform::DeviceContext &dev_ctx) {
+             self.Run(scope, dev_ctx);
+             dev_ctx.Wait();
+           })
      .def("type",
           [](const OperatorBase &op) -> std::string { return op.Type(); })
      .def("outputs",