From cb73feeacf5213595930f865f591a82841fd7ca6 Mon Sep 17 00:00:00 2001
From: pangyoki <pangyoki@126.com>
Date: Wed, 7 Jul 2021 18:42:52 +0800
Subject: [PATCH] add Wait after TensorCopy (#34005)

---
 .../fluid/operators/uniform_random_op_npu.cc  | 19 ++++++++++---------
 .../npu/test_uniform_random_op_npu.py         |  2 +-
 .../paddle/fluid/tests/unittests/op_test.py   |  4 +++-
 3 files changed, 14 insertions(+), 11 deletions(-)
diff --git a/paddle/fluid/operators/uniform_random_op_npu.cc b/paddle/fluid/operators/uniform_random_op_npu.cc
index efd9d844fc..580c1f3e94 100644
--- a/paddle/fluid/operators/uniform_random_op_npu.cc
+++ b/paddle/fluid/operators/uniform_random_op_npu.cc
@@ -56,10 +56,13 @@ class NPUUniformRandomKernel : public framework::OpKernel<T> {
           "unsupport type: %s.",
           framework::ToTypeName(out_var->Type())));
     }
-    T *data = tensor->mutable_data<T>(ctx.GetPlace());
-
+    tensor->mutable_data<T>(ctx.GetPlace());
     int64_t size = tensor->numel();
-    std::unique_ptr<T[]> data_cpu(new T[size]);
+
+    Tensor cpu_tensor(tensor->type());
+    cpu_tensor.Resize(tensor->dims());
+    T *data_cpu = cpu_tensor.mutable_data<T>(platform::CPUPlace());
+
     std::uniform_real_distribution<T> dist(
         static_cast<T>(ctx.Attr<float>("min")),
         static_cast<T>(ctx.Attr<float>("max")));
@@ -90,12 +93,10 @@ class NPUUniformRandomKernel : public framework::OpKernel<T> {
     }
 
     // copy to NPU
-    auto stream =
-        ctx.template device_context<paddle::platform::NPUDeviceContext>()
-            .stream();
-    memory::Copy(BOOST_GET_CONST(platform::NPUPlace, ctx.GetPlace()), data,
-                 platform::CPUPlace(), reinterpret_cast<void *>(data_cpu.get()),
-                 size * sizeof(T), stream);
+    framework::TensorCopy(
+        cpu_tensor, ctx.GetPlace(),
+        ctx.template device_context<platform::DeviceContext>(), tensor);
+    ctx.template device_context<paddle::platform::NPUDeviceContext>().Wait();
   }
 };
 
diff --git a/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py b/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py
index 8c37f0a32a..7c358c244f 100644
--- a/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py
+++ b/python/paddle/fluid/tests/unittests/npu/test_uniform_random_op_npu.py
@@ -67,7 +67,7 @@ class TestNPUUniformRandomOp(OpTest):
         self.dtype = np.float32
 
     def test_check_output(self):
-        self.check_output_customized(self.verify_output)
+        self.check_output_customized(self.verify_output, self.place)
 
     def verify_output(self, outs):
         hist, prob = self.output_hist(np.array(outs[0]))
diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py
index f6de13b6fd..02f1dfbb9f 100644
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
@@ -1357,8 +1357,10 @@ class OpTest(unittest.TestCase):
             if self.op_type not in compile_vs_runtime_white_list.COMPILE_RUN_OP_WHITE_LIST:
                 self.check_compile_vs_runtime(fetch_list, outs)
 
-    def check_output_customized(self, checker):
+    def check_output_customized(self, checker, custom_place=None):
         places = self._get_places()
+        if custom_place:
+            places.append(custom_place)
         for place in places:
             outs = self.calc_output(place)
             outs = [np.array(out) for out in outs]
-- 
GitLab