From 2c1bba02e48cfe753531b441bb8f7c3aa1ac8ff3 Mon Sep 17 00:00:00 2001
From: QingshuChen <qingshu.chen714@gmail.com>
Date: Wed, 13 Jan 2021 13:48:14 +0800
Subject: [PATCH] optimize memcpy perf for kunlun (#30291)

* optimize memcpy perf for kunlun

* remove useless unittest for kunlun mean

* minor
---
 paddle/fluid/memory/memcpy.cc                 | 15 ++---
 paddle/fluid/platform/device_context.cc       | 18 +++++-
 .../tests/unittests/xpu/test_mean_op_xpu.py   | 63 -------------------
 3 files changed, 25 insertions(+), 71 deletions(-)
diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc
index 10e8bb1f4a..b17da7f69a 100644
--- a/paddle/fluid/memory/memcpy.cc
+++ b/paddle/fluid/memory/memcpy.cc
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include <cstring>  // for memcpy
 
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
 
@@ -186,13 +187,13 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
             ret));
     free(tmp);
   } else {
-    int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
-    PADDLE_ENFORCE_EQ(
-        ret, XPU_SUCCESS,
-        platform::errors::External(
-            "XPU API return wrong value[%d], please check whether "
-            "Baidu Kunlun Card is properly installed.",
-            ret));
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.GetByPlace(src_place);
+    dev_ctx->Wait();
+    int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External(
+                                            "XPU API return wrong value[%d %s]",
+                                            ret, XPUAPIErrorMsg[ret]));
   }
 }
 #endif
diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc
index 57c5ccefae..fb94768984 100644
--- a/paddle/fluid/platform/device_context.cc
+++ b/paddle/fluid/platform/device_context.cc
@@ -172,7 +172,16 @@ Place CPUDeviceContext::GetPlace() const { return place_; }
 #ifdef PADDLE_WITH_XPU
 XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
 
-XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
+// Destroys the XPU API context created by the constructor.
+//
+// Teardown only: the destructor must not allocate an L3 buffer or
+// register it through context_->_l3_mgr. Once xpu::destroy_context()
+// has returned, context_ refers to a destroyed context and must not be
+// dereferenced, and a buffer obtained from xpu_malloc() at this point
+// would never be released. L3 setup belongs next to
+// xpu::create_context(), as done in the XPUDeviceContext(XPUPlace)
+// constructor below.
+XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
 
 XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
   int dev_id = -1;
@@ -189,6 +198,13 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
                         "Baidu Kunlun Card is properly installed.",
                         ret));
   context_ = xpu::create_context();
+  void* l3ptr = nullptr;
+  int l3_size = 13.5 * 1024 * 1024;
+  xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
+  if (l3ptr != nullptr) {
+    context_->_l3_mgr.set(l3ptr, l3_size);
+    std::cout << "set l3 size " << l3_size << std::endl;
+  }
   ret = xpu_set_device(dev_id);
   PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                     platform::errors::External(
diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
index 3ebdd110d3..bbdb0984ed 100644
--- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
+++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py
@@ -77,68 +77,5 @@ class TestXPUMeanOp(TestMeanOp):
             self.check_grad_with_place(place, ['X'], 'Out')
 
 
-class TestMeanAPI(unittest.TestCase):
-    # test paddle.tensor.stat.mean
-
-    def setUp(self):
-        self.x_shape = [2, 3, 4, 5]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.place = paddle.XPUPlace(0)
-
-    def test_api_static(self):
-        paddle.enable_static()
-        with paddle.static.program_guard(paddle.static.Program()):
-            x = paddle.fluid.data('X', self.x_shape)
-            out1 = paddle.mean(x)
-            out2 = paddle.tensor.mean(x)
-            out3 = paddle.tensor.stat.mean(x)
-            axis = np.arange(len(self.x_shape)).tolist()
-            out4 = paddle.mean(x, axis)
-            out5 = paddle.mean(x, tuple(axis))
-
-            exe = paddle.static.Executor(self.place)
-            res = exe.run(feed={'X': self.x},
-                          fetch_list=[out1, out2, out3, out4, out5])
-        out_ref = np.mean(self.x)
-        for out in res:
-            self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True)
-
-    def test_api_dygraph(self):
-        paddle.disable_static(self.place)
-
-        def test_case(x, axis=None, keepdim=False):
-            x_tensor = paddle.to_tensor(x)
-            out = paddle.mean(x_tensor, axis, keepdim)
-            if isinstance(axis, list):
-                axis = tuple(axis)
-                if len(axis) == 0:
-                    axis = None
-            out_ref = np.mean(x, axis, keepdims=keepdim)
-            self.assertEqual(
-                np.allclose(
-                    out.numpy(), out_ref, rtol=1e-04), True)
-
-        test_case(self.x)
-        test_case(self.x, [])
-        test_case(self.x, -1)
-        test_case(self.x, keepdim=True)
-        test_case(self.x, 2, keepdim=True)
-        test_case(self.x, [0, 2])
-        test_case(self.x, (0, 2))
-        test_case(self.x, [0, 1, 2, 3])
-        paddle.enable_static()
-
-    def test_errors(self):
-        paddle.disable_static()
-        x = np.random.uniform(-1, 1, [10, 12]).astype('float32')
-        x = paddle.to_tensor(x)
-        self.assertRaises(Exception, paddle.mean, x, -3)
-        self.assertRaises(Exception, paddle.mean, x, 2)
-        paddle.enable_static()
-        with paddle.static.program_guard(paddle.static.Program()):
-            x = paddle.fluid.data('X', [10, 12], 'int32')
-            self.assertRaises(TypeError, paddle.mean, x)
-
-
 if __name__ == "__main__":
     unittest.main()
-- 
GitLab