Unverified commit 9de42be2, authored by QingshuChen, committed by GitHub

optimize memcpy perf for kunlun (#30291) (#30382)

* optimize memcpy perf for kunlun (#30291)

* optimize memcpy perf for kunlun

* remove unnecessary unit test for kunlun mean

* minor

* fix bug where mkldnn cannot be found (kunlun) (#30394)
Parent 1552343a
@@ -9,7 +9,7 @@ endif()
 cc_library(malloc SRCS malloc.cc DEPS
     place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
-cc_library(memcpy SRCS memcpy.cc DEPS place)
+cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
 cc_library(memory DEPS malloc memcpy)
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cstring>  // for memcpy
 
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -186,13 +187,13 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
             ret));
     free(tmp);
   } else {
-    int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
-    PADDLE_ENFORCE_EQ(
-        ret, XPU_SUCCESS,
-        platform::errors::External(
-            "XPU API return wrong value[%d], please check whether "
-            "Baidu Kunlun Card is properly installed.",
-            ret));
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.GetByPlace(src_place);
+    dev_ctx->Wait();
+    int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External(
+                                            "XPU API return wrong value[%d %s]",
+                                            ret, XPUAPIErrorMsg[ret]));
   }
 }
 #endif
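The device-to-device branch above no longer goes through the global xpu_memcpy call; it pulls the per-device XPUDeviceContext out of the DeviceContextPool, waits for in-flight work on that context, and then issues xpu::memcpy_device on its xpu::Context. This is also why the CMake hunk above adds device_context to the memcpy DEPS. Below is a minimal sketch of the resulting call pattern, assuming a Paddle build with PADDLE_WITH_XPU; the free-standing helper name is illustrative and not part of the commit.

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace memory {

// Illustrative helper (not part of the commit) mirroring the new branch.
void XpuDeviceToDeviceCopy(platform::XPUPlace src_place, void* dst,
                           const void* src, size_t num) {
  // Fetch the context that owns this device's XPU runtime handle.
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.GetByPlace(src_place);
  // Drain previously enqueued work before touching device memory directly.
  dev_ctx->Wait();
  // Issue the copy through the context rather than the global xpu_memcpy.
  int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                    platform::errors::External(
                        "XPU API return wrong value[%d %s]", ret,
                        XPUAPIErrorMsg[ret]));
}

}  // namespace memory
}  // namespace paddle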
......
@@ -162,7 +162,16 @@ Place CPUDeviceContext::GetPlace() const { return place_; }
 #ifdef PADDLE_WITH_XPU
 XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
 
-XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
+XPUDeviceContext::~XPUDeviceContext() {
+  xpu::destroy_context(context_);
+  void* l3ptr = nullptr;
+  int l3_size = 13.5 * 1024 * 1024;
+  xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
+  if (l3ptr != nullptr) {
+    context_->_l3_mgr.set(l3ptr, l3_size);
+    std::cout << "set l3 size " << l3_size << std::endl;
+  }
+}
 
 XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
   int dev_id = -1;
@@ -179,6 +188,13 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
                         "Baidu Kunlun Card is properly installed.",
                         ret));
   context_ = xpu::create_context();
+  void* l3ptr = nullptr;
+  int l3_size = 13.5 * 1024 * 1024;
+  xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
+  if (l3ptr != nullptr) {
+    context_->_l3_mgr.set(l3ptr, l3_size);
+    std::cout << "set l3 size " << l3_size << std::endl;
+  }
   ret = xpu_set_device(dev_id);
   PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                     platform::errors::External(
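The L3 setup blocks added above try to reserve an on-chip L3 scratch buffer (13.5 * 1024 * 1024 truncates to 14155776 bytes, roughly 13.5 MiB) and register it with the context's _l3_mgr, so XPU kernels running on that context can use the faster on-chip L3 memory. Below is a minimal sketch of that allocation pattern; the helper name and the header paths are assumptions for illustration, not part of the commit.

#include <iostream>

#include "xpu/runtime.h"  // assumed header for xpu_malloc / XPU_MEM_L3
#include "xpu/xdnn.h"     // assumed header for xpu::Context

// Illustrative helper: try to back an xpu::Context with on-chip L3 memory,
// mirroring the L3 setup blocks added in the hunks above.
void TrySetL3Cache(xpu::Context* ctx) {
  void* l3ptr = nullptr;
  // 13.5 * 1024 * 1024 truncates to 14155776 bytes (~13.5 MiB of L3).
  int l3_size = 13.5 * 1024 * 1024;
  // If L3 is exhausted the allocation simply fails and the context keeps
  // using global memory, so the return code is intentionally not enforced.
  xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
  if (l3ptr != nullptr) {
    ctx->_l3_mgr.set(l3ptr, l3_size);
    std::cout << "set l3 size " << l3_size << std::endl;
  }
}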
......
@@ -77,68 +77,5 @@ class TestXPUMeanOp(TestMeanOp):
         self.check_grad_with_place(place, ['X'], 'Out')
 
-
-class TestMeanAPI(unittest.TestCase):
-    # test paddle.tensor.stat.mean
-
-    def setUp(self):
-        self.x_shape = [2, 3, 4, 5]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.place = paddle.XPUPlace(0)
-
-    def test_api_static(self):
-        paddle.enable_static()
-        with paddle.static.program_guard(paddle.static.Program()):
-            x = paddle.fluid.data('X', self.x_shape)
-            out1 = paddle.mean(x)
-            out2 = paddle.tensor.mean(x)
-            out3 = paddle.tensor.stat.mean(x)
-            axis = np.arange(len(self.x_shape)).tolist()
-            out4 = paddle.mean(x, axis)
-            out5 = paddle.mean(x, tuple(axis))
-
-            exe = paddle.static.Executor(self.place)
-            res = exe.run(feed={'X': self.x},
-                          fetch_list=[out1, out2, out3, out4, out5])
-        out_ref = np.mean(self.x)
-        for out in res:
-            self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True)
-
-    def test_api_dygraph(self):
-        paddle.disable_static(self.place)
-
-        def test_case(x, axis=None, keepdim=False):
-            x_tensor = paddle.to_tensor(x)
-            out = paddle.mean(x_tensor, axis, keepdim)
-            if isinstance(axis, list):
-                axis = tuple(axis)
-                if len(axis) == 0:
-                    axis = None
-            out_ref = np.mean(x, axis, keepdims=keepdim)
-            self.assertEqual(
-                np.allclose(
-                    out.numpy(), out_ref, rtol=1e-04), True)
-
-        test_case(self.x)
-        test_case(self.x, [])
-        test_case(self.x, -1)
-        test_case(self.x, keepdim=True)
-        test_case(self.x, 2, keepdim=True)
-        test_case(self.x, [0, 2])
-        test_case(self.x, (0, 2))
-        test_case(self.x, [0, 1, 2, 3])
-        paddle.enable_static()
-
-    def test_errors(self):
-        paddle.disable_static()
-        x = np.random.uniform(-1, 1, [10, 12]).astype('float32')
-        x = paddle.to_tensor(x)
-        self.assertRaises(Exception, paddle.mean, x, -3)
-        self.assertRaises(Exception, paddle.mean, x, 2)
-        paddle.enable_static()
-
-        with paddle.static.program_guard(paddle.static.Program()):
-            x = paddle.fluid.data('X', [10, 12], 'int32')
-            self.assertRaises(TypeError, paddle.mean, x)
-
-
 if __name__ == "__main__":
     unittest.main()