Unverified commit 9de42be2, authored by QingshuChen, committed by GitHub

optimize memcpy perf for kunlun (#30291) (#30382)

* optimize memcpy perf for kunlun (#30291)

* optimize memcpy perf for kunlun

* remove unnecessary unit test for kunlun mean

* minor

* fix bug where mkldnn cannot be found (kunlun) (#30394)
Parent 1552343a
@@ -9,7 +9,7 @@ endif()
 cc_library(malloc SRCS malloc.cc DEPS
     place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
-cc_library(memcpy SRCS memcpy.cc DEPS place)
+cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
 cc_library(memory DEPS malloc memcpy)
......
@@ -16,6 +16,7 @@ limitations under the License. */
 #include <cstring>  // for memcpy
 
+#include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
@@ -186,13 +187,13 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
             ret));
     free(tmp);
   } else {
-    int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
-    PADDLE_ENFORCE_EQ(
-        ret, XPU_SUCCESS,
-        platform::errors::External(
-            "XPU API return wrong value[%d], please check whether "
-            "Baidu Kunlun Card is properly installed.",
-            ret));
+    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
+    auto* dev_ctx = pool.GetByPlace(src_place);
+    dev_ctx->Wait();
+    int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
+    PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External(
+                                            "XPU API return wrong value[%d %s]",
+                                            ret, XPUAPIErrorMsg[ret]));
   }
 }
 #endif
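The device-to-device branch above no longer goes through the global xpu_memcpy call; it pulls the per-device XPUDeviceContext out of the DeviceContextPool, waits for in-flight work on that context, and then issues xpu::memcpy_device on its xpu::Context. This is also why the CMake hunk above adds device_context to the memcpy DEPS. Below is a minimal sketch of the resulting call pattern, assuming a Paddle build with PADDLE_WITH_XPU; the free-standing helper name is illustrative and not part of the commit.

#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"

namespace paddle {
namespace memory {

// Illustrative helper (not part of the commit) mirroring the new branch.
void XpuDeviceToDeviceCopy(platform::XPUPlace src_place, void* dst,
                           const void* src, size_t num) {
  // Fetch the context that owns this device's XPU runtime handle.
  platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
  auto* dev_ctx = pool.GetByPlace(src_place);
  // Drain previously enqueued work before touching device memory directly.
  dev_ctx->Wait();
  // Issue the copy through the context rather than the global xpu_memcpy.
  int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
  PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                    platform::errors::External(
                        "XPU API return wrong value[%d %s]", ret,
                        XPUAPIErrorMsg[ret]));
}

}  // namespace memory
}  // namespace paddle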
......
@@ -162,7 +162,16 @@ Place CPUDeviceContext::GetPlace() const { return place_; }
 #ifdef PADDLE_WITH_XPU
 XPUDeviceContext::XPUDeviceContext() { context_ = xpu::create_context(); }
 
-XPUDeviceContext::~XPUDeviceContext() { xpu::destroy_context(context_); }
+XPUDeviceContext::~XPUDeviceContext() {
+  xpu::destroy_context(context_);
+  void* l3ptr = nullptr;
+  int l3_size = 13.5 * 1024 * 1024;
+  xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
+  if (l3ptr != nullptr) {
+    context_->_l3_mgr.set(l3ptr, l3_size);
+    std::cout << "set l3 size " << l3_size << std::endl;
+  }
+}
 
 XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
   int dev_id = -1;
@@ -179,6 +188,13 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
                         "Baidu Kunlun Card is properly installed.",
                         ret));
   context_ = xpu::create_context();
+  void* l3ptr = nullptr;
+  int l3_size = 13.5 * 1024 * 1024;
+  xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
+  if (l3ptr != nullptr) {
+    context_->_l3_mgr.set(l3ptr, l3_size);
+    std::cout << "set l3 size " << l3_size << std::endl;
+  }
   ret = xpu_set_device(dev_id);
   PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
                     platform::errors::External(
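The L3 setup blocks added above try to reserve an on-chip L3 scratch buffer (13.5 * 1024 * 1024 truncates to 14155776 bytes, roughly 13.5 MiB) and register it with the context's _l3_mgr, so XPU kernels running on that context can use the faster on-chip L3 memory. Below is a minimal sketch of that allocation pattern; the helper name and the header paths are assumptions for illustration, not part of the commit.

#include <iostream>

#include "xpu/runtime.h"  // assumed header for xpu_malloc / XPU_MEM_L3
#include "xpu/xdnn.h"     // assumed header for xpu::Context

// Illustrative helper: try to back an xpu::Context with on-chip L3 memory,
// mirroring the L3 setup blocks added in the hunks above.
void TrySetL3Cache(xpu::Context* ctx) {
  void* l3ptr = nullptr;
  // 13.5 * 1024 * 1024 truncates to 14155776 bytes (~13.5 MiB of L3).
  int l3_size = 13.5 * 1024 * 1024;
  // If L3 is exhausted the allocation simply fails and the context keeps
  // using global memory, so the return code is intentionally not enforced.
  xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
  if (l3ptr != nullptr) {
    ctx->_l3_mgr.set(l3ptr, l3_size);
    std::cout << "set l3 size " << l3_size << std::endl;
  }
}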
......
@@ -77,68 +77,5 @@ class TestXPUMeanOp(TestMeanOp):
         self.check_grad_with_place(place, ['X'], 'Out')
 
-
-class TestMeanAPI(unittest.TestCase):
-    # test paddle.tensor.stat.mean
-
-    def setUp(self):
-        self.x_shape = [2, 3, 4, 5]
-        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
-        self.place = paddle.XPUPlace(0)
-
-    def test_api_static(self):
-        paddle.enable_static()
-        with paddle.static.program_guard(paddle.static.Program()):
-            x = paddle.fluid.data('X', self.x_shape)
-            out1 = paddle.mean(x)
-            out2 = paddle.tensor.mean(x)
-            out3 = paddle.tensor.stat.mean(x)
-            axis = np.arange(len(self.x_shape)).tolist()
-            out4 = paddle.mean(x, axis)
-            out5 = paddle.mean(x, tuple(axis))
-
-            exe = paddle.static.Executor(self.place)
-            res = exe.run(feed={'X': self.x},
-                          fetch_list=[out1, out2, out3, out4, out5])
-        out_ref = np.mean(self.x)
-        for out in res:
-            self.assertEqual(np.allclose(out, out_ref, rtol=1e-04), True)
-
-    def test_api_dygraph(self):
-        paddle.disable_static(self.place)
-
-        def test_case(x, axis=None, keepdim=False):
-            x_tensor = paddle.to_tensor(x)
-            out = paddle.mean(x_tensor, axis, keepdim)
-            if isinstance(axis, list):
-                axis = tuple(axis)
-                if len(axis) == 0:
-                    axis = None
-            out_ref = np.mean(x, axis, keepdims=keepdim)
-            self.assertEqual(
-                np.allclose(
-                    out.numpy(), out_ref, rtol=1e-04), True)
-
-        test_case(self.x)
-        test_case(self.x, [])
-        test_case(self.x, -1)
-        test_case(self.x, keepdim=True)
-        test_case(self.x, 2, keepdim=True)
-        test_case(self.x, [0, 2])
-        test_case(self.x, (0, 2))
-        test_case(self.x, [0, 1, 2, 3])
-        paddle.enable_static()
-
-    def test_errors(self):
-        paddle.disable_static()
-        x = np.random.uniform(-1, 1, [10, 12]).astype('float32')
-        x = paddle.to_tensor(x)
-        self.assertRaises(Exception, paddle.mean, x, -3)
-        self.assertRaises(Exception, paddle.mean, x, 2)
-        paddle.enable_static()
-
-        with paddle.static.program_guard(paddle.static.Program()):
-            x = paddle.fluid.data('X', [10, 12], 'int32')
-            self.assertRaises(TypeError, paddle.mean, x)
-
-
 if __name__ == "__main__":
     unittest.main()