未验证 提交 9de42be2 编写于 作者: Q QingshuChen 提交者: GitHub

optimize memcpy perf for kunlun (#30291) (#30382)

* optimize memcpy perf for kunlun (#30291)

* optimize memcpy perf for kunlun

* remove useless unittest for kunlun mean

* minor

* fix bug that can't find mkldnn (kunlun) (#30394)
上级 1552343a
......@@ -9,7 +9,7 @@ endif()
cc_library(malloc SRCS malloc.cc DEPS
place enforce allocator_facade profiler ${MKLDNN_CTX_DEPS})
cc_library(memcpy SRCS memcpy.cc DEPS place)
cc_library(memcpy SRCS memcpy.cc DEPS place device_context)
cc_library(memory DEPS malloc memcpy)
......
......@@ -16,6 +16,7 @@ limitations under the License. */
#include <cstring> // for memcpy
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
......@@ -186,13 +187,13 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
ret));
free(tmp);
} else {
int ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_DEVICE);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
platform::errors::External(
"XPU API return wrong value[%d], please check whether "
"Baidu Kunlun Card is properly installed.",
ret));
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External(
"XPU API return wrong value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
}
}
#endif
......
......@@ -162,7 +162,16 @@ Place CPUDeviceContext::GetPlace() const { return place_; }
#ifdef PADDLE_WITH_XPU
// Default constructor: stores the handle returned by xpu::create_context()
// in context_.
XPUDeviceContext::XPUDeviceContext() {
  context_ = xpu::create_context();
}
// Destructor: passes the stored context_ handle to xpu::destroy_context().
XPUDeviceContext::~XPUDeviceContext() {
  xpu::destroy_context(context_);
}
// Destructor: passes the stored context_ handle to xpu::destroy_context().
//
// Teardown must not touch context_ once it has been handed to
// destroy_context(). The previous body went on to xpu_malloc() an L3 buffer
// and register it through context_->_l3_mgr *after* the destroy call, i.e. it
// dereferenced an already-destroyed context and allocated device memory that
// nothing could use afterwards. L3 buffer registration belongs next to
// xpu::create_context() at construction time, not here.
XPUDeviceContext::~XPUDeviceContext() {
  xpu::destroy_context(context_);
}
XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
int dev_id = -1;
......@@ -179,6 +188,13 @@ XPUDeviceContext::XPUDeviceContext(XPUPlace place) : place_(place) {
"Baidu Kunlun Card is properly installed.",
ret));
context_ = xpu::create_context();
void* l3ptr = nullptr;
int l3_size = 13.5 * 1024 * 1024;
xpu_malloc(static_cast<void**>(&l3ptr), l3_size, XPU_MEM_L3);
if (l3ptr != nullptr) {
context_->_l3_mgr.set(l3ptr, l3_size);
std::cout << "set l3 size " << l3_size << std::endl;
}
ret = xpu_set_device(dev_id);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
......
......@@ -77,68 +77,5 @@ class TestXPUMeanOp(TestMeanOp):
self.check_grad_with_place(place, ['X'], 'Out')
class TestMeanAPI(unittest.TestCase):
    """Checks paddle.mean (and its aliases) on XPUPlace(0) against np.mean."""

    # test paddle.tensor.stat.mean

    def setUp(self):
        # Random float32 input in [-1, 1) shared by the static/dygraph tests.
        self.x_shape = [2, 3, 4, 5]
        self.x = np.random.uniform(-1, 1, self.x_shape).astype(np.float32)
        self.place = paddle.XPUPlace(0)

    def test_api_static(self):
        """Static mode: all spellings of a full reduction agree with np.mean."""
        paddle.enable_static()
        with paddle.static.program_guard(paddle.static.Program()):
            x = paddle.fluid.data('X', self.x_shape)
            out1 = paddle.mean(x)
            out2 = paddle.tensor.mean(x)
            out3 = paddle.tensor.stat.mean(x)
            # Reducing over every axis (as list or tuple) must equal the
            # axis-less call.
            axis = np.arange(len(self.x_shape)).tolist()
            out4 = paddle.mean(x, axis)
            out5 = paddle.mean(x, tuple(axis))

            # exe.run() is given no explicit program, so it is kept inside
            # the guard where the program built above is the default one.
            exe = paddle.static.Executor(self.place)
            res = exe.run(feed={'X': self.x},
                          fetch_list=[out1, out2, out3, out4, out5])
        out_ref = np.mean(self.x)
        for out in res:
            self.assertTrue(np.allclose(out, out_ref, rtol=1e-04))

    def test_api_dygraph(self):
        """Dygraph mode: paddle.mean matches np.mean for several axis forms."""
        paddle.disable_static(self.place)
        try:

            def test_case(x, axis=None, keepdim=False):
                x_tensor = paddle.to_tensor(x)
                out = paddle.mean(x_tensor, axis, keepdim)
                # Translate the axis argument for numpy: lists become tuples,
                # and an empty list is mapped to None (reduce over all axes).
                if isinstance(axis, list):
                    axis = tuple(axis)
                    if len(axis) == 0:
                        axis = None
                out_ref = np.mean(x, axis, keepdims=keepdim)
                self.assertTrue(
                    np.allclose(
                        out.numpy(), out_ref, rtol=1e-04))

            test_case(self.x)
            test_case(self.x, [])
            test_case(self.x, -1)
            test_case(self.x, keepdim=True)
            test_case(self.x, 2, keepdim=True)
            test_case(self.x, [0, 2])
            test_case(self.x, (0, 2))
            test_case(self.x, [0, 1, 2, 3])
        finally:
            # Always switch back to static mode, even when an assertion
            # above fails, so the mode does not leak into other tests.
            paddle.enable_static()

    def test_errors(self):
        """Out-of-range axes and an int32 static input must raise."""
        paddle.disable_static()
        try:
            x = np.random.uniform(-1, 1, [10, 12]).astype('float32')
            x = paddle.to_tensor(x)
            # x is 2-D, so axis -3 and axis 2 are both out of range.
            self.assertRaises(Exception, paddle.mean, x, -3)
            self.assertRaises(Exception, paddle.mean, x, 2)
        finally:
            # Restore static mode regardless of the outcome above.
            paddle.enable_static()
        with paddle.static.program_guard(paddle.static.Program()):
            x = paddle.fluid.data('X', [10, 12], 'int32')
            self.assertRaises(TypeError, paddle.mean, x)
# Run the test cases when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册