未验证 提交 bdf5834e 作者: taixiurong 提交者: GitHub

update xpu_memcpy (#38049)

上级 9a4eec98
......@@ -353,8 +353,10 @@ void CheckVarHasNanOrInf(const std::string& op_type,
}
float* cpu_data = new float[tensor->numel()];
xpu_memcpy(cpu_data, tensor->data<float>(), tensor->numel() * sizeof(float),
XPU_DEVICE_TO_HOST);
memory::Copy(platform::CPUPlace(), static_cast<void*>(cpu_data),
BOOST_GET_CONST(platform::XPUPlace, tensor->place()),
static_cast<const void*>(tensor->data<float>()),
tensor->numel() * sizeof(float));
bool flag = false;
for (int i = 0; i < tensor->numel(); i++) {
if (isnan(cpu_data[i]) || isinf(cpu_data[i])) {
......
......@@ -136,6 +136,11 @@ void Copy<platform::CPUPlace, platform::XPUPlace>(platform::CPUPlace dst_place,
"Baidu Kunlun Card is properly installed.",
ret));
}
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
ret = xpu_memcpy(dst, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External(
......@@ -182,6 +187,11 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
"Baidu Kunlun Card is properly installed.",
ret));
void* tmp = malloc(num);
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
ret = xpu_memcpy(tmp, src, num, XPUMemcpyKind::XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(
ret, XPU_SUCCESS,
......@@ -214,8 +224,8 @@ void Copy<platform::XPUPlace, platform::XPUPlace>(platform::XPUPlace dst_place,
} else {
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.GetByPlace(src_place);
dev_ctx->Wait();
int ret = xpu::memcpy_device(dev_ctx->x_context(), dst, src, num);
int ret = xpu::copy(dev_ctx->x_context(), static_cast<const int8_t*>(src),
static_cast<int8_t*>(dst), num);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS, platform::errors::External(
"XPU API return wrong value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
......
......@@ -40,8 +40,10 @@ class CheckFiniteAndUnscaleXPUKernel : public framework::OpKernel<T> {
MPDType cpu_scale_data;
if (platform::is_xpu_place(scale->place())) {
xpu_memcpy(&cpu_scale_data, scale_data, sizeof(MPDType),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_scale_data),
BOOST_GET_CONST(platform::XPUPlace, scale->place()),
static_cast<const void*>(scale_data), sizeof(MPDType));
} else {
cpu_scale_data = (*scale_data);
}
......
......@@ -42,8 +42,10 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
const bool* found_inf_data = found_inf->data<bool>();
bool cpu_found_inf_data = false;
if (platform::is_xpu_place(found_inf->place())) {
xpu_memcpy(&cpu_found_inf_data, found_inf_data, sizeof(bool),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
memory::Copy(platform::CPUPlace(),
static_cast<void*>(&cpu_found_inf_data),
BOOST_GET_CONST(platform::XPUPlace, found_inf->place()),
static_cast<const void*>(found_inf_data), sizeof(bool));
} else {
cpu_found_inf_data = (*found_inf_data);
}
......@@ -94,22 +96,26 @@ class UpdateLossScalingXPUKernel : public framework::OpKernel<T> {
int cpu_good_in_data;
MPDType cpu_pre_loss_scaling_data;
if (platform::is_xpu_place(bad_in->place())) {
xpu_memcpy(&cpu_bad_in_data, bad_in_data, sizeof(int),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_bad_in_data),
BOOST_GET_CONST(platform::XPUPlace, bad_in->place()),
static_cast<const void*>(bad_in_data), sizeof(int));
} else {
cpu_bad_in_data = (*bad_in_data);
}
if (platform::is_xpu_place(good_in->place())) {
xpu_memcpy(&cpu_good_in_data, good_in_data, sizeof(int),
XPUMemcpyKind::XPU_DEVICE_TO_HOST);
memory::Copy(platform::CPUPlace(), static_cast<void*>(&cpu_good_in_data),
BOOST_GET_CONST(platform::XPUPlace, good_in->place()),
static_cast<const void*>(good_in_data), sizeof(int));
} else {
cpu_good_in_data = (*good_in_data);
}
if (platform::is_xpu_place(pre_loss_scaling->place())) {
xpu_memcpy(&cpu_pre_loss_scaling_data, pre_loss_scaling_data,
sizeof(MPDType), XPUMemcpyKind::XPU_DEVICE_TO_HOST);
memory::Copy(
platform::CPUPlace(), static_cast<void*>(&cpu_pre_loss_scaling_data),
BOOST_GET_CONST(platform::XPUPlace, pre_loss_scaling->place()),
static_cast<const void*>(pre_loss_scaling_data), sizeof(MPDType));
} else {
cpu_pre_loss_scaling_data = (*pre_loss_scaling_data);
}
......
......@@ -48,16 +48,9 @@ class MaskedSelectXPUKernel : public framework::OpKernel<T> {
"XPU nonzero_count kernel return wrong value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
ret = xpu_memcpy(static_cast<void*>(&out_size_cpu),
static_cast<const void*>(out_size), sizeof(int32_t),
XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External("XPU xpu_memcpy return wrong "
"value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
memory::Copy(platform::CPUPlace(), static_cast<void*>(&out_size_cpu),
BOOST_GET_CONST(platform::XPUPlace, mask->place()),
static_cast<void*>(out_size), sizeof(int32_t));
framework::DDim out_dim{out_size_cpu};
out->Resize(out_dim);
......
......@@ -50,12 +50,10 @@ class XPURangeKernel : public framework::OpKernel<T> {
out_cpu_data_ptr[i] = value;
value += step;
}
int ret = xpu_memcpy(out_data, out_cpu_data_ptr, out->numel() * sizeof(T),
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External("XPU xpu_memcpy return wrong "
"value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
static_cast<void*>(out_data), platform::CPUPlace(),
static_cast<void*>(out_cpu_data_ptr),
out->numel() * sizeof(T));
}
};
......
......@@ -43,16 +43,9 @@ class WhereIndexXPUKernel : public framework::OpKernel<T> {
"XPU nonzero_count kernel return wrong value[%d %s] in WhereIndex",
ret, XPUAPIErrorMsg[ret]));
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
}
ret = xpu_memcpy(static_cast<void*>(&true_num_cpu),
static_cast<const void*>(true_num), sizeof(int32_t),
XPU_DEVICE_TO_HOST);
PADDLE_ENFORCE_EQ(ret, XPU_SUCCESS,
platform::errors::External("XPU xpu_memcpy return wrong "
"value[%d %s]",
ret, XPUAPIErrorMsg[ret]));
memory::Copy(platform::CPUPlace(), static_cast<void*>(&true_num_cpu),
BOOST_GET_CONST(platform::XPUPlace, context.GetPlace()),
static_cast<void*>(true_num), sizeof(int32_t));
out->Resize(
framework::make_ddim({static_cast<int64_t>(true_num_cpu), rank}));
......
......@@ -307,8 +307,9 @@ void SetTensorFromPyArrayT(
platform::XPUDeviceGuard guard(
BOOST_GET_CONST(platform::XPUPlace, tmp_place).device);
auto dst = self->mutable_data<T>(place);
xpu_memcpy(dst, array.data(), array.nbytes(),
XPUMemcpyKind::XPU_HOST_TO_DEVICE);
memory::Copy(BOOST_GET_CONST(platform::XPUPlace, tmp_place),
static_cast<void *>(dst), platform::CPUPlace(),
static_cast<const void *>(array.data()), array.nbytes());
#else
PADDLE_THROW(platform::errors::PermissionDenied(
"Cannot use XPUPlace in CPU/GPU version, "
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册