未验证 提交 0d8ddf9f 编写于 作者: Z zhangyikun02 提交者: GitHub

modify d2d copy to xpu::copy in xpu kernel, test=kunlun (#48710)

上级 57ad9b46
......@@ -417,8 +417,13 @@ void BufferedReader::ReadAsync(size_t i) {
// TODO(zhanghuan): the hardware does not support xpu_memcpy_async for now;
// maybe KL3 will.
if ((platform::is_xpu_place(cpu_place))) {
memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size);
platform::XPUStreamSync(stream_.get());
char *tmp = new char[size];
PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(
tmp, cpu_ptr, size, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
PADDLE_ENFORCE_XPU_SUCCESS(xpu_memcpy(
xpu_ptr, tmp, size, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
delete[] tmp;
} else {
memory::Copy(place_, xpu_ptr, cpu_place, cpu_ptr, size);
}
......
......@@ -169,19 +169,12 @@ void MemcpySyncD2D(void* dst,
const phi::XPUContext& dev_ctx) {
int dev_id = GetXPUCurrentDeviceId();
if (dst_place.device == dev_id && src_place.device == dev_id) {
dev_ctx.Wait();
char* tmp = new char[count];
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy(tmp, src, count, XPUMemcpyKind::XPU_DEVICE_TO_HOST));
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy(dst, tmp, count, XPUMemcpyKind::XPU_HOST_TO_DEVICE));
delete[] tmp;
// PADDLE_ENFORCE_XDNN_SUCCESS(
// baidu::xpu::api::copy(dev_ctx.x_context(),
// static_cast<const int8_t*>(src),
// static_cast<int8_t*>(dst),
// count),
// "copy ");
PADDLE_ENFORCE_XDNN_SUCCESS(
baidu::xpu::api::copy(dev_ctx.x_context(),
static_cast<const int8_t*>(src),
static_cast<int8_t*>(dst),
count),
"copy ");
} else {
PADDLE_ENFORCE_XPU_SUCCESS(
xpu_memcpy_peer(dst_place.device, dst, src_place.device, src, count));
......
......@@ -17,6 +17,9 @@
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#endif
namespace phi {
......@@ -29,6 +32,25 @@ void ReshapeGradKernel(const Context& dev_ctx,
x_grad->Resize(x_dims);
}
#ifdef PADDLE_WITH_XPU
// XPU specialization of ReshapeGradKernel.
// Allocates x_grad with out_grad's dtype, copies out_grad's buffer into it
// byte-wise through xpu::copy, and re-applies the dims x_grad had on entry.
template <>
void ReshapeGradKernel<phi::XPUContext>(const XPUContext& dev_ctx,
                                        const DenseTensor& out_grad,
                                        DenseTensor* x_grad) {
  // Remember the dims x_grad arrived with; they are re-applied at the end.
  auto saved_dims = x_grad->dims();
  dev_ctx.Alloc(x_grad, out_grad.dtype());

  // View both buffers as raw bytes so one copy call serves every dtype.
  auto* src_bytes = reinterpret_cast<const int8_t*>(out_grad.data());
  auto* dst_bytes = reinterpret_cast<int8_t*>(x_grad->data());
  // Byte count = element count * per-element size of out_grad's dtype.
  auto num_bytes =
      out_grad.numel() * paddle::experimental::SizeOf(out_grad.dtype());

  int status = xpu::copy(dev_ctx.x_context(), src_bytes, dst_bytes, num_bytes);
  PADDLE_ENFORCE_XDNN_SUCCESS(status, "copy");

  x_grad->Resize(saved_dims);
}
#endif
template <typename Context>
void ReshapeDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
......
......@@ -19,6 +19,9 @@
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#ifdef PADDLE_WITH_XPU
#include "paddle/phi/backends/xpu/enforce_xpu.h"
#endif
namespace phi {
......@@ -42,6 +45,33 @@ void ReshapeKernel(const Context& dev_ctx,
out->ResetLoD(x.lod());
}
#ifdef PADDLE_WITH_XPU
// XPU specialization of ReshapeKernel.
// Runs InferMetaFromVecValue on `out` for the requested shape, allocates
// `out` with x's dtype and, unless x is initialized and already shares its
// holder with `out`, copies x's buffer into `out` byte-wise via xpu::copy,
// then re-applies out's dims and takes over x's LoD.
template <>
void ReshapeKernel<phi::XPUContext>(const XPUContext& dev_ctx,
                                    const DenseTensor& x,
                                    const IntArray& shape,
                                    DenseTensor* out) {
  MetaTensor out_meta(out);
  InferMetaFromVecValue(x, shape.GetData(), &out_meta);

  // Evaluate the sharing check before Alloc, exactly where the data still
  // reflects the state the kernel was called with.
  const bool shares_holder = x.initialized() && x.Holder() == out->Holder();
  dev_ctx.Alloc(out, x.dtype());
  if (shares_holder) {
    // Input and output share one holder: no copy is performed.
    return;
  }

  auto out_dims = out->dims();
  // View both buffers as raw bytes so one copy call serves every dtype.
  auto* src_bytes = reinterpret_cast<const int8_t*>(x.data());
  auto* dst_bytes = reinterpret_cast<int8_t*>(out->data());
  // Byte count = element count * per-element size of x's dtype.
  auto num_bytes = x.numel() * paddle::experimental::SizeOf(x.dtype());

  int status = xpu::copy(dev_ctx.x_context(), src_bytes, dst_bytes, num_bytes);
  PADDLE_ENFORCE_XDNN_SUCCESS(status, "copy");

  out->Resize(out_dims);
  out->ResetLoD(x.lod());
}
#endif
template <typename Context>
void ReshapeWithXShape(const Context& dev_ctx,
const DenseTensor& x,
......
......@@ -30,7 +30,10 @@ void GatherNdKernel(const Context &ctx,
if (x.numel() == 0) return;
if (index.numel() == 0) {
phi::Copy(ctx, x, phi::XPUPlace(), true, out);
out->Resize(x.dims());
ctx.template Alloc<T>(out);
int r = xpu::copy(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
return;
}
......@@ -69,12 +72,7 @@ void GatherNdKernel(const Context &ctx,
x_vec,
index_shape);
}
PADDLE_ENFORCE_EQ(
ret,
XPU_SUCCESS,
phi::errors::External("XPU gather_nd kernel return wrong value[%d %s]",
ret,
XPUAPIErrorMsg[ret]));
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "gather_nd");
}
} // namespace phi
......
......@@ -372,16 +372,16 @@ void GenerateProposalsKernel(const Context& dev_ctx,
DenseTensor& proposals = tensor_pair.first;
DenseTensor& nscores = tensor_pair.second;
paddle::memory::Copy(place,
rpn_rois->data<T>() + num_proposals * 4,
place,
r = xpu::copy(dev_ctx.x_context(),
proposals.data<T>(),
sizeof(T) * proposals.numel());
paddle::memory::Copy(place,
rpn_roi_probs->data<T>() + num_proposals,
place,
rpn_rois->data<T>() + num_proposals * 4,
proposals.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
r = xpu::copy(dev_ctx.x_context(),
nscores.data<T>(),
sizeof(T) * scores.numel());
rpn_roi_probs->data<T>() + num_proposals,
nscores.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
if (dev_ctx.x_context()->xpu_stream) {
dev_ctx.Wait();
......
......@@ -27,7 +27,10 @@ void ScatterKernel(const Context &ctx,
const DenseTensor &updates,
bool overwrite,
DenseTensor *out) {
phi::Copy(ctx, x, ctx.GetPlace(), false, out);
out->Resize(x.dims());
ctx.template Alloc<T>(out);
int ret = xpu::copy(ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(ret, "copy");
// Apply ScatterUpdate: Out[index] = Updates[:]
const auto &index_type = index.dtype();
bool index_type_match =
......
......@@ -102,7 +102,11 @@ void TileKernel(const Context& dev_ctx,
std::vector<int64_t> temp(repeat_times.size(), 1);
if (repeat_times == temp) {
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(x.dims());
dev_ctx.template Alloc<T>(out);
int r =
xpu::copy(dev_ctx.x_context(), x.data<T>(), out->data<T>(), x.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "copy");
return;
}
......
......@@ -94,6 +94,8 @@ xpu_test_op_type_white_list = [
"c_embedding_float32", # unittests of collective ops do not using xpu testing framework
"c_sync_comm_stream_float32",
"c_sync_calc_stream_float32",
"reshape2_bool",
"reshape2_grad_bool",
]
xpu_test_device_op_white_list = []
xpu_test_device_op_type_white_list = []
......
......@@ -41,6 +41,7 @@ class XPUTestReshapeOp(XPUOpTestWrapper):
def setUp(self):
self.init_data()
self.op_type = "reshape2"
self.dtype = self.in_type
self.init_test_input()
self.init_test_output()
self.init_attrs()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册