Unverified commit 9a7b9eda, authored by zyfncg, committed by GitHub

[Pten] Refactor the copy kernel (#39731)

* remove SetAllocationForOutputTenosr

* add place param for copy kernel

* recover SetAllocationForOutputTenosr

* polish code

* fix empty_dev api bug

* test=allcases

* test=allcases

* fix bug

* recover empty

* recover modify
Parent 581b2c64
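For orientation before the diff: the core interface change is that `phi::Copy` now takes the destination `Place` explicitly instead of deriving it from the destination tensor, and every call site below is updated to pass one. A minimal caller-side sketch follows; the include paths and the CPU-context wrapper are assumptions for illustration, while the `Copy` signature itself is the one declared in this diff.

```cpp
// Illustrative sketch only: the include paths and the CPU-context setup are
// assumptions mirroring the call sites updated in this diff, not part of it.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"

void CopyWithExplicitPlace(const phi::CPUContext& dev_ctx,
                           const phi::DenseTensor& src,
                           phi::DenseTensor* dst) {
  // New signature: the destination place is an explicit argument, so the
  // kernel no longer has to read it from an already-allocated output tensor.
  phi::Copy(dev_ctx, src, dev_ctx.GetPlace(), /*blocking=*/false, dst);
}
```

Call sites in the diff pass either the context's own place (`dev_ctx.GetPlace()`) or an explicit target such as `phi::GPUPlace()` or `phi::CPUPlace()`.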
@@ -32,39 +32,33 @@ namespace paddle {
 namespace experimental {
 Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
-  // 1. Get kernel signature and kernel
   auto kernel_key_set = ParseKernelKeyByInputArgs(x);
   kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
   auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
   auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
       "copy", kernel_key);
-  VLOG(0) << "to API kernel key: " << kernel_key;
-  VLOG(0) << "to API kernel: " << kernel;
+  VLOG(6) << "copy API kernel key: " << kernel_key;
+  VLOG(6) << "copy API kernel: " << kernel;
-  // 2. Get Device Context
   auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
-  auto kernel_context = phi::KernelContext(dev_ctx);
+  auto dense_x = TensorToDenseTensor(x);
-  // 3. Auto data transform
-  auto dense_x = std::dynamic_pointer_cast<phi::DenseTensor>(x.impl());
-  kernel_context.EmplaceBackInput(dense_x.get());
-  kernel_context.EmplaceBackAttr(blocking);
-  // 4. Prepare outputs & InferMeta
-  auto dense_out = std::make_shared<phi::DenseTensor>(
-      phi::make_intrusive<paddle::experimental::SharedStorage>(
-          phi::TransToPtenPlace(backend)),
-      phi::DenseTensorMeta());
-  phi::MetaTensor meta_out(dense_out.get());
-  phi::UnchangedInferMeta(*dense_x, &meta_out);
-  dense_out->mutable_data(phi::TransToPtenPlace(backend));
-  kernel_context.EmplaceBackOutput(dense_out.get());
   Tensor out;
-  out.set_impl(dense_out);
+  auto kernel_out = SetKernelOutput(kernel_key.backend(), &out);
+  phi::MetaTensor meta_out(kernel_out);
+  phi::UnchangedInferMeta(*dense_x, &meta_out);
-  // 5. Call kernel
-  kernel(&kernel_context);
+  using kernel_signature = void (*)(const platform::DeviceContext&,
+                                    const phi::DenseTensor&,
+                                    phi::Place,
+                                    bool,
+                                    phi::DenseTensor*);
+  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
+  (*kernel_fn)(
+      *dev_ctx, *dense_x, phi::TransToPtenPlace(backend), blocking, kernel_out);
   return out;
 }
...
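The hunk above replaces the `phi::KernelContext` argument-packing with a statically typed variadic call: `copy_to_impl` names the full kernel signature, which now includes a `phi::Place` for the destination, and invokes the function pointer returned by `GetVariadicKernelFn` directly. The `Place` specialization added to the kernel-call helper in the next hunk is what lets registered kernels receive that attribute. As a self-contained toy analogy (plain C++, not Paddle's API), the dispatch pattern looks like this:

```cpp
// Toy analogy of the variadic kernel-dispatch pattern above: a type-erased
// registry hands back a function pointer, the caller restores the statically
// known signature and calls it directly instead of boxing arguments into a
// generic context object. Names here are illustrative, not Paddle's.
#include <cassert>
#include <string>
#include <unordered_map>

using generic_fn = void (*)();

// The "kernel": copies an int to dst; dst_place stands in for phi::Place.
void CopyKernel(const std::string& dst_place, int src, int* dst) {
  (void)dst_place;  // in the real kernel this selects the allocation path
  *dst = src;
}

int main() {
  // Type-erased registry, keyed by kernel name.
  std::unordered_map<std::string, generic_fn> registry;
  registry["copy"] = reinterpret_cast<generic_fn>(&CopyKernel);

  // Caller side: recover the typed signature and call through it.
  using kernel_signature = void (*)(const std::string&, int, int*);
  auto* kernel_fn = reinterpret_cast<kernel_signature>(registry.at("copy"));

  int out = 0;
  kernel_fn("CPUPlace", 42, &out);
  assert(out == 42);
  return 0;
}
```

Casting back to the original function-pointer type keeps the call well-defined; calling through a typed signature avoids the per-argument boxing that the removed `EmplaceBackInput`/`EmplaceBackAttr`/`EmplaceBackOutput` calls performed.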
@@ -245,6 +245,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout);
+  PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
   PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
...
@@ -22,6 +22,7 @@ namespace phi {
 template <typename Context>
 void Copy(const Context& dev_ctx,
           const DenseTensor& src,
+          Place dst_place,
           bool blocking,
           DenseTensor* dst);
 } // namespace phi
@@ -28,6 +28,7 @@ namespace phi {
 template <typename Context>
 void Copy(const Context& dev_ctx,
           const DenseTensor& src,
+          Place dst_place,
           bool blocking,
           DenseTensor* dst) {
   auto* src_ptr = src.data();
...
@@ -26,8 +26,8 @@ void FlattenGradKernel(const Context& dev_ctx,
                        DenseTensor* x_grad) {
   auto xshape_dims = xshape.dims();
   auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
-  phi::Copy(dev_ctx, out_grad, false, x_grad);
-  x_grad->ResizeAndAllocate(x_dims);
+  phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
+  x_grad->Resize(x_dims);
 }
 } // namespace phi
...
@@ -28,8 +28,8 @@ void FlattenKernel(const Context& dev_ctx,
                    int stop_axis,
                    DenseTensor* out) {
   auto out_dims = out->dims();
-  phi::Copy(dev_ctx, x, false, out);
-  out->ResizeAndAllocate(out_dims);
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  out->Resize(out_dims);
 }
 // TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
...
@@ -28,11 +28,11 @@ namespace phi {
 template <typename Context>
 void Copy(const Context& dev_ctx,
           const DenseTensor& src,
+          Place dst_place,
           bool blocking,
           DenseTensor* dst) {
   auto* src_ptr = src.data();
   const auto& src_place = src.place();
-  auto dst_place = dst->place();
   if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) {
     PADDLE_THROW(phi::errors::InvalidArgument(
@@ -43,8 +43,14 @@ void Copy(const Context& dev_ctx,
   VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
           << dst_place;
-  dst->ResizeAndAllocate(src.dims());
-  auto* dst_ptr = dst->mutable_data(dst_place);
+  dst->Resize(src.dims());
+  void* dst_ptr = nullptr;
+  if (paddle::platform::is_cpu_place(dst_place)) {
+    dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
+  } else {
+    dst_ptr = dev_ctx.Alloc(dst, src.dtype());
+  }
   if (src_ptr == dst_ptr && src_place == dst_place) {
     VLOG(3) << "Skip copy the same data async from " << src_place << " to "
@@ -57,17 +63,8 @@ void Copy(const Context& dev_ctx,
   auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
-  if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
-      paddle::platform::is_cuda_pinned_place(dst_place)) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
-             paddle::platform::is_cpu_place(dst_place)) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (paddle::platform::is_cpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_cuda_pinned_place(dst_place)) {
-    paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
-  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_cpu_place(dst_place)) {
+  if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
+      paddle::platform::is_cpu_place(dst_place)) {
     auto src_gpu_place = src_place;
     auto dst_cpu_place = dst_place;
     auto ctx_place = dev_ctx.GetPlace();
@@ -114,56 +111,6 @@ void Copy(const Context& dev_ctx,
             : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
     paddle::memory::Copy(
         dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
-  } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
-             paddle::platform::is_cuda_pinned_place(dst_place)) {
-    auto src_gpu_place = src_place;
-    auto dst_cuda_pinned_place = dst_place;
-    auto ctx_place = dev_ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
-                      true,
-                      phi::errors::PreconditionNotMet(
-                          "Device context place mismatch. When copying Tensor "
-                          "data from GPU memory to CUDA Pinned memory, current "
-                          "device context place should be GPU."));
-    auto ctx_gpu_place = ctx_place;
-    PADDLE_ENFORCE_EQ(src_gpu_place,
-                      ctx_gpu_place,
-                      phi::errors::PreconditionNotMet(
-                          "The source GPU device and current device context do "
-                          "not match. The source GPU device number is %d, but "
-                          "device context GPU number is %d.",
-                          src_gpu_place.device,
-                          ctx_gpu_place.device));
-    auto stream =
-        blocking ? nullptr
-                 : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
-        dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
-  } else if (paddle::platform::is_cuda_pinned_place(src_place) &&  // NOLINT
-             paddle::platform::is_gpu_place(dst_place)) {
-    auto src_cuda_pinned_place = src_place;
-    auto dst_gpu_place = dst_place;
-    auto ctx_place = dev_ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
-                      true,
-                      phi::errors::PreconditionNotMet(
-                          "Device context place mismatch. When copying Tensor "
-                          "data from CUDA Pinned memory to GPU memory, current "
-                          "device context place should be GPU."));
-    auto ctx_gpu_place = ctx_place;
-    PADDLE_ENFORCE_EQ(dst_gpu_place,
-                      ctx_gpu_place,
-                      phi::errors::PreconditionNotMet(
-                          "The target GPU device and current device context do "
-                          "not match. The target GPU device number is %d, but "
-                          "device context GPU number is %d.",
-                          dst_gpu_place.device,
-                          ctx_gpu_place.device));
-    auto stream =
-        blocking ? nullptr
-                 : reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
-    paddle::memory::Copy(
-        dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
   } else if (paddle::platform::is_gpu_place(src_place) &&  // NOLINT
              paddle::platform::is_gpu_place(dst_place)) {
     auto src_gpu_place = src_place;
...
@@ -1460,7 +1460,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
     auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
     if (dx->dims() == dout.dims()) {
       if (dx_data != dout_data) {
-        phi::Copy(ctx, dout, false, dx);
+        phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
       }
     } else {
       // For inplace strategy, dx will be stored in addr of dout, which makes
@@ -1481,7 +1481,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
     auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
     if (dy->dims() == dout.dims()) {
       if (dy_data != dout_data) {
-        phi::Copy(ctx, dout, false, dy);
+        phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
       }
     } else {
       std::vector<int> reduce_dims =
@@ -1507,11 +1507,11 @@ void elementwise_add_grad(const GPUContext &ctx,
   if (dx_data == dout_data && dy_data != dout_data) {
     VLOG(4) << "Special case when dx_data is the same as dout_data, "
                "only need copy dout to dy";
-    phi::Copy(ctx, dout, false, dy);
+    phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
   } else if (dx_data != dout_data && dy_data == dout_data) {
     VLOG(4) << "Special case when dy_data is the same as dout_data, "
                "only need copy dout to dx";
-    phi::Copy(ctx, dout, false, dx);
+    phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
   } else if (dx_data != dout_data && dy_data != dout_data) {
     auto size = x.numel();
     int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
@@ -1571,7 +1571,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
     auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
     if (dx->dims() == dout.dims()) {
       if (dx_data != dout_data) {
-        phi::Copy(ctx, dout, false, dx);
+        phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
       }
     } else {
       // For inplace strategy, dx will be stored in addr of dout, which makes
...
@@ -41,7 +41,7 @@ void FullKernel(const Context& dev_ctx,
                 DenseTensor* out) {
   out->Resize(phi::make_ddim(shape.GetData()));
   int numel = out->numel();
-  out->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(out);
   if (numel > 0) {
     // in transformer model the numel of outpout will be zero.
     std::vector<const DenseTensor*> inputs = {};
@@ -85,7 +85,7 @@ void FullLikeKernel(const Context& dev_ctx,
           static_cast<float>(value)));
   std::vector<const DenseTensor*> inputs = {};
   std::vector<DenseTensor*> outputs = {out};
-  out->mutable_data<T>(dev_ctx.GetPlace());
+  dev_ctx.template Alloc<T>(out);
   // This function has no input, so the inputs.size() == 0. Use kUnary, but the
   // data will not be loaded in the kernel because the number of parameters in
   // the operator is 0
...
@@ -36,12 +36,12 @@ void AddGradImpl(const Context& dev_ctx,
       x_grad->dims() == out_grad.dims()) {
     VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't "
                "reduce";
-    phi::Copy(dev_ctx, out_grad, false, x_grad);
+    phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
   } else if (x_grad == nullptr && y_grad != nullptr &&
              y_grad->dims() == out_grad.dims()) {
     VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't "
                "reduce";
-    phi::Copy(dev_ctx, out_grad, false, y_grad);
+    phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, y_grad);
   } else {
     grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis);
   }
...
@@ -88,7 +88,7 @@ void ExpandGradKernel(const Context& ctx,
   }
   // no need reduce, just copy
   if (just_copy) {
-    phi::Copy(ctx, out_grad, false, in_grad);
+    phi::Copy(ctx, out_grad, ctx.GetPlace(), false, in_grad);
   } else {
     PADDLE_ENFORCE_GE(dims,
                       1,
...
@@ -32,7 +32,7 @@ void SizeKernel(const Context& ctx,
     cpu_tensor.Resize(out->dims());
     auto cpu_data = ctx.template HostAlloc<int64_t>(&cpu_tensor);
     cpu_data[0] = input.numel();
-    phi::Copy(ctx, cpu_tensor, false, out);
+    phi::Copy(ctx, cpu_tensor, place, false, out);
   }
 }
...
@@ -24,7 +24,7 @@ void ReshapeGradKernel(const Context& dev_ctx,
                        const DenseTensor& out_grad,
                        DenseTensor* x_grad) {
   auto x_dims = x_grad->dims();
-  phi::Copy(dev_ctx, out_grad, false, x_grad);
+  phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
   x_grad->Resize(x_dims);
 }
...
@@ -36,7 +36,7 @@ void ReshapeKernel(const Context& dev_ctx,
   // TODO(chenweihang): the output dims are overwrite after copying,
   // here we need to use copy method that only copy data
   auto dims = out->dims();
-  phi::Copy(dev_ctx, x, false, out);
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
   out->Resize(dims);
   out->ResetLoD(x.lod());
 }
...
@@ -27,12 +27,19 @@ namespace phi {
 template <typename Context>
 void Copy(const Context& dev_ctx,
           const DenseTensor& src,
+          Place dst_place,
           bool blocking,
           DenseTensor* dst) {
   auto* src_ptr = src.data();
-  auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype());
+  void* dst_ptr = nullptr;
+  dst->Resize(src.dims());
+  if (paddle::platform::is_cpu_place(dst_place)) {
+    dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
+  } else {
+    dst_ptr = dev_ctx.Alloc(dst, src.dtype());
+  }
   const auto& src_place = src.place();
-  const auto& dst_place = dst->place();
   if (src_ptr == dst_ptr && src_place == dst_place) {
     VLOG(3) << "Skip copy the same data async from " << src_place << " to "
@@ -43,7 +50,7 @@ void Copy(const Context& dev_ctx,
   VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
           << dst_place;
-  dst->ResizeAndAllocate(src.dims());
   CHECK(dst->layout() == src.layout());
   auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
...
@@ -127,8 +127,8 @@ TEST(API, matmul_cuda) {
   auto place = paddle::platform::CUDAPlace();
   auto* dev_ctx = static_cast<const phi::GPUContext*>(pool.GetByPlace(place));
-  phi::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get());
-  phi::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get());
+  phi::Copy(*dev_ctx, *ref_x.get(), phi::GPUPlace(), false, dense_x.get());
+  phi::Copy(*dev_ctx, *ref_y.get(), phi::GPUPlace(), false, dense_y.get());
   paddle::experimental::Tensor x(dense_x);
   paddle::experimental::Tensor y(dense_y);
@@ -152,7 +152,7 @@ TEST(API, matmul_cuda) {
       phi::DenseTensorMeta(
           phi::DataType::FLOAT32, out.dims(), phi::DataLayout::NCHW));
-  phi::Copy(*dev_ctx, *dense_out.get(), false, ref_out.get());
+  phi::Copy(*dev_ctx, *dense_out.get(), phi::CPUPlace(), false, ref_out.get());
   for (size_t i = 0; i < 9; i++) {
     ASSERT_NEAR(sum[i], ref_out->data<float>()[i], 1e-6f);
...
@@ -62,7 +62,8 @@ TEST(DEV_API, copy) {
           .GetAllocator(paddle::platform::CPUPlace())
           .get());
   dev_ctx.Init();
-  phi::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
+  phi::Copy(
+      dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get());
   // 3. check result
   for (int64_t i = 0; i < dense_src->numel(); i++) {
...
@@ -39,7 +39,7 @@ TEST(DEV_API, empty) {
   dev_ctx.Init();
   // 2. test API
-  auto out = phi::Empty<float>(dev_ctx, {3, 2}, phi::DataType::INT32);
+  auto out = phi::Empty<int>(dev_ctx, {3, 2}, phi::DataType::INT32);
   // 3. check result
   ASSERT_EQ(out.dims().size(), 2);
...
@@ -53,8 +53,8 @@ inline void CheckResult(
       DenseTensorMeta(real_elements.dtype(),
                       real_elements.dims(),
                       real_elements.layout()));
-  phi::Copy(*dev_ctx_gpu, real_indices, true, &indices);
-  phi::Copy(*dev_ctx_gpu, real_elements, true, &elements);
+  phi::Copy(*dev_ctx_gpu, real_indices, indices.place(), true, &indices);
+  phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements);
   int cmp_indices = memcmp(indices.data<IndicesT>(),
                            non_zero_indices.data(),
@@ -122,7 +122,7 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
       cuda_alloc.get(),
       DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout()));
-  phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
+  phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x);
   auto sparse_out =
       sparse::DenseToSparseCoo<T>(dev_ctx_gpu, d_dense_x, sparse_dim);
   CheckResult<T, int64_t>(&dev_ctx_gpu,
@@ -327,9 +327,9 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
   phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
   phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
   phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
-  phi::Copy(dev_ctx_gpu, crows, true, &d_crows);
-  phi::Copy(dev_ctx_gpu, cols, true, &d_cols);
-  phi::Copy(dev_ctx_gpu, values, true, &d_values);
+  phi::Copy(dev_ctx_gpu, crows, d_crows.place(), true, &d_crows);
+  phi::Copy(dev_ctx_gpu, cols, d_cols.place(), true, &d_cols);
+  phi::Copy(dev_ctx_gpu, values, d_values.place(), true, &d_values);
   phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
   auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_gpu, d_csr);
   CheckResult<T, int64_t>(&dev_ctx_gpu,
@@ -406,9 +406,9 @@ inline void CheckCsrResult(
       DenseTensorMeta(real_elements.dtype(),
                       real_elements.dims(),
                       real_elements.layout()));
-  phi::Copy(*dev_ctx_gpu, real_crows, true, &crows);
-  phi::Copy(*dev_ctx_gpu, real_cols, true, &cols);
-  phi::Copy(*dev_ctx_gpu, real_elements, true, &elements);
+  phi::Copy(*dev_ctx_gpu, real_crows, crows.place(), true, &crows);
+  phi::Copy(*dev_ctx_gpu, real_cols, cols.place(), true, &cols);
+  phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements);
   int cmp_crows = memcmp(crows.data<IndicesT>(),
                          non_zero_crows.data(),
@@ -500,8 +500,8 @@ void TestCooToCsr(const DDim& dense_dims,
   dev_ctx_gpu.PartialInitWithAllocator();
   phi::DenseTensor d_indices(cuda_alloc.get(), indices_meta);
   phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
-  phi::Copy(dev_ctx_gpu, indices, true, &d_indices);
-  phi::Copy(dev_ctx_gpu, values, true, &d_values);
+  phi::Copy(dev_ctx_gpu, indices, phi::GPUPlace(), true, &d_indices);
+  phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values);
   phi::SparseCooTensor d_coo(d_indices, d_values, dense_dims);
   auto cuda_sparse_out = sparse::SparseCooToCsr<T>(dev_ctx_gpu, d_coo);
   CheckCsrResult<T, int64_t>(&dev_ctx_gpu,
@@ -593,7 +593,7 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x,
           .GetAllocator(phi::CPUPlace())
           .get());
   dev_ctx_gpu.PartialInitWithAllocator();
-  phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
+  phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x);
   auto sparse_out = sparse::DenseToSparseCsr<T>(dev_ctx_gpu, d_dense_x);
   CheckCsrResult<T, int64_t>(&dev_ctx_gpu,
@@ -720,8 +720,10 @@ void TestSparseCooToDense(const DDim& dense_dims,
   dev_ctx_gpu.PartialInitWithAllocator();
   DenseTensor d_dense_indices(cuda_alloc.get(), dense_indices.meta());
   DenseTensor d_dense_elements(cuda_alloc.get(), dense_elements.meta());
-  phi::Copy(dev_ctx_gpu, dense_indices, true, &d_dense_indices);
-  phi::Copy(dev_ctx_gpu, dense_elements, true, &d_dense_elements);
+  phi::Copy(
+      dev_ctx_gpu, dense_indices, phi::GPUPlace(), true, &d_dense_indices);
+  phi::Copy(
+      dev_ctx_gpu, dense_elements, phi::GPUPlace(), true, &d_dense_elements);
   SparseCooTensor coo_cuda(d_dense_indices, d_dense_elements, dense_dims);
   auto dense_out_cuda = sparse::SparseCooToDense<T>(dev_ctx_gpu, coo_cuda);
@@ -729,7 +731,8 @@ void TestSparseCooToDense(const DDim& dense_dims,
       DenseTensorMeta(dense_out_cuda.dtype(),
                       dense_out_cuda.dims(),
                       dense_out_cuda.layout()));
-  phi::Copy(dev_ctx_gpu, dense_out_cuda, true, &h_dense_out);
+  phi::Copy(
+      dev_ctx_gpu, dense_out_cuda, h_dense_out.place(), true, &h_dense_out);
   int cmp_cuda = memcmp(
       &dense_data[0], h_dense_out.data<T>(), sizeof(T) * dense_data.size());
   ASSERT_EQ(cmp_cuda, 0);
@@ -858,13 +861,13 @@ void TestSparseCsrToDense(const DDim& dense_dims,
   phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
   phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
   phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
-  phi::Copy(dev_ctx_gpu, crows, true, &d_crows);
-  phi::Copy(dev_ctx_gpu, cols, true, &d_cols);
-  phi::Copy(dev_ctx_gpu, values, true, &d_values);
+  phi::Copy(dev_ctx_gpu, crows, phi::GPUPlace(), true, &d_crows);
+  phi::Copy(dev_ctx_gpu, cols, phi::GPUPlace(), true, &d_cols);
+  phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values);
   phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
   auto cuda_sparse_out = sparse::SparseCsrToDense<T>(dev_ctx_gpu, d_csr);
   phi::DenseTensor h_out(alloc.get(), cpu_sparse_out.meta());
-  phi::Copy(dev_ctx_gpu, cuda_sparse_out, true, &h_out);
+  phi::Copy(dev_ctx_gpu, cuda_sparse_out, phi::CPUPlace(), true, &h_out);
   int cmp_cuda =
       memcmp(h_out.data<T>(), dense_data.data(), sizeof(T) * dense_data.size());
   ASSERT_EQ(cmp_cuda, 0);
...