Unverified commit 9a7b9eda, authored by zyfncg, committed by GitHub

[Pten] Refactor the copy kernel (#39731)

* remove SetAllocationForOutputTenosr

* add place param for copy kernel

* recover SetAllocationForOutputTenosr

* polish code

* fix empty_dev api bug

* test=allcases

* test=allcases

* fix bug

* recover empty

* recover modify
Parent 581b2c64
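
The central change in this commit is that `phi::Copy` now receives the destination `Place` explicitly from the caller instead of deriving it from the destination tensor. Below is a minimal, self-contained sketch of the new calling convention; `Place`, `DenseTensor`, and `Context` here are mocks for illustration, not the real phi types.

```cpp
// Minimal sketch of the new copy-kernel calling convention.
// Place, DenseTensor, and Context are mocks, not the real phi types.
#include <iostream>
#include <string>
#include <vector>

struct Place { std::string name; };
struct DenseTensor { std::vector<float> data; };
struct Context {
  Place place{"CPU"};
  Place GetPlace() const { return place; }
};

// New-style signature: the caller states where the copied data should land.
void Copy(const Context& dev_ctx, const DenseTensor& src, Place dst_place,
          bool blocking, DenseTensor* dst) {
  (void)dev_ctx;
  (void)blocking;
  std::cout << "copy " << src.data.size() << " elements to " << dst_place.name << "\n";
  dst->data = src.data;  // host-to-host copy in this mock
}

int main() {
  Context dev_ctx;
  DenseTensor src{{1.f, 2.f, 3.f}};
  DenseTensor dst;
  // Mirrors the updated call sites in this diff, e.g.
  //   phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
  Copy(dev_ctx, src, dev_ctx.GetPlace(), /*blocking=*/false, &dst);
  return 0;
}
```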
......@@ -32,39 +32,33 @@ namespace paddle {
namespace experimental {
Tensor copy_to_impl(const Tensor& x, Backend backend, bool blocking) {
// 1. Get kernel signature and kernel
auto kernel_key_set = ParseKernelKeyByInputArgs(x);
kernel_key_set.backend_set = kernel_key_set.backend_set | BackendSet(backend);
auto kernel_key = kernel_key_set.GetHigestPriorityKernelKey();
auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError(
"copy", kernel_key);
VLOG(0) << "to API kernel key: " << kernel_key;
VLOG(0) << "to API kernel: " << kernel;
VLOG(6) << "copy API kernel key: " << kernel_key;
VLOG(6) << "copy API kernel: " << kernel;
// 2. Get Device Context
auto* dev_ctx = GetDeviceContextByBackend(kernel_key.backend());
auto kernel_context = phi::KernelContext(dev_ctx);
// 3. Auto data transform
auto dense_x = std::dynamic_pointer_cast<phi::DenseTensor>(x.impl());
kernel_context.EmplaceBackInput(dense_x.get());
kernel_context.EmplaceBackAttr(blocking);
// 4. Prepare outputs & InferMeta
auto dense_out = std::make_shared<phi::DenseTensor>(
phi::make_intrusive<paddle::experimental::SharedStorage>(
phi::TransToPtenPlace(backend)),
phi::DenseTensorMeta());
phi::MetaTensor meta_out(dense_out.get());
phi::UnchangedInferMeta(*dense_x, &meta_out);
dense_out->mutable_data(phi::TransToPtenPlace(backend));
kernel_context.EmplaceBackOutput(dense_out.get());
auto dense_x = TensorToDenseTensor(x);
Tensor out;
out.set_impl(dense_out);
auto kernel_out = SetKernelOutput(kernel_key.backend(), &out);
phi::MetaTensor meta_out(kernel_out);
phi::UnchangedInferMeta(*dense_x, &meta_out);
// 5. Call kernel
kernel(&kernel_context);
using kernel_signature = void (*)(const platform::DeviceContext&,
const phi::DenseTensor&,
phi::Place,
bool,
phi::DenseTensor*);
auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
(*kernel_fn)(
*dev_ctx, *dense_x, phi::TransToPtenPlace(backend), blocking, kernel_out);
return out;
}
......
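
The refactored `copy_to_impl` no longer packs arguments into a `phi::KernelContext`; it recovers a typed function pointer via `GetVariadicKernelFn<kernel_signature>()` and calls the kernel directly. The sketch below illustrates that dispatch idea with a mock registry entry; the `Kernel` struct and its storage are assumptions, not the real `phi::KernelFactory`.

```cpp
// Hedged sketch of the variadic-kernel dispatch used above: the kernel is stored
// type-erased and the caller recovers a typed function pointer via a signature
// alias, then invokes it directly instead of packing a KernelContext.
#include <cassert>

struct DeviceContext {};
struct DenseTensor { int numel = 0; };
struct Place {};

// A concrete kernel with the same parameter order as the refactored copy kernel.
void CopyKernel(const DeviceContext&, const DenseTensor& src, Place, bool,
                DenseTensor* dst) {
  dst->numel = src.numel;
}

struct Kernel {
  void (*variadic_fn)() = nullptr;  // type-erased function pointer storage
  template <typename Fn>
  Fn GetVariadicKernelFn() const { return reinterpret_cast<Fn>(variadic_fn); }
};

int main() {
  Kernel kernel{reinterpret_cast<void (*)()>(&CopyKernel)};
  using kernel_signature = void (*)(const DeviceContext&, const DenseTensor&,
                                    Place, bool, DenseTensor*);
  auto* kernel_fn = kernel.GetVariadicKernelFn<kernel_signature>();
  DeviceContext dev_ctx;
  DenseTensor src{8};
  DenseTensor dst;
  (*kernel_fn)(dev_ctx, src, Place{}, /*blocking=*/false, &dst);
  assert(dst.numel == 8);
  return 0;
}
```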
......@@ -245,6 +245,7 @@ struct KernelImpl<Return (*)(DevCtx, Args...), kernel_fn> {
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const Scalar&);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataType);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(DataLayout);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int64_t>&);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const ScalarArray&);
PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(const std::vector<int>&);
......
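
The new `PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE(Place)` line registers `Place` as an attribute type the kernel-call helper can unpack. The sketch below shows the general pattern of one specialization per supported attribute type; `MockKernelContext`, `AttrFetcher`, and the local macro are illustration-only assumptions, not the real helper.

```cpp
// Hedged sketch: one specialization per supported attribute type, mirroring the
// idea behind PT_SPECIALIZE_KernelCallHelper_FOR_ATTRIBUTE.
#include <any>
#include <cstddef>
#include <iostream>
#include <vector>

struct Place { int device = 0; };

struct MockKernelContext {
  std::vector<std::any> attrs;
  template <typename T>
  T AttrAt(std::size_t i) const { return std::any_cast<T>(attrs[i]); }
};

// Primary template is left undefined: unsupported attribute types fail to compile,
// just as a type missing from the macro list would in the real helper.
template <typename T>
struct AttrFetcher;

#define SPECIALIZE_ATTR_FETCHER(attr_type)                                   \
  template <>                                                                \
  struct AttrFetcher<attr_type> {                                            \
    static attr_type Fetch(const MockKernelContext& ctx, std::size_t idx) {  \
      return ctx.AttrAt<attr_type>(idx);                                     \
    }                                                                        \
  }

SPECIALIZE_ATTR_FETCHER(bool);
SPECIALIZE_ATTR_FETCHER(Place);  // analogous to the Place line added in this diff

int main() {
  MockKernelContext ctx;
  ctx.attrs.emplace_back(Place{1});
  ctx.attrs.emplace_back(false);
  Place dst_place = AttrFetcher<Place>::Fetch(ctx, 0);
  bool blocking = AttrFetcher<bool>::Fetch(ctx, 1);
  std::cout << "device=" << dst_place.device << " blocking=" << blocking << "\n";
  return 0;
}
```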
......@@ -22,6 +22,7 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst);
} // namespace phi
......@@ -28,6 +28,7 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
......
......@@ -26,8 +26,8 @@ void FlattenGradKernel(const Context& dev_ctx,
DenseTensor* x_grad) {
auto xshape_dims = xshape.dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
phi::Copy(dev_ctx, out_grad, false, x_grad);
x_grad->ResizeAndAllocate(x_dims);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
x_grad->Resize(x_dims);
}
} // namespace phi
......
......@@ -28,8 +28,8 @@ void FlattenKernel(const Context& dev_ctx,
int stop_axis,
DenseTensor* out) {
auto out_dims = out->dims();
phi::Copy(dev_ctx, x, false, out);
out->ResizeAndAllocate(out_dims);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims);
}
// TODO(yuanrisheng): this kernel is for training and xshape is a Intermediate
......
......@@ -28,11 +28,11 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
const auto& src_place = src.place();
auto dst_place = dst->place();
if (src_place == dst_place && paddle::platform::is_cpu_place(src_place)) {
PADDLE_THROW(phi::errors::InvalidArgument(
......@@ -43,8 +43,14 @@ void Copy(const Context& dev_ctx,
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
dst->ResizeAndAllocate(src.dims());
auto* dst_ptr = dst->mutable_data(dst_place);
dst->Resize(src.dims());
void* dst_ptr = nullptr;
if (paddle::platform::is_cpu_place(dst_place)) {
dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
} else {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
}
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
......@@ -57,16 +63,7 @@ void Copy(const Context& dev_ctx,
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
paddle::platform::is_cuda_pinned_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_cpu_place(src_place) && // NOLINT
paddle::platform::is_cuda_pinned_place(dst_place)) {
paddle::memory::Copy(dst_place, dst_ptr, src_place, src_ptr, size);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_cpu_place(dst_place)) {
auto src_gpu_place = src_place;
auto dst_cpu_place = dst_place;
......@@ -114,56 +111,6 @@ void Copy(const Context& dev_ctx,
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_cuda_pinned_place(dst_place)) {
auto src_gpu_place = src_place;
auto dst_cuda_pinned_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
true,
phi::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from GPU memory to CUDA Pinned memory, current "
"device context place should be GPU."));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(src_gpu_place,
ctx_gpu_place,
phi::errors::PreconditionNotMet(
"The source GPU device and current device context do "
"not match. The source GPU device number is %d, but "
"device context GPU number is %d.",
src_gpu_place.device,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_cuda_pinned_place, dst_ptr, src_gpu_place, src_ptr, size, stream);
} else if (paddle::platform::is_cuda_pinned_place(src_place) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
auto src_cuda_pinned_place = src_place;
auto dst_gpu_place = dst_place;
auto ctx_place = dev_ctx.GetPlace();
PADDLE_ENFORCE_EQ(paddle::platform::is_gpu_place(ctx_place),
true,
phi::errors::PreconditionNotMet(
"Device context place mismatch. When copying Tensor "
"data from CUDA Pinned memory to GPU memory, current "
"device context place should be GPU."));
auto ctx_gpu_place = ctx_place;
PADDLE_ENFORCE_EQ(dst_gpu_place,
ctx_gpu_place,
phi::errors::PreconditionNotMet(
"The target GPU device and current device context do "
"not match. The target GPU device number is %d, but "
"device context GPU number is %d.",
dst_gpu_place.device,
ctx_gpu_place.device));
auto stream =
blocking ? nullptr
: reinterpret_cast<const phi::GPUContext&>(dev_ctx).stream();
paddle::memory::Copy(
dst_gpu_place, dst_ptr, src_cuda_pinned_place, src_ptr, size, stream);
} else if (paddle::platform::is_gpu_place(src_place) && // NOLINT
paddle::platform::is_gpu_place(dst_place)) {
auto src_gpu_place = src_place;
......
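
In the refactored GPU copy kernel above, the destination is resized to the source dims and then allocated through the device context: `HostAlloc` when `dst_place` is a CPU place, `Alloc` otherwise. A minimal sketch of that allocation choice follows; `MockContext` and `DenseTensor` are stand-ins, the real calls being `dev_ctx.HostAlloc` / `dev_ctx.Alloc`.

```cpp
// Hedged sketch of the allocation choice in the refactored copy kernel: resize the
// destination to the source dims, then allocate through the device context on the
// host for CPU destinations and on the device otherwise.
#include <cstdlib>
#include <iostream>

enum class PlaceKind { kCPU, kGPU };
struct Place { PlaceKind kind = PlaceKind::kCPU; };
struct DenseTensor { void* ptr = nullptr; std::size_t bytes = 0; };

struct MockContext {
  void* HostAlloc(DenseTensor* t, std::size_t bytes) const {
    t->ptr = std::malloc(bytes);  // pageable host memory
    t->bytes = bytes;
    return t->ptr;
  }
  void* Alloc(DenseTensor* t, std::size_t bytes) const {
    t->ptr = std::malloc(bytes);  // device memory in the real kernel; host stand-in here
    t->bytes = bytes;
    return t->ptr;
  }
};

void* AllocateForCopy(const MockContext& dev_ctx, DenseTensor* dst, Place dst_place,
                      std::size_t bytes) {
  // Mirrors the dst_place branch introduced in the GPU/XPU copy kernels.
  return dst_place.kind == PlaceKind::kCPU ? dev_ctx.HostAlloc(dst, bytes)
                                           : dev_ctx.Alloc(dst, bytes);
}

int main() {
  MockContext dev_ctx;
  DenseTensor dst;
  void* dst_ptr = AllocateForCopy(dev_ctx, &dst, Place{PlaceKind::kCPU}, 64);
  std::cout << "allocated " << dst.bytes << " bytes at " << dst_ptr << "\n";
  std::free(dst.ptr);
  return 0;
}
```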
......@@ -1460,7 +1460,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, false, dx);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
......@@ -1481,7 +1481,7 @@ void default_elementwise_add_grad(const GPUContext &ctx,
auto *dy_data = dy->mutable_data<T>(ctx.GetPlace());
if (dy->dims() == dout.dims()) {
if (dy_data != dout_data) {
phi::Copy(ctx, dout, false, dy);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
}
} else {
std::vector<int> reduce_dims =
......@@ -1507,11 +1507,11 @@ void elementwise_add_grad(const GPUContext &ctx,
if (dx_data == dout_data && dy_data != dout_data) {
VLOG(4) << "Special case when dx_data is the same as dout_data, "
"only need copy dout to dy";
phi::Copy(ctx, dout, false, dy);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dy);
} else if (dx_data != dout_data && dy_data == dout_data) {
VLOG(4) << "Special case when dy_data is the same as dout_data, "
"only need copy dout to dx";
phi::Copy(ctx, dout, false, dx);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
} else if (dx_data != dout_data && dy_data != dout_data) {
auto size = x.numel();
int vec_size = max(static_cast<int>(sizeof(float4) / sizeof(T)), 1);
......@@ -1571,7 +1571,7 @@ void default_elementwise_sub_grad(const GPUContext &ctx,
auto *dx_data = dx->mutable_data<T>(ctx.GetPlace());
if (dx->dims() == dout.dims()) {
if (dx_data != dout_data) {
phi::Copy(ctx, dout, false, dx);
phi::Copy(ctx, dout, ctx.GetPlace(), false, dx);
}
} else {
// For inplace strategy, dx will be stored in addr of dout, which makes
......
......@@ -41,7 +41,7 @@ void FullKernel(const Context& dev_ctx,
DenseTensor* out) {
out->Resize(phi::make_ddim(shape.GetData()));
int numel = out->numel();
out->mutable_data<T>(dev_ctx.GetPlace());
dev_ctx.template Alloc<T>(out);
if (numel > 0) {
// in transformer model the numel of outpout will be zero.
std::vector<const DenseTensor*> inputs = {};
......@@ -85,7 +85,7 @@ void FullLikeKernel(const Context& dev_ctx,
static_cast<float>(value)));
std::vector<const DenseTensor*> inputs = {};
std::vector<DenseTensor*> outputs = {out};
out->mutable_data<T>(dev_ctx.GetPlace());
dev_ctx.template Alloc<T>(out);
// This function has no input, so the inputs.size() == 0. Use kUnary, but the
// data will not be loaded in the kernel because the number of parameters in
// the operator is 0
......
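
The FullKernel/FullLikeKernel hunks above switch output allocation from `out->mutable_data<T>(dev_ctx.GetPlace())` to `dev_ctx.template Alloc<T>(out)`. The `template` disambiguator is needed because `Alloc` is a member template called on a dependent `Context` type. A hedged sketch with mock context and tensor types (names are assumptions, not the phi API):

```cpp
// Hedged sketch of the dev_ctx.template Alloc<T>(out) idiom adopted above; the point
// is the "template" disambiguator needed when Context is a dependent template parameter.
#include <cstddef>
#include <iostream>
#include <vector>

struct DenseTensor { std::vector<char> storage; };

struct MockContext {
  template <typename T>
  T* Alloc(DenseTensor* t, std::size_t n) const {
    t->storage.resize(n * sizeof(T));
    return reinterpret_cast<T*>(t->storage.data());
  }
};

template <typename T, typename Context>
void FullLikeSketch(const Context& dev_ctx, T value, std::size_t n, DenseTensor* out) {
  // "template" is required here because Context is a dependent type.
  T* data = dev_ctx.template Alloc<T>(out, n);
  for (std::size_t i = 0; i < n; ++i) data[i] = value;
}

int main() {
  MockContext dev_ctx;
  DenseTensor out;
  FullLikeSketch<float>(dev_ctx, 3.5f, 4, &out);
  std::cout << reinterpret_cast<float*>(out.storage.data())[0] << "\n";
  return 0;
}
```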
......@@ -36,12 +36,12 @@ void AddGradImpl(const Context& dev_ctx,
x_grad->dims() == out_grad.dims()) {
VLOG(4) << "Special case when y_grad is not needed and x_grad doesn't "
"reduce";
phi::Copy(dev_ctx, out_grad, false, x_grad);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
} else if (x_grad == nullptr && y_grad != nullptr &&
y_grad->dims() == out_grad.dims()) {
VLOG(4) << "Special case when x_grad is not needed and y_grad doesn't "
"reduce";
phi::Copy(dev_ctx, out_grad, false, y_grad);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, y_grad);
} else {
grad_func(dev_ctx, x, y, *out, out_grad, x_grad, y_grad, axis);
}
......
......@@ -88,7 +88,7 @@ void ExpandGradKernel(const Context& ctx,
}
// no need reduce, just copy
if (just_copy) {
phi::Copy(ctx, out_grad, false, in_grad);
phi::Copy(ctx, out_grad, ctx.GetPlace(), false, in_grad);
} else {
PADDLE_ENFORCE_GE(dims,
1,
......
......@@ -32,7 +32,7 @@ void SizeKernel(const Context& ctx,
cpu_tensor.Resize(out->dims());
auto cpu_data = ctx.template HostAlloc<int64_t>(&cpu_tensor);
cpu_data[0] = input.numel();
phi::Copy(ctx, cpu_tensor, false, out);
phi::Copy(ctx, cpu_tensor, place, false, out);
}
}
......
......@@ -24,7 +24,7 @@ void ReshapeGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
DenseTensor* x_grad) {
auto x_dims = x_grad->dims();
phi::Copy(dev_ctx, out_grad, false, x_grad);
phi::Copy(dev_ctx, out_grad, dev_ctx.GetPlace(), false, x_grad);
x_grad->Resize(x_dims);
}
......
......@@ -36,7 +36,7 @@ void ReshapeKernel(const Context& dev_ctx,
// TODO(chenweihang): the output dims are overwrite after copying,
// here we need to use copy method that only copy data
auto dims = out->dims();
phi::Copy(dev_ctx, x, false, out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(dims);
out->ResetLoD(x.lod());
}
......
......@@ -27,12 +27,19 @@ namespace phi {
template <typename Context>
void Copy(const Context& dev_ctx,
const DenseTensor& src,
Place dst_place,
bool blocking,
DenseTensor* dst) {
auto* src_ptr = src.data();
auto* dst_ptr = dev_ctx.Alloc(dst, src.dtype());
void* dst_ptr = nullptr;
dst->Resize(src.dims());
if (paddle::platform::is_cpu_place(dst_place)) {
dst_ptr = dev_ctx.HostAlloc(dst, src.dtype());
} else {
dst_ptr = dev_ctx.Alloc(dst, src.dtype());
}
const auto& src_place = src.place();
const auto& dst_place = dst->place();
if (src_ptr == dst_ptr && src_place == dst_place) {
VLOG(3) << "Skip copy the same data async from " << src_place << " to "
......@@ -43,7 +50,7 @@ void Copy(const Context& dev_ctx,
VLOG(3) << "TensorCopy " << src.dims() << " from " << src.place() << " to "
<< dst_place;
dst->ResizeAndAllocate(src.dims());
CHECK(dst->layout() == src.layout());
auto size = src.numel() * paddle::experimental::SizeOf(src.dtype());
......
......@@ -127,8 +127,8 @@ TEST(API, matmul_cuda) {
auto place = paddle::platform::CUDAPlace();
auto* dev_ctx = static_cast<const phi::GPUContext*>(pool.GetByPlace(place));
phi::Copy(*dev_ctx, *ref_x.get(), false, dense_x.get());
phi::Copy(*dev_ctx, *ref_y.get(), false, dense_y.get());
phi::Copy(*dev_ctx, *ref_x.get(), phi::GPUPlace(), false, dense_x.get());
phi::Copy(*dev_ctx, *ref_y.get(), phi::GPUPlace(), false, dense_y.get());
paddle::experimental::Tensor x(dense_x);
paddle::experimental::Tensor y(dense_y);
......@@ -152,7 +152,7 @@ TEST(API, matmul_cuda) {
phi::DenseTensorMeta(
phi::DataType::FLOAT32, out.dims(), phi::DataLayout::NCHW));
phi::Copy(*dev_ctx, *dense_out.get(), false, ref_out.get());
phi::Copy(*dev_ctx, *dense_out.get(), phi::CPUPlace(), false, ref_out.get());
for (size_t i = 0; i < 9; i++) {
ASSERT_NEAR(sum[i], ref_out->data<float>()[i], 1e-6f);
......
......@@ -62,7 +62,8 @@ TEST(DEV_API, copy) {
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
phi::Copy(dev_ctx, *(dense_src.get()), false, dense_dst.get());
phi::Copy(
dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get());
// 3. check result
for (int64_t i = 0; i < dense_src->numel(); i++) {
......
......@@ -39,7 +39,7 @@ TEST(DEV_API, empty) {
dev_ctx.Init();
// 2. test API
auto out = phi::Empty<float>(dev_ctx, {3, 2}, phi::DataType::INT32);
auto out = phi::Empty<int>(dev_ctx, {3, 2}, phi::DataType::INT32);
// 3. check result
ASSERT_EQ(out.dims().size(), 2);
......
......@@ -53,8 +53,8 @@ inline void CheckResult(
DenseTensorMeta(real_elements.dtype(),
real_elements.dims(),
real_elements.layout()));
phi::Copy(*dev_ctx_gpu, real_indices, true, &indices);
phi::Copy(*dev_ctx_gpu, real_elements, true, &elements);
phi::Copy(*dev_ctx_gpu, real_indices, indices.place(), true, &indices);
phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements);
int cmp_indices = memcmp(indices.data<IndicesT>(),
non_zero_indices.data(),
......@@ -122,7 +122,7 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
cuda_alloc.get(),
DenseTensorMeta(dense_x.dtype(), dense_x.dims(), dense_x.layout()));
phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x);
auto sparse_out =
sparse::DenseToSparseCoo<T>(dev_ctx_gpu, d_dense_x, sparse_dim);
CheckResult<T, int64_t>(&dev_ctx_gpu,
......@@ -327,9 +327,9 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
phi::Copy(dev_ctx_gpu, crows, true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, true, &d_cols);
phi::Copy(dev_ctx_gpu, values, true, &d_values);
phi::Copy(dev_ctx_gpu, crows, d_crows.place(), true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, d_cols.place(), true, &d_cols);
phi::Copy(dev_ctx_gpu, values, d_values.place(), true, &d_values);
phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_gpu, d_csr);
CheckResult<T, int64_t>(&dev_ctx_gpu,
......@@ -406,9 +406,9 @@ inline void CheckCsrResult(
DenseTensorMeta(real_elements.dtype(),
real_elements.dims(),
real_elements.layout()));
phi::Copy(*dev_ctx_gpu, real_crows, true, &crows);
phi::Copy(*dev_ctx_gpu, real_cols, true, &cols);
phi::Copy(*dev_ctx_gpu, real_elements, true, &elements);
phi::Copy(*dev_ctx_gpu, real_crows, crows.place(), true, &crows);
phi::Copy(*dev_ctx_gpu, real_cols, cols.place(), true, &cols);
phi::Copy(*dev_ctx_gpu, real_elements, elements.place(), true, &elements);
int cmp_crows = memcmp(crows.data<IndicesT>(),
non_zero_crows.data(),
......@@ -500,8 +500,8 @@ void TestCooToCsr(const DDim& dense_dims,
dev_ctx_gpu.PartialInitWithAllocator();
phi::DenseTensor d_indices(cuda_alloc.get(), indices_meta);
phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
phi::Copy(dev_ctx_gpu, indices, true, &d_indices);
phi::Copy(dev_ctx_gpu, values, true, &d_values);
phi::Copy(dev_ctx_gpu, indices, phi::GPUPlace(), true, &d_indices);
phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values);
phi::SparseCooTensor d_coo(d_indices, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCooToCsr<T>(dev_ctx_gpu, d_coo);
CheckCsrResult<T, int64_t>(&dev_ctx_gpu,
......@@ -593,7 +593,7 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x,
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx_gpu.PartialInitWithAllocator();
phi::Copy(dev_ctx_gpu, dense_x, true, &d_dense_x);
phi::Copy(dev_ctx_gpu, dense_x, phi::GPUPlace(), true, &d_dense_x);
auto sparse_out = sparse::DenseToSparseCsr<T>(dev_ctx_gpu, d_dense_x);
CheckCsrResult<T, int64_t>(&dev_ctx_gpu,
......@@ -720,8 +720,10 @@ void TestSparseCooToDense(const DDim& dense_dims,
dev_ctx_gpu.PartialInitWithAllocator();
DenseTensor d_dense_indices(cuda_alloc.get(), dense_indices.meta());
DenseTensor d_dense_elements(cuda_alloc.get(), dense_elements.meta());
phi::Copy(dev_ctx_gpu, dense_indices, true, &d_dense_indices);
phi::Copy(dev_ctx_gpu, dense_elements, true, &d_dense_elements);
phi::Copy(
dev_ctx_gpu, dense_indices, phi::GPUPlace(), true, &d_dense_indices);
phi::Copy(
dev_ctx_gpu, dense_elements, phi::GPUPlace(), true, &d_dense_elements);
SparseCooTensor coo_cuda(d_dense_indices, d_dense_elements, dense_dims);
auto dense_out_cuda = sparse::SparseCooToDense<T>(dev_ctx_gpu, coo_cuda);
......@@ -729,7 +731,8 @@ void TestSparseCooToDense(const DDim& dense_dims,
DenseTensorMeta(dense_out_cuda.dtype(),
dense_out_cuda.dims(),
dense_out_cuda.layout()));
phi::Copy(dev_ctx_gpu, dense_out_cuda, true, &h_dense_out);
phi::Copy(
dev_ctx_gpu, dense_out_cuda, h_dense_out.place(), true, &h_dense_out);
int cmp_cuda = memcmp(
&dense_data[0], h_dense_out.data<T>(), sizeof(T) * dense_data.size());
ASSERT_EQ(cmp_cuda, 0);
......@@ -858,13 +861,13 @@ void TestSparseCsrToDense(const DDim& dense_dims,
phi::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
phi::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
phi::DenseTensor d_values(cuda_alloc.get(), values_meta);
phi::Copy(dev_ctx_gpu, crows, true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, true, &d_cols);
phi::Copy(dev_ctx_gpu, values, true, &d_values);
phi::Copy(dev_ctx_gpu, crows, phi::GPUPlace(), true, &d_crows);
phi::Copy(dev_ctx_gpu, cols, phi::GPUPlace(), true, &d_cols);
phi::Copy(dev_ctx_gpu, values, phi::GPUPlace(), true, &d_values);
phi::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
auto cuda_sparse_out = sparse::SparseCsrToDense<T>(dev_ctx_gpu, d_csr);
phi::DenseTensor h_out(alloc.get(), cpu_sparse_out.meta());
phi::Copy(dev_ctx_gpu, cuda_sparse_out, true, &h_out);
phi::Copy(dev_ctx_gpu, cuda_sparse_out, phi::CPUPlace(), true, &h_out);
int cmp_cuda =
memcmp(h_out.data<T>(), dense_data.data(), sizeof(T) * dense_data.size());
ASSERT_EQ(cmp_cuda, 0);
......