Unverified commit 5752643b, authored by zhangkaihuo, committed via GitHub

sparse conversion kernels support secondary dispatch (#43345)

* use GpuMemcpy and GpuMemset

* sparse convert kernels support double dispatch by indices dtype

* cudaMemcpyKind->gpuMemcpyKind
Parent commit: c4c30e6f
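The hunks below route each public conversion kernel through PD_VISIT_INTEGRAL_TYPES: after the usual dispatch on the value type T, a second dispatch picks the index integer type (int32 or int64) from the runtime dtype of the indices tensor and forwards to the corresponding *CPUKernel / *GPUKernel instantiation. A minimal self-contained sketch of that pattern follows, using illustrative stand-ins (IndexDtype, VisitIntegralType, Convert) rather than Paddle's real API:

```cpp
#include <cstdint>
#include <iostream>
#include <stdexcept>

// Simplified stand-in for the runtime dtype of an indices tensor.
enum class IndexDtype { kInt32, kInt64 };

// Simplified analogue of PD_VISIT_INTEGRAL_TYPES: turn the runtime dtype into
// a compile-time type and hand it to the callback.
template <typename Functor>
void VisitIntegralType(IndexDtype dtype, Functor&& f) {
  switch (dtype) {
    case IndexDtype::kInt32:
      f(int32_t{});
      break;
    case IndexDtype::kInt64:
      f(int64_t{});
      break;
    default:
      throw std::runtime_error("unsupported index dtype");
  }
}

// "Inner" kernel, templated on both the value type T and the index type IntT,
// mirroring SparseCsrToCooCPUKernel<T, IntT> in the diff below.
template <typename T, typename IntT>
void ConvertKernel(const IntT* indices, const T* values, int64_t nnz) {
  std::cout << "running with " << sizeof(IntT) * 8 << "-bit indices, nnz = "
            << nnz << "\n";
}

// "Outer" kernel: T is already fixed by the first (registry) dispatch; the
// second dispatch picks IntT from the indices dtype at runtime.
template <typename T>
void Convert(IndexDtype indices_dtype,
             const void* indices,
             const T* values,
             int64_t nnz) {
  VisitIntegralType(indices_dtype, [&](auto tag) {
    using IntT = decltype(tag);
    ConvertKernel<T, IntT>(static_cast<const IntT*>(indices), values, nnz);
  });
}

int main() {
  const int64_t idx[] = {0, 0, 1, 2, 2};
  const float vals[] = {1.f, 2.f, 3.f, 4.f, 5.f};
  Convert<float>(IndexDtype::kInt64, idx, vals, 5);
}
```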
@@ -67,6 +67,16 @@ DECLARE_CONSTANT_FOR_GPU(gpuErrorOutOfMemory,
 DECLARE_CONSTANT_FOR_GPU(gpuErrorNotReady, cudaErrorNotReady, hipErrorNotReady);
 DECLARE_CONSTANT_FOR_GPU(gpuSuccess, cudaSuccess, hipSuccess);
+DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice,
+                         cudaMemcpyKind::cudaMemcpyHostToDevice,
+                         hipMemcpyKind::hipMemcpyHostToDevice);
+DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToHost,
+                         cudaMemcpyKind::cudaMemcpyDeviceToHost,
+                         hipMemcpyKind::hipMemcpyDeviceToHost);
+DECLARE_CONSTANT_FOR_GPU(gpuMemcpyDeviceToDevice,
+                         cudaMemcpyKind::cudaMemcpyDeviceToDevice,
+                         hipMemcpyKind::hipMemcpyDeviceToDevice);
 
 #undef DECLARE_CONSTANT_FOR_GPU
 }  // namespace phi
......
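For context on the hunk above: DECLARE_CONSTANT_FOR_GPU maps a single gpu-prefixed name onto either the CUDA or the HIP constant depending on the build, which is what lets call sites later in this diff pass gpuMemcpyDeviceToHost and friends without an #ifdef. A plausible, simplified definition is sketched below; it is an assumption for illustration, not the verbatim phi macro.

```cpp
// Assumed, simplified shape of the macro; the real definition lives in phi's
// GPU backend headers and may differ in detail.
#ifdef PADDLE_WITH_HIP
#define DECLARE_CONSTANT_FOR_GPU(gpu_name, cuda_value, hip_value) \
  constexpr auto gpu_name = hip_value;
#else
#define DECLARE_CONSTANT_FOR_GPU(gpu_name, cuda_value, hip_value) \
  constexpr auto gpu_name = cuda_value;
#endif

// On a CUDA build,
//   DECLARE_CONSTANT_FOR_GPU(gpuMemcpyHostToDevice,
//                            cudaMemcpyKind::cudaMemcpyHostToDevice,
//                            hipMemcpyKind::hipMemcpyHostToDevice);
// expands to
//   constexpr auto gpuMemcpyHostToDevice = cudaMemcpyKind::cudaMemcpyHostToDevice;
```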
@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/phi/api/lib/utils/allocator.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/funcs/sparse/common_shape.h"
 
 namespace phi {
@@ -68,20 +69,23 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
                             SparseCooTensor* out) {
   const T* x_data = x.data<T>();
   const auto& x_dims = x.dims();
+  PADDLE_ENFORCE_LE(sparse_dim,
+                    x_dims.size(),
+                    phi::errors::InvalidArgument(
+                        "sparse_dim must be less than the size of x.dims()"));
+  PADDLE_ENFORCE_GT(
+      sparse_dim, 0, phi::errors::InvalidArgument("sparse_dim must be >0"));
   int64_t non_zero_num = GetNonZeroNum<T>(x, sparse_dim);
-  const auto place = dev_ctx.GetPlace();
   const auto values_dims =
       phi::funcs::sparse::InferDenseDims(x_dims, sparse_dim, non_zero_num);
-  DenseTensorMeta indices_meta(DataType::INT64,
-                               {sparse_dim, static_cast<int64_t>(non_zero_num)},
-                               DataLayout::NCHW);
   DenseTensorMeta values_meta(x.meta().dtype, values_dims, x.meta().layout);
-  phi::DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta));
+  phi::DenseTensor indices =
+      phi::Empty<int64_t>(dev_ctx, {sparse_dim, non_zero_num});
   phi::DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta));
-  int64_t* indices_data = indices.mutable_data<int64_t>(place);
-  T* values_data = values.mutable_data<T>(place);
+  int64_t* indices_data = indices.data<int64_t>();
+  T* values_data = values.data<T>();
 
   auto dims_2d = flatten_to_2d(x_dims, sparse_dim);
   const int rows = dims_2d[0];
@@ -102,8 +106,8 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
   out->SetMember(indices, values, x_dims, true);
 }
 
-template <typename T, typename Context>
-void SparseCsrToCooKernel(const Context& dev_ctx,
+template <typename T, typename IntT>
+void SparseCsrToCooCPUKernel(const CPUContext& dev_ctx,
                              const SparseCsrTensor& x,
                              SparseCooTensor* out) {
   const DDim& x_dims = x.dims();
@@ -111,27 +115,23 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
   const auto& csr_crows = x.non_zero_crows();
   const auto& csr_cols = x.non_zero_cols();
   const auto& csr_values = x.non_zero_elements();
-  const int64_t* csr_crows_data = csr_crows.data<int64_t>();
-  const int64_t* csr_cols_data = csr_cols.data<int64_t>();
+  const IntT* csr_crows_data = csr_crows.data<IntT>();
+  const IntT* csr_cols_data = csr_cols.data<IntT>();
   const T* csr_values_data = csr_values.data<T>();
 
   int64_t sparse_dim = 2;
   if (x_dims.size() == 3) {
     sparse_dim = 3;
   }
-  const auto place = dev_ctx.GetPlace();
-  DenseTensorMeta indices_meta(
-      DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW);
-  DenseTensorMeta values_meta(
-      x.dtype(), {non_zero_num}, x.non_zero_elements().layout());
-  phi::DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta));
-  phi::DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta));
-  int64_t* coo_indices = indices.mutable_data<int64_t>(place);
-  int64_t* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
-  int64_t* coo_rows_data =
+  phi::DenseTensor indices =
+      phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
+  phi::DenseTensor values = phi::Empty<T>(dev_ctx, {non_zero_num});
+  IntT* coo_indices = indices.data<IntT>();
+  IntT* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
+  IntT* coo_rows_data =
       x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num;
-  int64_t* coo_cols_data = coo_rows_data + non_zero_num;
-  T* coo_values_data = values.mutable_data<T>(place);
+  IntT* coo_cols_data = coo_rows_data + non_zero_num;
+  T* coo_values_data = values.data<T>();
 
   int batch = x_dims.size() == 2 ? 1 : x_dims[0];
   int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
@@ -139,7 +139,7 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
   int index = 0;
   for (int b = 0; b < batch; b++) {
     for (int i = 0; i < rows; i++) {
-      for (int j = csr_crows_data[b * (rows + 1) + i];
+      for (IntT j = csr_crows_data[b * (rows + 1) + i];
            j < csr_crows_data[b * (rows + 1) + i + 1];
            j++) {
         coo_rows_data[index] = i;
@@ -151,13 +151,23 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
     }
   }
-  memcpy(coo_cols_data, csr_cols_data, sizeof(int64_t) * non_zero_num);
+  memcpy(coo_cols_data, csr_cols_data, sizeof(IntT) * non_zero_num);
   memcpy(coo_values_data, csr_values_data, sizeof(T) * non_zero_num);
 
   out->SetMember(indices, values, x_dims, true);
 }
 
 template <typename T, typename Context>
-void SparseCooToCsrKernel(const Context& dev_ctx,
+void SparseCsrToCooKernel(const Context& dev_ctx,
+                          const SparseCsrTensor& x,
+                          SparseCooTensor* out) {
+  PD_VISIT_INTEGRAL_TYPES(
+      x.non_zero_crows().dtype(), "SparseCsrToCooCPUKernel", ([&] {
+        SparseCsrToCooCPUKernel<T, data_t>(dev_ctx, x, out);
+      }));
+}
+
+template <typename T, typename IntT>
+void SparseCooToCsrCPUKernel(const CPUContext& dev_ctx,
                              const SparseCooTensor& x,
                              SparseCsrTensor* out) {
   const auto& x_dims = x.dims();
@@ -174,11 +184,11 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
   phi::DenseTensor non_zero_crows;
   non_zero_crows.Resize({batchs * (rows + 1)});
-  int64_t* csr_crows_data = dev_ctx.template Alloc<int64_t>(&non_zero_crows);
+  IntT* csr_crows_data = dev_ctx.template Alloc<IntT>(&non_zero_crows);
 
   phi::DenseTensor non_zero_cols;
   non_zero_cols.Resize({non_zero_num});
-  int64_t* csr_cols_data = dev_ctx.template Alloc<int64_t>(&non_zero_cols);
+  IntT* csr_cols_data = dev_ctx.template Alloc<IntT>(&non_zero_cols);
 
   phi::DenseTensor non_zero_elements;
   non_zero_elements.Resize({non_zero_num});
@@ -186,16 +196,12 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
   const auto& coo_indices = x.non_zero_indices();
   const auto& coo_values = x.non_zero_elements();
-  const int64_t* batchs_ptr = coo_indices.data<int64_t>();
-  const int64_t* coo_rows_data =
+  const IntT* batchs_ptr = coo_indices.data<IntT>();
+  const IntT* coo_rows_data =
       batchs == 1 ? batchs_ptr : batchs_ptr + non_zero_num;
-  const int64_t* coo_cols_data = coo_rows_data + non_zero_num;
+  const IntT* coo_cols_data = coo_rows_data + non_zero_num;
   const T* coo_values_data = coo_values.data<T>();
-  if (!x.coalesced()) {
-    // TODO(zhangkahuo): call coalesced() to distinct and sort the indices
-  }
 
   std::vector<int64_t> offsets(batchs, 0);
   if (batchs > 1) {
     for (int i = 0; i < non_zero_num; i++) {
@@ -220,23 +226,32 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
       csr_crows_data[b * (rows + 1) + i] = 0;
     }
     for (int64_t i = 1; i < batch_non_zero_num; i++) {
-      for (int j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) {
+      for (IntT j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) {
         csr_crows_data[b * (rows + 1) + j + 1] = i;
       }
     }
-    for (int64_t i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1;
-         i++) {
+    for (IntT i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1; i++) {
       csr_crows_data[b * (rows + 1) + i] = batch_non_zero_num;
     }
   }
-  memcpy(csr_cols_data, coo_cols_data, sizeof(int64_t) * non_zero_num);
+  memcpy(csr_cols_data, coo_cols_data, sizeof(IntT) * non_zero_num);
   memcpy(csr_values_data, coo_values_data, sizeof(T) * non_zero_num);
 
   out->SetMember(non_zero_crows, non_zero_cols, non_zero_elements, x_dims);
 }
 
 template <typename T, typename Context>
-void SparseCooToDenseKernel(const Context& dev_ctx,
+void SparseCooToCsrKernel(const Context& dev_ctx,
+                          const SparseCooTensor& x,
+                          SparseCsrTensor* out) {
+  PD_VISIT_INTEGRAL_TYPES(
+      x.non_zero_indices().dtype(), "SparseCooToCsrCPUKernel", ([&] {
+        SparseCooToCsrCPUKernel<T, data_t>(dev_ctx, x, out);
+      }));
+}
+
+template <typename T, typename IntT>
+void SparseCooToDenseCPUKernel(const CPUContext& dev_ctx,
                                const SparseCooTensor& x,
                                DenseTensor* out) {
   const auto non_zero_num = x.nnz();
@@ -270,8 +285,7 @@ void SparseCooToDenseKernel(const Context& dev_ctx,
   for (auto i = 0; i < non_zero_num; i++) {
     int64_t index = 0;
     for (int j = 0; j < sparse_dim; j++) {
-      index +=
-          indices.data<int64_t>()[j * non_zero_num + i] * sparse_offsets[j];
+      index += indices.data<IntT>()[j * non_zero_num + i] * sparse_offsets[j];
     }
     for (int j = 0; j < base_offset; j++) {
@@ -280,6 +294,16 @@ void SparseCooToDenseKernel(const Context& dev_ctx,
   }
 }
 
+template <typename T, typename Context>
+void SparseCooToDenseKernel(const Context& dev_ctx,
+                            const SparseCooTensor& x,
+                            DenseTensor* out) {
+  PD_VISIT_INTEGRAL_TYPES(
+      x.non_zero_indices().dtype(), "SparseCooToDenseCPUKernel", ([&] {
+        SparseCooToDenseCPUKernel<T, data_t>(dev_ctx, x, out);
+      }));
+}
+
 }  // namespace sparse
 }  // namespace phi
......
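To make the CPU-side CSR-to-COO hunks above easier to follow, here is a standalone, single-batch sketch of the crows-expansion loop at the heart of SparseCsrToCooCPUKernel. The std::vector interface and function name are illustrative only; the kernel in the diff writes into pre-allocated DenseTensor buffers instead.

```cpp
#include <cstddef>
#include <cstdint>
#include <vector>

// Single-batch CSR -> COO row expansion: crows has rows + 1 entries and
// crows[i + 1] - crows[i] is the number of non-zeros stored in row i.
template <typename IntT>
std::vector<IntT> CsrCrowsToCooRows(const std::vector<IntT>& crows) {
  std::vector<IntT> coo_rows;
  const std::size_t rows = crows.size() - 1;
  for (std::size_t i = 0; i < rows; ++i) {
    for (IntT j = crows[i]; j < crows[i + 1]; ++j) {
      coo_rows.push_back(static_cast<IntT>(i));  // row index i, once per non-zero
    }
  }
  // COO cols and values are copied through unchanged, as in the diff above.
  return coo_rows;
}

// Example: crows = {0, 2, 3, 5} (3 rows, 5 non-zeros) yields
// coo_rows = {0, 0, 1, 2, 2}, matching the indices used in the test below.
```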
@@ -15,11 +15,12 @@ limitations under the License. */
 #include <thrust/execution_policy.h>
 #include <thrust/remove.h>
 
-#include "paddle/fluid/platform/enforce.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/backends/gpu/gpu_launch_config.h"
+#include "paddle/phi/core/enforce.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/core/tensor_meta.h"
+#include "paddle/phi/core/visit_type.h"
 #include "paddle/phi/kernels/funcs/sparse/common_shape.h"
 #include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
@@ -96,39 +97,33 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
                             SparseCooTensor* out) {
   const T* x_data = x.data<T>();
   const auto& x_dims = x.dims();
+  PADDLE_ENFORCE_LE(sparse_dim,
+                    x_dims.size(),
+                    phi::errors::InvalidArgument(
+                        "sparse_dim must be less than the size of x.dims()"));
+  PADDLE_ENFORCE_GT(
+      sparse_dim, 0, phi::errors::InvalidArgument("sparse_dim must be >0"));
   auto dims_2d = flatten_to_2d(x_dims, sparse_dim);
   const int rows = dims_2d[0];
   const int cols = dims_2d[1];
-  auto nums_meta =
-      phi::DenseTensorMeta(DataType::INT32, {1}, phi::DataLayout::NCHW);
-  DenseTensor nums = phi::Empty(dev_ctx, std::move(nums_meta));
-  auto x_dims_meta = phi::DenseTensorMeta(DataType::INT64,
-                                          {static_cast<int64_t>(x_dims.size())},
-                                          phi::DataLayout::NCHW);
-  DenseTensor d_x_dims = phi::Empty(dev_ctx, std::move(x_dims_meta));
-  const auto place = dev_ctx.GetPlace();
+  DenseTensor nums = phi::Empty<int32_t>(dev_ctx, {1});
+  DenseTensor d_x_dims = phi::Empty<int64_t>(dev_ctx, {x_dims.size()});
 
   // 1. get numbers of non zero elements, and get the index of non zero elements
-  int* nums_ptr = nums.mutable_data<int>(place);
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      hipMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream()));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(nums_ptr, 0, sizeof(int), dev_ctx.stream()));
-#endif
+  int* nums_ptr = nums.data<int>();
+  phi::backends::gpu::GpuMemsetAsync(
+      nums_ptr, 0, sizeof(int), dev_ctx.stream());
   auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
-  auto temp_indexs_meta =
-      phi::DenseTensorMeta(DataType::INT32, {rows}, phi::DataLayout::NCHW);
-  DenseTensor temp_indexs = phi::Empty(dev_ctx, std::move(temp_indexs_meta));
-  int* temp_indexs_ptr = temp_indexs.mutable_data<int>(place);
+  DenseTensor temp_indexs = phi::Empty<int32_t>(dev_ctx, {rows});
+  int* temp_indexs_ptr = temp_indexs.data<int>();
   GetNonZeroNums<<<config.block_per_grid.x,
                    config.thread_per_block.x,
                    0,
                    dev_ctx.stream()>>>(
       x_data, rows, cols, nums_ptr, temp_indexs_ptr);
 #ifdef PADDLE_WITH_HIP
   thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
 #else
@@ -140,35 +135,16 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
   // 2. copy non_zero_num to host, copy x_dims to device
   int non_zero_num = 0;
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(&non_zero_num,
-                                            nums_ptr,
-                                            sizeof(int),
-                                            hipMemcpyDeviceToHost,
-                                            dev_ctx.stream()));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(&non_zero_num,
-                                             nums_ptr,
-                                             sizeof(int),
-                                             cudaMemcpyDeviceToHost,
-                                             dev_ctx.stream()));
-#endif
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      hipMemcpyAsync(d_x_dims.mutable_data<int64_t>(place),
-                     x_dims.Get(),
-                     x_dims.size() * sizeof(x_dims[0]),
-                     hipMemcpyHostToDevice,
-                     dev_ctx.stream()));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemcpyAsync(d_x_dims.mutable_data<int64_t>(place),
-                      x_dims.Get(),
-                      x_dims.size() * sizeof(x_dims[0]),
-                      cudaMemcpyHostToDevice,
-                      dev_ctx.stream()));
-#endif
+  phi::backends::gpu::GpuMemcpyAsync(&non_zero_num,
+                                     nums_ptr,
+                                     sizeof(int),
+                                     gpuMemcpyDeviceToHost,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(d_x_dims.data<int64_t>(),
+                                     x_dims.Get(),
+                                     x_dims.size() * sizeof(x_dims[0]),
+                                     gpuMemcpyHostToDevice,
+                                     dev_ctx.stream());
 
   dev_ctx.Wait();  // wait the copy
@@ -197,20 +173,22 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
   out->SetMember(indices, values, x_dims, true);
 }
 
-__global__ void GetBatchSizes(const int64_t* crows,
+template <typename IntT>
+__global__ void GetBatchSizes(const IntT* crows,
                               const int rows,
                               const int batchs,
-                              int* batch_sizes) {
+                              IntT* batch_sizes) {
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   if (tid < batchs) {
     batch_sizes[tid] = crows[tid * (rows + 1) + rows];
   }
 }
 
-__global__ void ConvertCsrCrowsToCooRows(const int64_t* crows_ptr,
-                                         const int* crows_offsets,
-                                         int64_t* rows_ptr,
-                                         int64_t* batch_ptr,
+template <typename IntT>
+__global__ void ConvertCsrCrowsToCooRows(const IntT* crows_ptr,
+                                         const IntT* crows_offsets,
+                                         IntT* rows_ptr,
+                                         IntT* batch_ptr,
                                          const int rows) {
   const int b = blockIdx.y;
   const int64_t offset = crows_offsets ? crows_offsets[b] : 0;
@@ -227,8 +205,8 @@ __global__ void ConvertCsrCrowsToCooRows(const int64_t* crows_ptr,
   }
 }
 
-template <typename T, typename Context>
-void SparseCsrToCooKernel(const Context& dev_ctx,
+template <typename T, typename IntT>
+void SparseCsrToCooGPUKernel(const GPUContext& dev_ctx,
                              const SparseCsrTensor& x,
                              SparseCooTensor* out) {
   const DDim& x_dims = x.dims();
@@ -236,8 +214,8 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
   const auto& csr_crows = x.non_zero_crows();
   const auto& csr_cols = x.non_zero_cols();
   const auto& csr_values = x.non_zero_elements();
-  const int64_t* csr_crows_data = csr_crows.data<int64_t>();
-  const int64_t* csr_cols_data = csr_cols.data<int64_t>();
+  const IntT* csr_crows_data = csr_crows.data<IntT>();
+  const IntT* csr_cols_data = csr_cols.data<IntT>();
   const T* csr_values_data = csr_values.data<T>();
 
   int64_t sparse_dim = 2;
@@ -247,26 +225,20 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
   int batchs = x_dims.size() == 2 ? 1 : x_dims[0];
   int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
 
-  const auto place = dev_ctx.GetPlace();
-  DenseTensorMeta indices_meta(
-      DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW);
-  DenseTensorMeta values_meta(
-      x.dtype(), {non_zero_num}, x.non_zero_elements().layout());
-  DenseTensorMeta offsets_meta(DataType::INT32, {batchs}, DataLayout::NCHW);
-  DenseTensor indices = phi::Empty(dev_ctx, std::move(indices_meta));
-  DenseTensor values = phi::Empty(dev_ctx, std::move(values_meta));
-  DenseTensor offsets = phi::Empty(dev_ctx, std::move(offsets_meta));
-  int64_t* coo_indices = indices.mutable_data<int64_t>(place);
-  int64_t* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
-  int64_t* coo_rows_data =
+  DenseTensor indices = phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
+  DenseTensor values = phi::EmptyLike<T, GPUContext>(dev_ctx, csr_values);
+  DenseTensor offsets = phi::Empty<IntT>(dev_ctx, {batchs});
+  IntT* coo_indices = indices.data<IntT>();
+  IntT* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
+  IntT* coo_rows_data =
       x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num;
-  int64_t* coo_cols_data = coo_rows_data + non_zero_num;
-  int* offsets_ptr = batchs == 1 ? nullptr : offsets.mutable_data<int>(place);
-  T* coo_values_data = values.mutable_data<T>(place);
+  IntT* coo_cols_data = coo_rows_data + non_zero_num;
+  IntT* offsets_ptr = batchs == 1 ? nullptr : offsets.data<IntT>();
+  T* coo_values_data = values.data<T>();
   if (batchs > 1) {
     auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1);
-    GetBatchSizes<<<config.block_per_grid.x, config.thread_per_block.x>>>(
+    GetBatchSizes<IntT><<<config.block_per_grid.x, config.thread_per_block.x>>>(
         csr_crows_data, rows, batchs, offsets_ptr);
 #ifdef PADDLE_WITH_HIP
@@ -281,40 +253,38 @@ void SparseCsrToCooKernel(const Context& dev_ctx,
   auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
   config.block_per_grid.y = batchs;
-  ConvertCsrCrowsToCooRows<<<config.block_per_grid,
-                             config.thread_per_block.x>>>(
-      csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_cols_data,
-                                            csr_cols_data,
-                                            sizeof(int64_t) * non_zero_num,
-                                            hipMemcpyDeviceToDevice,
-                                            dev_ctx.stream()));
-  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_values_data,
-                                            csr_values_data,
-                                            sizeof(T) * non_zero_num,
-                                            hipMemcpyDeviceToDevice,
-                                            dev_ctx.stream()));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_cols_data,
-                                             csr_cols_data,
-                                             sizeof(int64_t) * non_zero_num,
-                                             cudaMemcpyDeviceToDevice,
-                                             dev_ctx.stream()));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_values_data,
-                                             csr_values_data,
-                                             sizeof(T) * non_zero_num,
-                                             cudaMemcpyDeviceToDevice,
-                                             dev_ctx.stream()));
-#endif
+  ConvertCsrCrowsToCooRows<IntT>
+      <<<config.block_per_grid, config.thread_per_block.x>>>(
+          csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);
+  phi::backends::gpu::GpuMemcpyAsync(coo_cols_data,
+                                     csr_cols_data,
+                                     sizeof(IntT) * non_zero_num,
+                                     gpuMemcpyDeviceToDevice,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(coo_values_data,
+                                     csr_values_data,
+                                     sizeof(T) * non_zero_num,
+                                     gpuMemcpyDeviceToDevice,
+                                     dev_ctx.stream());
 
   out->SetMember(indices, values, x_dims, true);
 }
 
-__global__ void GetBatchsOffset(const int64_t* batchs_ptr,
+template <typename T, typename Context>
+void SparseCsrToCooKernel(const Context& dev_ctx,
+                          const SparseCsrTensor& x,
+                          SparseCooTensor* out) {
+  PD_VISIT_INTEGRAL_TYPES(
+      x.non_zero_crows().dtype(), "SparseCsrToCooGPUKernel", ([&] {
+        SparseCsrToCooGPUKernel<T, data_t>(dev_ctx, x, out);
+      }));
+}
+
+template <typename IntT>
+__global__ void GetBatchsOffset(const IntT* batchs_ptr,
                                 const int non_zero_num,
-                                int64_t* batchs_offset) {
+                                IntT* batchs_offset) {
   int tid = threadIdx.x + blockIdx.x * blockDim.x;
   for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
     if (i == non_zero_num - 1 || batchs_ptr[i] != batchs_ptr[i + 1]) {
@@ -323,35 +293,36 @@ __global__ void GetBatchsOffset(const int64_t* batchs_ptr,
   }
 }
 
+template <typename IntT>
 __global__ void ConvertCooRowsToCsrCrows(
-    const int64_t* batchs_offset,  // can be null if batchs = 1
-    const int64_t* coo_rows_data,
-    int64_t* csr_crows_data,
+    const IntT* batchs_offset,  // can be null if batchs = 1
+    const IntT* coo_rows_data,
+    IntT* csr_crows_data,
     const int rows,
     const int64_t non_zero_num) {
   const int b = blockIdx.y;
   int batch_non_zero_num =
       batchs_offset == nullptr ? non_zero_num : batchs_offset[b];
   if (batch_non_zero_num == 0) return;
-  int batch_start = 0;
+  IntT batch_start = 0;
   if (b > 0) {
     batch_start = batchs_offset[b - 1];
     batch_non_zero_num -= batch_start;
   }
-  auto* coo_rows_ptr = coo_rows_data + batch_start;
+  const IntT* coo_rows_ptr = coo_rows_data + batch_start;
   const int tid = threadIdx.x + blockIdx.x * blockDim.x;
   for (int i = tid; i < batch_non_zero_num; i += gridDim.x * blockDim.x) {
     if (i == 0) {
-      for (int j = 0; j <= coo_rows_ptr[0]; j++) {
+      for (IntT j = 0; j <= coo_rows_ptr[0]; j++) {
         csr_crows_data[b * (rows + 1) + j] = 0;
       }
     } else {
-      for (int j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) {
+      for (IntT j = coo_rows_ptr[i - 1]; j < coo_rows_ptr[i]; j++) {
         csr_crows_data[b * (rows + 1) + j + 1] = i;
       }
     }
     if (i == batch_non_zero_num - 1) {
-      for (int64_t i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1;
+      for (IntT i = coo_rows_ptr[batch_non_zero_num - 1] + 1; i < rows + 1;
            i++) {
         csr_crows_data[b * (rows + 1) + i] = batch_non_zero_num;
       }
@@ -359,8 +330,8 @@ __global__ void ConvertCooRowsToCsrCrows(
   }
 }
 
-template <typename T, typename Context>
-void SparseCooToCsrKernel(const Context& dev_ctx,
+template <typename T, typename IntT>
+void SparseCooToCsrGPUKernel(const GPUContext& dev_ctx,
                              const SparseCooTensor& x,
                              SparseCsrTensor* out) {
   const auto& x_dims = x.dims();
@@ -376,78 +347,71 @@ void SparseCooToCsrKernel(const Context& dev_ctx,
   int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
   phi::DenseTensor non_zero_crows =
-      phi::Empty<int64_t>(dev_ctx, {batchs * (rows + 1)});
-  phi::DenseTensor non_zero_cols = phi::Empty<int64_t>(dev_ctx, {non_zero_num});
-  phi::DenseTensor non_zero_elements = phi::Empty<T>(dev_ctx, {non_zero_num});
-  int64_t* csr_crows_data = non_zero_crows.data<int64_t>();
-  int64_t* csr_cols_data = non_zero_cols.data<int64_t>();
+      phi::Empty<IntT>(dev_ctx, {batchs * (rows + 1)});
+  phi::DenseTensor non_zero_cols = phi::Empty<IntT>(dev_ctx, {non_zero_num});
+  phi::DenseTensor non_zero_elements =
+      phi::EmptyLike<T, GPUContext>(dev_ctx, x.non_zero_elements());
+  IntT* csr_crows_data = non_zero_crows.data<IntT>();
+  IntT* csr_cols_data = non_zero_cols.data<IntT>();
   T* csr_values_data = non_zero_elements.data<T>();
 
   const auto& coo_indices = x.non_zero_indices();
   const auto& coo_values = x.non_zero_elements();
-  const int64_t* batchs_ptr = coo_indices.data<int64_t>();
-  const int64_t* coo_rows_data =
+  const IntT* batchs_ptr = coo_indices.data<IntT>();
+  const IntT* coo_rows_data =
       batchs == 1 ? batchs_ptr : batchs_ptr + non_zero_num;
-  const int64_t* coo_cols_data = coo_rows_data + non_zero_num;
+  const IntT* coo_cols_data = coo_rows_data + non_zero_num;
   const T* coo_values_data = coo_values.data<T>();
-  if (!x.coalesced()) {
-    // TODO(zhangkahuo): call coalesced() to distinct and sort the indices
-  }
 
   auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1);
   if (batchs > 1) {
-    DenseTensorMeta batchs_meta(DataType::INT64, {batchs}, DataLayout::NCHW);
-    phi::DenseTensor batchs_offset = phi::Empty<int64_t>(dev_ctx, {batchs});
-    int64_t* batchs_offset_ptr = batchs_offset.data<int64_t>();
-    GetBatchsOffset<<<config.block_per_grid.x,
-                      config.thread_per_block.x,
-                      0,
-                      dev_ctx.stream()>>>(
-        batchs_ptr, non_zero_num, batchs_offset_ptr);
+    phi::DenseTensor batchs_offset = phi::Empty<IntT>(dev_ctx, {batchs});
+    IntT* batchs_offset_ptr = batchs_offset.data<IntT>();
+    GetBatchsOffset<IntT>
+        <<<config.block_per_grid.x,
+           config.thread_per_block.x,
+           0,
+           dev_ctx.stream()>>>(batchs_ptr, non_zero_num, batchs_offset_ptr);
     config.block_per_grid.y = batchs;
-    ConvertCooRowsToCsrCrows<<<config.block_per_grid,
-                               config.thread_per_block.x,
-                               0,
-                               dev_ctx.stream()>>>(
+    ConvertCooRowsToCsrCrows<IntT><<<config.block_per_grid,
+                                     config.thread_per_block.x,
+                                     0,
+                                     dev_ctx.stream()>>>(
         batchs_offset_ptr, coo_rows_data, csr_crows_data, rows, non_zero_num);
   } else {
-    ConvertCooRowsToCsrCrows<<<config.block_per_grid.x,
-                               config.thread_per_block.x,
-                               0,
-                               dev_ctx.stream()>>>(
+    ConvertCooRowsToCsrCrows<IntT><<<config.block_per_grid.x,
+                                     config.thread_per_block.x,
+                                     0,
+                                     dev_ctx.stream()>>>(
         nullptr, coo_rows_data, csr_crows_data, rows, non_zero_num);
   }
 
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(csr_cols_data,
-                                            coo_cols_data,
-                                            sizeof(int64_t) * non_zero_num,
-                                            hipMemcpyDeviceToDevice,
-                                            dev_ctx.stream()));
-  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(csr_values_data,
-                                            coo_values_data,
-                                            sizeof(T) * non_zero_num,
-                                            hipMemcpyDeviceToDevice,
-                                            dev_ctx.stream()));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(csr_cols_data,
-                                             coo_cols_data,
-                                             sizeof(int64_t) * non_zero_num,
-                                             cudaMemcpyDeviceToDevice,
-                                             dev_ctx.stream()));
-  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(csr_values_data,
-                                             coo_values_data,
-                                             sizeof(T) * non_zero_num,
-                                             cudaMemcpyDeviceToDevice,
-                                             dev_ctx.stream()));
-#endif
+  phi::backends::gpu::GpuMemcpyAsync(csr_cols_data,
+                                     coo_cols_data,
+                                     sizeof(IntT) * non_zero_num,
+                                     gpuMemcpyDeviceToDevice,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemcpyAsync(csr_values_data,
+                                     coo_values_data,
+                                     sizeof(T) * non_zero_num,
+                                     gpuMemcpyDeviceToDevice,
+                                     dev_ctx.stream());
   out->SetMember(non_zero_crows, non_zero_cols, non_zero_elements, x_dims);
 }
 
+template <typename T, typename Context>
+void SparseCooToCsrKernel(const Context& dev_ctx,
+                          const SparseCooTensor& x,
+                          SparseCsrTensor* out) {
+  PD_VISIT_INTEGRAL_TYPES(
+      x.non_zero_indices().dtype(), "SparseCooToCsrGPUKernel", ([&] {
+        SparseCooToCsrGPUKernel<T, data_t>(dev_ctx, x, out);
+      }));
+}
+
 template <typename ValueT, typename IndicesT>
 __global__ void KernelSparseCooToDense(const IndicesT* indices,
-                                       const IndicesT* sparse_offsets,
+                                       const int64_t* sparse_offsets,
                                        const ValueT* data,
                                        ValueT* dense_data,
                                        const IndicesT non_zero_num,
@@ -466,8 +430,8 @@ __global__ void KernelSparseCooToDense(const IndicesT* indices,
   }
 }
 
-template <typename T, typename Context>
-void SparseCooToDenseKernel(const Context& dev_ctx,
+template <typename T, typename IntT>
+void SparseCooToDenseGPUKernel(const GPUContext& dev_ctx,
                                const SparseCooTensor& x,
                                DenseTensor* out) {
   const auto non_zero_num = x.nnz();
@@ -498,38 +462,24 @@ void SparseCooToDenseKernel(const Context& dev_ctx,
     offset *= dense_dims[i];
   }
 
-  auto sparse_offset_meta = phi::DenseTensorMeta(
-      DataType::INT64, {sparse_dim}, phi::DataLayout::NCHW);
-  DenseTensor d_sparse_offsets = Empty(dev_ctx, std::move(sparse_offset_meta));
-#ifdef PADDLE_WITH_HIP
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      hipMemcpyAsync(d_sparse_offsets.mutable_data<int64_t>(place),
-                     sparse_offsets.data(),
-                     sparse_dim * sizeof(int64_t),
-                     hipMemcpyHostToDevice,
-                     dev_ctx.stream()));
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      hipMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream()));
-#else
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemcpyAsync(d_sparse_offsets.mutable_data<int64_t>(place),
-                      sparse_offsets.data(),
-                      sparse_dim * sizeof(int64_t),
-                      cudaMemcpyHostToDevice,
-                      dev_ctx.stream()));
-  PADDLE_ENFORCE_GPU_SUCCESS(
-      cudaMemsetAsync(out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream()));
-#endif
+  DenseTensor d_sparse_offsets = Empty<int64_t>(dev_ctx, {sparse_dim});
+  phi::backends::gpu::GpuMemcpyAsync(d_sparse_offsets.data<int64_t>(),
+                                     sparse_offsets.data(),
+                                     sparse_dim * sizeof(int64_t),
+                                     gpuMemcpyHostToDevice,
+                                     dev_ctx.stream());
+  phi::backends::gpu::GpuMemsetAsync(
+      out_data, 0, sizeof(T) * out->numel(), dev_ctx.stream());
 
   auto config =
       phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
-  KernelSparseCooToDense<T, int64_t>
+  KernelSparseCooToDense<T, IntT>
       <<<config.block_per_grid.x,
         config.thread_per_block.x,
         0,
-         dev_ctx.stream()>>>(indices.data<int64_t>(),
+         dev_ctx.stream()>>>(indices.data<IntT>(),
                              d_sparse_offsets.data<int64_t>(),
                              x_data,
                              out_data,
@@ -538,6 +488,16 @@ void SparseCooToDenseKernel(const Context& dev_ctx,
                              sparse_dim);
 }
 
+template <typename T, typename Context>
+void SparseCooToDenseKernel(const Context& dev_ctx,
+                            const SparseCooTensor& x,
+                            DenseTensor* out) {
+  PD_VISIT_INTEGRAL_TYPES(
+      x.non_zero_indices().dtype(), "SparseCooToDenseGPUKernel", ([&] {
+        SparseCooToDenseGPUKernel<T, data_t>(dev_ctx, x, out);
+      }));
+}
+
 }  // namespace sparse
 }  // namespace phi
......
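The GPU hunks above replace the per-backend #ifdef PADDLE_WITH_HIP blocks with phi::backends::gpu::GpuMemcpyAsync / GpuMemsetAsync plus the unified gpuMemcpy* constants. A minimal sketch of what such thin wrappers presumably look like is given below; the names and argument order mirror the call sites in this diff, but the bodies are an assumption, not Paddle's actual implementation (which routes errors through PADDLE_ENFORCE_GPU_SUCCESS).

```cpp
// Assumed, simplified wrappers in the spirit of phi::backends::gpu. Error
// handling is reduced to a bare return-code check for the sake of the sketch.
#include <cstddef>
#include <cstdlib>

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
using gpu_stream_t = hipStream_t;       // illustrative aliases, not phi's names
using gpu_memcpy_kind = hipMemcpyKind;
#else
#include <cuda_runtime.h>
using gpu_stream_t = cudaStream_t;
using gpu_memcpy_kind = cudaMemcpyKind;
#endif

inline void GpuMemcpyAsync(void* dst, const void* src, std::size_t count,
                           gpu_memcpy_kind kind, gpu_stream_t stream) {
#ifdef PADDLE_WITH_HIP
  if (hipMemcpyAsync(dst, src, count, kind, stream) != hipSuccess) std::abort();
#else
  if (cudaMemcpyAsync(dst, src, count, kind, stream) != cudaSuccess) std::abort();
#endif
}

inline void GpuMemsetAsync(void* dst, int value, std::size_t count,
                           gpu_stream_t stream) {
#ifdef PADDLE_WITH_HIP
  if (hipMemsetAsync(dst, value, count, stream) != hipSuccess) std::abort();
#else
  if (cudaMemsetAsync(dst, value, count, stream) != cudaSuccess) std::abort();
#endif
}
```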
@@ -168,8 +168,10 @@ class TestSparseConvert(unittest.TestCase):
         with _test_eager_guard():
             indices = [[0, 0, 1, 2, 2], [1, 3, 2, 0, 1]]
             values = [1.0, 2.0, 3.0, 4.0, 5.0]
-            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
-                paddle.to_tensor(indices),
-                paddle.to_tensor(values),
-                shape=[3, 4],
-                stop_gradient=False)
+            indices_dtypes = ['int32', 'int64']
+            for indices_dtype in indices_dtypes:
+                sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
+                    paddle.to_tensor(indices, dtype=indices_dtype),
+                    paddle.to_tensor(values),
+                    shape=[3, 4],
+                    stop_gradient=False)
@@ -185,7 +187,7 @@ class TestSparseConvert(unittest.TestCase):
                 paddle.device.set_device("cpu")
                 sparse_x_cpu = paddle.incubate.sparse.sparse_coo_tensor(
-                    paddle.to_tensor(indices),
+                    paddle.to_tensor(indices, dtype=indices_dtype),
                     paddle.to_tensor(values),
                     shape=[3, 4],
                     stop_gradient=False)
......