未验证 提交 bafea65c 编写于 作者: Z zhangkaihuo 提交者: GitHub

Add a Sparse OP:sparse_csr_to_coo (#39266)

* dense_to_sparse_coo

* optimize unit testing; support rocm

* 1. delete fluid related header file
2. update the copyright

* fix hipMemcpy

* update dense_to_sparsecoo

* add namespace sparse

* sparse_csr_to_dense

* test to_sparse_coo: csr_to_coo

* fix writing error
上级 7e29cea9
develop 1.8.5 2.4.1 Ligoml-patch-1 ZHUI-patch-1 add_kylinv10 add_some_yaml_config bugfix-eval-frame-leakgae cherry-pick-fix-customOP-random-fail cp_2.4_fix_numpy dingjiaweiww-patch-1 dy2static enable_eager_model_test final_state_intermediate fix-numpy-issue fix-run-program-grad-node-mem fix_check fix_custom_device_copy_sync fix_dlpack_for fix_newexe_gc fix_rnn_docs fix_tensor_type fix_var_stop_gradient_error hack_event incuabte/new_frl incubate/frl_train_eval incubate/new_frl incubate/new_frl_rc incubate/stride inplace_addto layer_norm matmul_double_grad move_embedding_to_phi move_histogram_to_pten move_sgd_to_phi move_slice_to_pten move_temporal_shift_to_phi move_yolo_box_to_phi npu_fix_alloc operator_opt pass-compile-eval-frame preln_ernie prv-md-even-more prv-onednn-2.5 prv-reshape-mkldnn-ut2 release-deleted/2.5 release-rc/2.5 release/2.3 release/2.3-fc-ernie-fix release/2.4 release/2.5 release/llm_2.5 revert-39143-adjust_empty revert-39227-move_trace_op_to_pten revert-39268-dev/remove_concat_fluid_kernel revert-40170-support_partial_grad revert-41056-revert-40727-move_some_activaion_to_phi revert-41065-revert-40993-mv_ele_floordiv_pow revert-41068-revert-40790-phi_new revert-41944-smaller_inference_api_test revert-42149-do-not-reset-default-stream-for-stream-safe-cuda-allocator revert-43155-fix_ut_tempfile revert-43882-revert-41944-smaller_inference_api_test revert-45808-phi/simplify_size_op revert-46827-deform_comment revert-47325-remove_cudnn_hardcode revert-47645-add_npu_storage_dims revert-48815-set_free_when_no_cache_hit_default_value_true revert-49499-test_ninja_on_ci revert-49654-prim_api_gen revert-49673-modify_get_single_cov revert-49763-fix_static_composite_gen revert-50158-fix_found_inf_bug_for_custom_optimizer revert-50188-refine_optimizer_create_accumulators revert-50335-fix_optminizer_set_auxiliary_var_bug revert-51676-flag_delete revert-51850-fix_softmaxce_dev revert-52175-dev_peak_memory revert-52186-deve revert-52523-test_py38 
revert-52912-develop revert-53248-set_cmake_policy revert-54029-fix_windows_compile_bug revert-54068-support_translating_op_attribute revert-54214-modify_cmake_dependencies revert-54370-offline_pslib revert-54391-fix_cmake_md5error revert-54411-fix_cpp17_compile revert-54466-offline_pslib revert-54480-cmake-rocksdb revert-55568-fix_BF16_bug1 revert-56328-new_ir_support_vector_type_place_transfer revert-56366-fix_openssl_bug revert-56545-revert-56366-fix_openssl_bug revert-56620-fix_new_ir_ocr_bug revert-56925-check_inputs_grad_semantic revert-57005-refine_stride_flag sd_conv_linear_autocast semi-auto/rule-base support-0D-sort support_weight_transpose test_for_Filtetfiles zhiqiu-patch-1 v2.5.1 v2.5.0 v2.5.0-rc1 v2.5.0-rc0 v2.4.2 v2.4.1 v2.4.0 v2.4.0-rc0 v2.3.2 v2.3.1 v2.3.0 v2.3.0-rc0
无相关合并请求
......@@ -26,10 +26,6 @@ inline void check_shape(const DDim& dims) {
#define Check(non_zero_crows, non_zero_cols, non_zero_elements, dims) \
{ \
check_shape(dims); \
PADDLE_ENFORCE_EQ(dims.size(), \
2, \
paddle::platform::errors::InvalidArgument( \
"the SparseCsrTensor only support 2-D Tensor.")); \
PADDLE_ENFORCE_EQ( \
non_zero_cols.place(), \
non_zero_crows.place(), \
......@@ -50,7 +46,12 @@ SparseCsrTensor::SparseCsrTensor(const DenseTensor& non_zero_crows,
non_zero_cols_(non_zero_cols),
non_zero_elements_(non_zero_elements),
dims_(dims) {
Check(non_zero_crows_, non_zero_cols_, non_zero_elements_, dims_);
if (non_zero_crows.initialized()) {
Check(non_zero_crows_, non_zero_cols_, non_zero_elements_, dims_);
} else {
// create a empty tensor
check_shape(dims);
}
}
SparseCsrTensor::SparseCsrTensor(const SparseCsrTensor& other)
......
......@@ -102,6 +102,61 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
out->SetMember(indices, values, x_dims, true);
}
template <typename T, typename Context>
void SparseCsrToCooKernel(const Context& dev_ctx,
                          const SparseCsrTensor& x,
                          SparseCooTensor* out) {
  // Converts an (optionally batched) CSR sparse tensor `x` into COO format
  // on the CPU and stores the result in `out`.
  //
  // Layout of the produced COO indices tensor (shape: sparse_dim x nnz):
  //   2-D input: [ row | col ] slices of length nnz
  //   3-D input: [ batch | row | col ] slices of length nnz
  const DDim& x_dims = x.dims();
  const int64_t non_zero_num = x.non_zero_cols().numel();
  const auto& csr_crows = x.non_zero_crows();
  const auto& csr_cols = x.non_zero_cols();
  const auto& csr_values = x.non_zero_elements();
  const int64_t* csr_crows_data = csr_crows.data<int64_t>();
  const int64_t* csr_cols_data = csr_cols.data<int64_t>();
  const T* csr_values_data = csr_values.data<T>();

  // A 3-D dims means a batched matrix: indices gain a leading batch row.
  int64_t sparse_dim = 2;
  if (x_dims.size() == 3) {
    sparse_dim = 3;
  }

  const auto place = dev_ctx.GetPlace();
  DenseTensorMeta indices_meta(
      DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW);
  DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout());
  pten::DenseTensor indices =
      pten::Empty<int64_t, Context>(dev_ctx, std::move(indices_meta));
  pten::DenseTensor values =
      pten::Empty<T, Context>(dev_ctx, std::move(values_meta));

  // Carve the flat indices buffer into its [batch | row | col] slices.
  int64_t* coo_indices = indices.mutable_data<int64_t>(place);
  int64_t* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
  int64_t* coo_rows_data =
      x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num;
  int64_t* coo_cols_data = coo_rows_data + non_zero_num;
  T* coo_values_data = values.mutable_data<T>(place);

  int batch = x_dims.size() == 2 ? 1 : x_dims[0];
  int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];

  // Expand the compressed row pointers into explicit (batch, row) indices.
  // BUGFIX: the running output position and the values read from the int64
  // crows array were previously held in `int`, silently truncating when the
  // number of non-zeros exceeds INT_MAX.
  int64_t index = 0;
  for (int b = 0; b < batch; b++) {
    for (int i = 0; i < rows; i++) {
      for (int64_t j = csr_crows_data[b * (rows + 1) + i];
           j < csr_crows_data[b * (rows + 1) + i + 1];
           j++) {
        coo_rows_data[index] = i;
        if (batch_ptr) {
          batch_ptr[index] = b;
        }
        ++index;
      }
    }
  }
  // Column indices and values are shared verbatim between CSR and COO.
  memcpy(coo_cols_data, csr_cols_data, sizeof(int64_t) * non_zero_num);
  memcpy(coo_values_data, csr_values_data, sizeof(T) * non_zero_num);
  out->SetMember(indices, values, x_dims, true);
}
} // namespace sparse
} // namespace pten
......@@ -117,3 +172,16 @@ PT_REGISTER_KERNEL(dense_to_sparse_coo,
int16_t,
int,
int64_t) {}
// Register the CPU CSR->COO conversion kernel for all supported value types.
PT_REGISTER_KERNEL(sparse_csr_to_coo,
                   CPU,
                   ALL_LAYOUT,
                   pten::sparse::SparseCsrToCooKernel,
                   float,
                   double,
                   paddle::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}
......@@ -214,6 +214,122 @@ void DenseToSparseCooKernel(const Context& dev_ctx,
out->SetMember(indices, values, x_dims, true);
}
__global__ void GetBatchSizes(const int64_t* crows,
                              const int rows,
                              const int batchs,
                              int* batch_sizes) {
  // One thread per batch: the final entry of each batch's crows segment,
  // crows[b * (rows + 1) + rows], is that batch's non-zero count.
  // NOTE(review): batch_sizes is int32 while crows holds int64 counts;
  // presumably per-batch nnz fits in int32 — confirm upstream.
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= batchs) {
    return;
  }
  batch_sizes[idx] = crows[idx * (rows + 1) + rows];
}
__global__ void ConvertCsrCrowsToCooRows(const int64_t* crows_ptr,
                                         const int* crows_offsets,
                                         int64_t* rows_ptr,
                                         int64_t* batch_ptr,
                                         const int rows) {
  // Grid layout: blockIdx.y selects the batch; the x-dimension threads
  // stride over the rows of that batch (i += gridDim.x * blockDim.x).
  // crows_offsets[b] is the exclusive prefix sum of nnz over batches
  // (nullptr when there is a single batch, i.e. offset 0).
  const int b = blockIdx.y;
  const int64_t offset = crows_offsets ? crows_offsets[b] : 0;
  const int tid = threadIdx.x + blockIdx.x * blockDim.x;
  for (int i = tid; i < rows; i += gridDim.x * blockDim.x) {
    // BUGFIX: crows entries are int64; iterating with `int` truncated the
    // element positions for tensors with more than INT_MAX non-zeros.
    for (int64_t j = crows_ptr[b * (rows + 1) + i];
         j < crows_ptr[b * (rows + 1) + i + 1];
         j++) {
      rows_ptr[offset + j] = i;
      if (batch_ptr) {
        batch_ptr[offset + j] = b;
      }
    }
  }
}
template <typename T, typename Context>
void SparseCsrToCooKernel(const Context& dev_ctx,
                          const SparseCsrTensor& x,
                          SparseCooTensor* out) {
  // Converts an (optionally batched) CSR sparse tensor `x` into COO format
  // on the GPU and stores the result in `out`.
  const DDim& x_dims = x.dims();
  const int64_t non_zero_num = x.non_zero_cols().numel();
  const auto& csr_crows = x.non_zero_crows();
  const auto& csr_cols = x.non_zero_cols();
  const auto& csr_values = x.non_zero_elements();
  const int64_t* csr_crows_data = csr_crows.data<int64_t>();
  const int64_t* csr_cols_data = csr_cols.data<int64_t>();
  const T* csr_values_data = csr_values.data<T>();

  // A 3-D dims means a batched matrix: indices gain a leading batch row.
  int64_t sparse_dim = 2;
  if (x_dims.size() == 3) {
    sparse_dim = 3;
  }
  int batchs = x_dims.size() == 2 ? 1 : x_dims[0];
  int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];

  const auto place = dev_ctx.GetPlace();
  DenseTensorMeta indices_meta(
      DataType::INT64, {sparse_dim, non_zero_num}, DataLayout::NCHW);
  DenseTensorMeta values_meta(x.dtype(), {non_zero_num}, x.layout());
  DenseTensorMeta offsets_meta(DataType::INT32, {batchs}, DataLayout::NCHW);
  DenseTensor indices =
      pten::Empty<int64_t, Context>(dev_ctx, std::move(indices_meta));
  DenseTensor values = pten::Empty<T, Context>(dev_ctx, std::move(values_meta));
  // BUGFIX: offsets holds int32 batch offsets (see offsets_meta) and is read
  // below via mutable_data<int>; it was previously allocated with element
  // type T.
  DenseTensor offsets =
      pten::Empty<int, Context>(dev_ctx, std::move(offsets_meta));

  // Carve the flat indices buffer into its [batch | row | col] slices.
  int64_t* coo_indices = indices.mutable_data<int64_t>(place);
  int64_t* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
  int64_t* coo_rows_data =
      x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num;
  int64_t* coo_cols_data = coo_rows_data + non_zero_num;
  int* offsets_ptr = batchs == 1 ? nullptr : offsets.mutable_data<int>(place);
  T* coo_values_data = values.mutable_data<T>(place);

  int grid_size = 1, block_size = 1;
  if (batchs > 1) {
    // Per-batch nnz, then exclusive scan -> write offset of each batch.
    GetGpuLaunchConfig1D(dev_ctx, batchs, &grid_size, &block_size);
    // BUGFIX: launch on dev_ctx.stream(); the default-stream launch was not
    // ordered with the thrust scan and the async copies below.
    GetBatchSizes<<<grid_size, block_size, 0, dev_ctx.stream()>>>(
        csr_crows_data, rows, batchs, offsets_ptr);
#ifdef PADDLE_WITH_HIP
    thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
#else
    thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
                           offsets_ptr,
                           offsets_ptr + batchs,
                           offsets_ptr);
  }

  GetGpuLaunchConfig1D(dev_ctx, rows, &grid_size, &block_size);
  dim3 grids(grid_size, batchs, 1);
  // BUGFIX: same stream fix as above.
  ConvertCsrCrowsToCooRows<<<grids, block_size, 0, dev_ctx.stream()>>>(
      csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);

  // Column indices and values are shared verbatim between CSR and COO.
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_cols_data,
                                            csr_cols_data,
                                            sizeof(int64_t) * non_zero_num,
                                            hipMemcpyDeviceToDevice,
                                            dev_ctx.stream()));
  PADDLE_ENFORCE_GPU_SUCCESS(hipMemcpyAsync(coo_values_data,
                                            csr_values_data,
                                            sizeof(T) * non_zero_num,
                                            hipMemcpyDeviceToDevice,
                                            dev_ctx.stream()));
#else
  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_cols_data,
                                             csr_cols_data,
                                             sizeof(int64_t) * non_zero_num,
                                             cudaMemcpyDeviceToDevice,
                                             dev_ctx.stream()));
  PADDLE_ENFORCE_GPU_SUCCESS(cudaMemcpyAsync(coo_values_data,
                                             csr_values_data,
                                             sizeof(T) * non_zero_num,
                                             cudaMemcpyDeviceToDevice,
                                             dev_ctx.stream()));
#endif
  out->SetMember(indices, values, x_dims, true);
}
} // namespace sparse
} // namespace pten
......@@ -229,3 +345,16 @@ PT_REGISTER_KERNEL(dense_to_sparse_coo,
int16_t,
int,
int64_t) {}
// Register the GPU CSR->COO conversion kernel for all supported value types.
PT_REGISTER_KERNEL(sparse_csr_to_coo,
                   GPU,
                   ALL_LAYOUT,
                   pten::sparse::SparseCsrToCooKernel,
                   float,
                   double,
                   pten::dtype::float16,
                   uint8_t,
                   int8_t,
                   int16_t,
                   int,
                   int64_t) {}
......@@ -57,5 +57,20 @@ SparseCooTensor DenseToSparseCoo(const Context& dev_ctx,
return coo;
}
// Kernel entry point: converts the CSR sparse tensor `x` into COO format,
// writing the result into `out`. Implemented per backend (CPU/GPU).
template <typename T, typename Context>
void SparseCsrToCooKernel(const Context& dev_ctx,
                          const SparseCsrTensor& x,
                          SparseCooTensor* out);

// Convenience wrapper that allocates an output SparseCooTensor, runs the
// kernel, and returns the result by value.
template <typename T, typename Context>
SparseCooTensor SparseCsrToCoo(const Context& dev_ctx,
                               const SparseCsrTensor& x) {
  // indices/values are placeholders only: the kernel installs freshly
  // allocated tensors via out->SetMember, so their element type here does
  // not affect the final result.
  DenseTensor indices = pten::Empty<T, Context>(dev_ctx);
  DenseTensor values = pten::Empty<T, Context>(dev_ctx);
  SparseCooTensor coo(indices, values, x.dims());
  SparseCsrToCooKernel<T, Context>(dev_ctx, x, &coo);
  return coo;
}
} // namespace sparse
} // namespace pten
......@@ -62,4 +62,43 @@ TEST(API, to_sparse_coo) {
non_zero_data.data(),
non_zero_data.size() * sizeof(float));
ASSERT_EQ(cmp_elements, 0);
// 1. test sparse_csr_to_coo
auto dense_dims = pten::framework::make_ddim({3, 3});
pten::DenseTensorMeta crows_meta(
pten::DataType::INT64, {dense_dims[0] + 1}, pten::DataLayout::NCHW);
pten::DenseTensorMeta cols_meta(
pten::DataType::INT64, {non_zero_num}, pten::DataLayout::NCHW);
pten::DenseTensorMeta values_meta(
pten::DataType::FLOAT32, {non_zero_num}, pten::DataLayout::NCHW);
pten::CPUPlace place;
pten::DenseTensor crows(alloc.get(), crows_meta);
pten::DenseTensor cols(alloc.get(), cols_meta);
pten::DenseTensor values(alloc.get(), values_meta);
memcpy(crows.mutable_data<int64_t>(place),
crows_data.data(),
crows_data.size() * sizeof(int64_t));
memcpy(cols.mutable_data<int64_t>(place),
cols_data.data(),
cols_data.size() * sizeof(int64_t));
memcpy(values.mutable_data<float>(place),
non_zero_data.data(),
non_zero_data.size() * sizeof(float));
auto csr =
std::make_shared<pten::SparseCsrTensor>(crows, cols, values, dense_dims);
paddle::experimental::Tensor csr_x(csr);
auto out2 = paddle::experimental::sparse::to_sparse_coo(
csr_x, pten::Backend::CPU, sparse_dim);
auto coo2 = std::dynamic_pointer_cast<pten::SparseCooTensor>(out.impl());
ASSERT_EQ(coo2->nnz(), non_zero_num);
int cmp_indices2 = memcmp(coo2->non_zero_indices().data<int64_t>(),
indices_data.data(),
indices_data.size() * sizeof(int64_t));
ASSERT_EQ(cmp_indices2, 0);
int cmp_elements2 = memcmp(coo2->non_zero_elements().data<float>(),
non_zero_data.data(),
non_zero_data.size() * sizeof(float));
ASSERT_EQ(cmp_elements2, 0);
}
......@@ -246,5 +246,112 @@ TEST(DEV_API, to_sparse_coo_batch) {
dense_x, sparse_dim, non_zero_data, indices_data, non_zero_num);
}
// Builds a CSR tensor from the given host buffers, runs SparseCsrToCoo on
// the CPU (and on the GPU when compiled with CUDA), and checks the resulting
// COO indices/values against the expected data.
//
// dense_dims:    logical dense shape, 2-D {rows, cols} or batched 3-D
//                {batchs, rows, cols}
// non_zero_data: stored non-zero values (length non_zero_num)
// crows_data:    compressed row pointers, batchs * (rows + 1) entries
// cols_data:     column index of each non-zero
// indices_data:  expected COO indices, flattened row-major
// non_zero_num:  number of stored non-zeros
template <typename T>
void TestSparseCsrToCoo(const DDim& dense_dims,
                        const std::vector<T>& non_zero_data,
                        const std::vector<int64_t>& crows_data,
                        const std::vector<int64_t>& cols_data,
                        const std::vector<int64_t>& indices_data,
                        const int64_t non_zero_num) {
  // A 3-D dims means a batched matrix.
  int batchs = 1;
  int rows = dense_dims[0];
  if (dense_dims.size() == 3) {
    batchs = dense_dims[0];
    rows = dense_dims[1];
  }
  pten::DenseTensorMeta crows_meta(
      DataType::INT64, {batchs * (rows + 1)}, DataLayout::NCHW);
  pten::DenseTensorMeta cols_meta(
      DataType::INT64, {non_zero_num}, DataLayout::NCHW);
  pten::DenseTensorMeta values_meta(
      paddle::experimental::CppTypeToDataType<T>::Type(),
      {non_zero_num},
      DataLayout::NCHW);
  const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
      paddle::platform::CPUPlace());
  pten::CPUPlace place;
  // Populate the three CSR component tensors from the host vectors.
  pten::DenseTensor crows(alloc.get(), crows_meta);
  pten::DenseTensor cols(alloc.get(), cols_meta);
  pten::DenseTensor values(alloc.get(), values_meta);
  memcpy(crows.mutable_data<int64_t>(place),
         crows_data.data(),
         crows_data.size() * sizeof(int64_t));
  memcpy(cols.mutable_data<int64_t>(place),
         cols_data.data(),
         cols_data.size() * sizeof(int64_t));
  memcpy(values.mutable_data<T>(place),
         non_zero_data.data(),
         non_zero_data.size() * sizeof(T));
  pten::SparseCsrTensor csr(crows, cols, values, dense_dims);

  // 1. test cpu
  pten::CPUContext dev_ctx_cpu;
  auto cpu_sparse_out = sparse::SparseCsrToCoo<T>(dev_ctx_cpu, csr);
  CheckResult<T, int64_t>(&dev_ctx_cpu,
                          cpu_sparse_out,
                          non_zero_data,
                          indices_data,
                          non_zero_num,
                          alloc);

  // 2. test cuda
  // NOTE(review): guarded by PADDLE_WITH_CUDA only; the op also targets ROCm
  // (PADDLE_WITH_HIP) — confirm whether this branch should cover it too.
#if defined(PADDLE_WITH_CUDA)
  const auto cuda_alloc =
      std::make_shared<paddle::experimental::DefaultAllocator>(
          paddle::platform::CUDAPlace());
  auto& pool = paddle::platform::DeviceContextPool::Instance();
  auto* dev_ctx_cuda = pool.GetByPlace(paddle::platform::CUDAPlace());
  // Copy the CSR components to the device (blocking copies) before running
  // the GPU kernel.
  pten::DenseTensor d_crows(cuda_alloc.get(), crows_meta);
  pten::DenseTensor d_cols(cuda_alloc.get(), cols_meta);
  pten::DenseTensor d_values(cuda_alloc.get(), values_meta);
  pten::Copy(*dev_ctx_cuda, crows, true, &d_crows);
  pten::Copy(*dev_ctx_cuda, cols, true, &d_cols);
  pten::Copy(*dev_ctx_cuda, values, true, &d_values);
  pten::SparseCsrTensor d_csr(d_crows, d_cols, d_values, dense_dims);
  auto cuda_sparse_out = sparse::SparseCsrToCoo<T>(*dev_ctx_cuda, d_csr);
  CheckResult<T, int64_t>(dev_ctx_cuda,
                          cuda_sparse_out,
                          non_zero_data,
                          indices_data,
                          non_zero_num,
                          alloc);
#endif
}
TEST(DEV_API, sparse_csr_to_coo) {
  // Single (non-batched) 3x3 matrix with 4 stored non-zeros.
  const int64_t non_zero_num = 4;
  DDim dense_dims = framework::make_ddim({3, 3});
  std::vector<float> non_zero_data = {1.0, 2.0, 3.0, 3.2};
  std::vector<int64_t> crows_data = {0, 1, 3, 4};
  std::vector<int64_t> cols_data = {1, 0, 2, 0};
  // Expected COO indices, flattened as [row indices | col indices].
  std::vector<int64_t> indices_data = {0, 1, 1, 2, 1, 0, 2, 0};
  TestSparseCsrToCoo(dense_dims,
                     non_zero_data,
                     crows_data,
                     cols_data,
                     indices_data,
                     non_zero_num);
}
TEST(DEV_API, sparse_csr_to_coo_batch_and_fp16) {
  // Batched case: two identical 3x3 matrices, converted at float16.
  using float16 = pten::dtype::float16;
  const int64_t non_zero_num = 8;
  DDim dense_dims = framework::make_ddim({2, 3, 3});
  std::vector<float> non_zero_data = {1.0, 2.0, 3.0, 3.2, 1.0, 2.0, 3.0, 3.2};
  std::vector<int64_t> crows_data = {0, 1, 3, 4, 0, 1, 3, 4};
  std::vector<int64_t> cols_data = {1, 0, 2, 0, 1, 0, 2, 0};
  // Expected COO indices, flattened as [batch | row | col] slices.
  std::vector<int64_t> indices_data = {0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 2,
                                       0, 1, 1, 2, 1, 0, 2, 0, 1, 0, 2, 0};
  // Down-convert the reference values element-by-element to float16.
  std::vector<float16> fp16_values(non_zero_num);
  for (int64_t idx = 0; idx < non_zero_num; idx++) {
    fp16_values[idx] = static_cast<float16>(non_zero_data[idx]);
  }
  TestSparseCsrToCoo(dense_dims,
                     fp16_values,
                     crows_data,
                     cols_data,
                     indices_data,
                     non_zero_num);
}
} // namespace tests
} // namespace pten
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册
反馈
建议
客服 返回
顶部