Unverified commit 281ea2f4, authored by umiswing and committed by GitHub

[Dcu]: Add rocsparse_spmm for dcu. (#52200)

Parent 5fbcf37d
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/dynload/rocsparse.h"
namespace paddle {
namespace platform {
namespace dynload {
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef ROCSPARSE_ROUTINE_EACH
ROCSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif
#ifdef ROCSPARSE_ROUTINE_EACH_R2
ROCSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef ROCSPARSE_ROUTINE_EACH_R3
ROCSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace platform
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <hip/hip_runtime.h>
#include <rocsparse.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/phi/backends/dynload/rocsparse.h"
namespace paddle {
namespace platform {
namespace dynload {
/**
 * The following macro definition generates a struct for each listed
 * function, which dynamically loads the corresponding rocsparse routine
 * via operator overloading.
 *
 * note: resolved from the default dynamically linked library
*/
#define PLATFORM_DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP(__name) \
using DynLoad__##__name = phi::dynload::DynLoad__##__name; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_HIP)
#define ROCSPARSE_ROUTINE_EACH(__macro) \
__macro(rocsparse_create_handle); \
__macro(rocsparse_destroy_handle); \
__macro(rocsparse_set_stream); \
__macro(rocsparse_csr2coo);
ROCSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP)
#if HIP_VERSION >= 402
#define ROCSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(rocsparse_create_coo_descr); \
__macro(rocsparse_create_csr_descr); \
__macro(rocsparse_destroy_spmat_descr); \
__macro(rocsparse_create_dnmat_descr); \
__macro(rocsparse_destroy_dnmat_descr); \
__macro(rocsparse_spmm);
ROCSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP)
#endif
#if HIP_VERSION >= 403
#define ROCSPARSE_ROUTINE_EACH_R3(__macro) \
__macro(rocsparse_sddmm_buffer_size); \
__macro(rocsparse_sddmm_preprocess); \
__macro(rocsparse_sddmm);
ROCSPARSE_ROUTINE_EACH_R3(PLATFORM_DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP)
#endif
#endif // PADDLE_WITH_HIP
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP
} // namespace dynload
} // namespace platform
} // namespace paddle
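Editor's illustration (not part of the commit): the aliases above make the fluid-layer dynload wrappers interchangeable with the phi ones, so a rocsparse routine can be called as if it were linked directly; librocsparse.so is opened lazily on first use. A minimal sketch, assuming a HIP build and a caller-provided stream (the helper name SketchRocsparseHandle is hypothetical):
#include <hip/hip_runtime.h>
#include "paddle/fluid/platform/dynload/rocsparse.h"
void SketchRocsparseHandle(hipStream_t stream) {
  rocsparse_handle handle = nullptr;
  // The first wrapper call resolves librocsparse.so and the symbol, then forwards.
  paddle::platform::dynload::rocsparse_create_handle(&handle);
  paddle::platform::dynload::rocsparse_set_stream(handle, stream);
  // ... issue rocsparse calls on 'handle' ...
  paddle::platform::dynload::rocsparse_destroy_handle(handle);
}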
......@@ -20,7 +20,14 @@ if(NOT WITH_NV_JETSON)
endif()
if(WITH_ROCM)
list(APPEND HIP_SRCS rocblas.cc miopen.cc hiprand.cc hipfft.cc)
list(
APPEND
HIP_SRCS
rocblas.cc
miopen.cc
hiprand.cc
hipfft.cc
rocsparse.cc)
endif()
# There is no macOS version of NCCL.
......
......@@ -427,6 +427,8 @@ void* GetCusparseDsoHandle() {
#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA)
return GetDsoHandleFromSearchPath(
FLAGS_cuda_dir, win_cusparse_lib, true, {cuda_lib_path});
#elif defined(PADDLE_WITH_HIP)
return GetDsoHandleFromSearchPath(FLAGS_rocm_dir, "librocsparse.so");
#else
return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusparse.so");
#endif
......
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/backends/dynload/rocsparse.h"
namespace phi {
namespace dynload {
std::once_flag rocsparse_dso_flag;
void *rocsparse_dso_handle = nullptr;
#define DEFINE_WRAP(__name) DynLoad__##__name __name
#ifdef ROCSPARSE_ROUTINE_EACH
ROCSPARSE_ROUTINE_EACH(DEFINE_WRAP)
#endif
#ifdef ROCSPARSE_ROUTINE_EACH_R2
ROCSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
#ifdef ROCSPARSE_ROUTINE_EACH_R3
ROCSPARSE_ROUTINE_EACH_R3(DEFINE_WRAP);
#endif
} // namespace dynload
} // namespace phi
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <hip/hip_runtime.h>
#include <rocsparse.h>
#include <mutex> // NOLINT
#include <type_traits>
#include "paddle/phi/backends/dynload/dynamic_loader.h"
#include "paddle/phi/backends/dynload/port.h"
namespace phi {
namespace dynload {
extern std::once_flag rocsparse_dso_flag;
extern void *rocsparse_dso_handle;
/**
 * The following macro definition generates a struct for each listed
 * function, which dynamically loads the corresponding rocsparse routine
 * via operator overloading.
 *
 * note: resolved from the default dynamically linked library
*/
#define DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP(__name) \
struct DynLoad__##__name { \
template <typename... Args> \
rocsparse_status operator()(Args... args) { \
using rocsparse_func = decltype(&::__name); \
std::call_once(rocsparse_dso_flag, []() { \
rocsparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(rocsparse_dso_handle, #__name); \
return reinterpret_cast<rocsparse_func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_HIP)
#define ROCSPARSE_ROUTINE_EACH(__macro) \
__macro(rocsparse_create_handle); \
__macro(rocsparse_destroy_handle); \
__macro(rocsparse_set_stream); \
__macro(rocsparse_csr2coo);
ROCSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP)
#if HIP_VERSION >= 402
#define ROCSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(rocsparse_create_coo_descr); \
__macro(rocsparse_create_csr_descr); \
__macro(rocsparse_destroy_spmat_descr); \
__macro(rocsparse_create_dnmat_descr); \
__macro(rocsparse_destroy_dnmat_descr); \
__macro(rocsparse_spmm);
ROCSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP)
#endif
#if HIP_VERSION >= 403
#define ROCSPARSE_ROUTINE_EACH_R3(__macro) \
__macro(rocsparse_sddmm_buffer_size); \
__macro(rocsparse_sddmm_preprocess); \
__macro(rocsparse_sddmm);
ROCSPARSE_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP)
#endif
#endif // PADDLE_WITH_HIP
#undef DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP
} // namespace dynload
} // namespace phi
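Editor's illustration (not part of the commit): expanding DECLARE_DYNAMIC_LOAD_ROCSPARSE_WRAP(rocsparse_csr2coo) yields roughly the struct below. Each wrapped routine opens librocsparse.so once, via the shared rocsparse_dso_flag/handle declared above, caches the resolved symbol, and forwards all arguments:
struct DynLoad__rocsparse_csr2coo {
  template <typename... Args>
  rocsparse_status operator()(Args... args) {
    using rocsparse_func = decltype(&::rocsparse_csr2coo);
    // Open the library exactly once across all wrapped routines.
    std::call_once(rocsparse_dso_flag, []() {
      rocsparse_dso_handle = phi::dynload::GetCusparseDsoHandle();
    });
    // Resolve the symbol once per routine, then reuse the cached pointer.
    static void *p_rocsparse_csr2coo =
        dlsym(rocsparse_dso_handle, "rocsparse_csr2coo");
    return reinterpret_cast<rocsparse_func>(p_rocsparse_csr2coo)(args...);
  }
};
extern DynLoad__rocsparse_csr2coo rocsparse_csr2coo;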
......@@ -33,6 +33,10 @@
#endif // !defined(__APPLE__) && defined(PADDLE_WITH_NCCL)
#endif // PADDLE_WITH_CUDA
#ifdef PADDLE_WITH_HIP
#include "paddle/phi/backends/dynload/rocsparse.h"
#endif
#include "glog/logging.h"
#include "unsupported/Eigen/CXX11/Tensor"
......@@ -295,6 +299,9 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream));
#endif
#elif defined(PADDLE_WITH_HIP)
phi::dynload::rocsparse_create_handle(handle);
phi::dynload::rocsparse_set_stream(*handle, stream);
#endif
}
......@@ -306,6 +313,11 @@ void DestroySparseHandle(sparseHandle_t handle) {
handle = nullptr;
}
#endif
#elif defined(PADDLE_WITH_HIP)
if (handle != nullptr) {
phi::dynload::rocsparse_destroy_handle(handle);
handle = nullptr;
}
#endif
}
......
......@@ -97,3 +97,6 @@ inline SparseBlasT<DeviceContext, T> GetSparseBlas(
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h"
#endif
#if defined(PADDLE_WITH_HIP) && HIP_VERSION >= 402
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.hip.h"
#endif
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/dynload/rocsparse.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/memory_utils.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/visit_type.h"
namespace phi {
namespace funcs {
namespace sparse {
template <typename IntT>
rocsparse_indextype GetGpuIndexType() {
if (std::is_same<IntT, int32_t>::value) {
return rocsparse_indextype_i32;
} else if (std::is_same<IntT, int64_t>::value) {
return rocsparse_indextype_i64;
}
}
template <typename T>
rocsparse_datatype GetGpuDataType() {
if (std::is_same<T, float>::value) {
return rocsparse_datatype_f32_r;
} else if (std::is_same<T, double>::value) {
return rocsparse_datatype_f64_r;
}
}
inline rocsparse_operation GetTransposeOperation(const bool trans) {
if (trans) {
return rocsparse_operation_transpose;
} else {
return rocsparse_operation_none;
}
}
template <typename TensorType>
inline rocsparse_spmm_alg GetSpMMAlgorithm(const TensorType& x) {
return rocsparse_spmm_alg_default;
}
/************* SPARSE MATRIX DESCRIPTOR (COO/CSR) ************/
template <typename T, typename IntT>
inline void CreateCsrDescriptor(const phi::SparseCsrTensor& x,
const phi::GPUContext& dev_ctx,
rocsparse_spmat_descr* descriptor) {
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
auto x_ndims = xdim_vec.size();
PADDLE_ENFORCE_GE(
x_ndims,
2,
phi::errors::InvalidArgument("the dim size of SparseCsrTensor must be "
"greater than or eaqual to 2."));
int64_t M = xdim_vec[x_ndims - 2];
int64_t N = xdim_vec[x_ndims - 1];
int batch_size = 1;
for (int i = 0; i < x_ndims - 2; i++) {
batch_size *= xdim_vec[i];
}
PADDLE_ENFORCE_EQ(x.non_zero_crows().numel(),
batch_size * (M + 1),
phi::errors::PreconditionNotMet(
"the length of SparseCsrTensor crows is not right."));
const IntT* crows_data = x.non_zero_crows().data<IntT>();
const IntT* cols_data = x.non_zero_cols().data<IntT>();
const T* values_data = x.non_zero_elements().data<T>();
int64_t batch_nnz = x.nnz() / batch_size;
rocsparse_indextype itype = GetGpuIndexType<int64_t>();
rocsparse_indextype jtype = GetGpuIndexType<int64_t>();
rocsparse_datatype ttype = GetGpuDataType<T>();
dev_ctx.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_create_csr_descr(descriptor,
M,
N,
batch_nnz,
const_cast<IntT*>(crows_data),
const_cast<IntT*>(cols_data),
const_cast<T*>(values_data),
itype,
jtype,
rocsparse_index_base_zero,
ttype);
});
if (batch_size > 1) {
// TODO(umiswing): Add batch sparse matmul support for ROCM after 5.2.0
PADDLE_THROW(phi::errors::Unimplemented(
"Batch Sparse matmul use 'rocsparse_coo_set_strided_batch', which is "
"supported from ROCM 5.2.0"));
}
}
template <typename T, typename IntT>
inline void CreateCooDescriptor(const phi::SparseCooTensor& x,
const phi::GPUContext& dev_ctx,
rocsparse_spmat_descr* descriptor) {
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
auto x_ndims = xdim_vec.size();
PADDLE_ENFORCE_GE(
x_ndims,
2,
phi::errors::InvalidArgument("the dim size of SparseCooTensor must be "
"greater than or eaqual to 2."));
int64_t M = xdim_vec[x_ndims - 2];
int64_t N = xdim_vec[x_ndims - 1];
int batch_size = 1;
for (int i = 0; i < x_ndims - 2; i++) {
batch_size *= xdim_vec[i];
}
int64_t nnz = x.nnz();
const IntT* indices_data = x.non_zero_indices().data<IntT>();
const T* values_data = x.non_zero_elements().data<T>();
auto rows_data = indices_data + (x_ndims - 2) * nnz;
auto cols_data = indices_data + (x_ndims - 1) * nnz;
int64_t batch_nnz = nnz / batch_size;
rocsparse_indextype itype = GetGpuIndexType<int64_t>();
rocsparse_datatype ttype = GetGpuDataType<T>();
dev_ctx.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_create_coo_descr(descriptor,
M,
N,
batch_nnz,
const_cast<IntT*>(rows_data),
const_cast<IntT*>(cols_data),
const_cast<T*>(values_data),
itype,
rocsparse_index_base_zero,
ttype);
});
if (batch_size > 1) {
// TODO(umiswing): Add batch sparse matmul support for ROCM after 5.2.0
PADDLE_THROW(phi::errors::Unimplemented(
"Batch Sparse matmul use 'rocsparse_coo_set_strided_batch', which is "
"supported from ROCM 5.2.0"));
}
}
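// Editor's illustration (not part of this commit): the COO indices tensor has
// shape [sparse_dim, nnz] and is stored row-major, which is what the pointer
// arithmetic above relies on. For a hypothetical 2-D tensor (x_ndims == 2)
// with nnz == 3 and non-zeros at (0,1), (1,0), (2,3):
//   indices        = [[0, 1, 2],   // row of each non-zero
//                     [1, 0, 3]]   // col of each non-zero
//   indices_data   -> {0, 1, 2, 1, 0, 3}
//   rows_data = indices_data + (x_ndims - 2) * nnz  -> {0, 1, 2}
//   cols_data = indices_data + (x_ndims - 1) * nnz  -> {1, 0, 3}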
template <typename T>
class RocSparseSpMatDescriptor {
public:
explicit RocSparseSpMatDescriptor(const phi::SparseCsrTensor& x,
const phi::GPUContext& dev_ctx)
: dev_ctx_(dev_ctx) {
PD_VISIT_BASE_INTEGRAL_TYPES(
x.non_zero_crows().dtype(), "Csr RocSparseSpMatDescriptor", ([&] {
CreateCsrDescriptor<T, data_t>(x, dev_ctx_, &descriptor_);
}));
VLOG(6) << "Create csr rocsparse_spmat_descr " << &descriptor_;
}
explicit RocSparseSpMatDescriptor(const phi::SparseCooTensor& x,
const phi::GPUContext& dev_ctx)
: dev_ctx_(dev_ctx) {
PD_VISIT_BASE_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Coo RocSparseSpMatDescriptor", ([&] {
CreateCooDescriptor<T, data_t>(x, dev_ctx_, &descriptor_);
}));
VLOG(6) << "Create coo rocsparse_spmat_descr " << &descriptor_;
}
~RocSparseSpMatDescriptor() {
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_destroy_spmat_descr(descriptor_);
});
VLOG(6) << "Destroy roscparse_spmat_descr " << &descriptor_;
}
const rocsparse_spmat_descr& descriptor() const { return descriptor_; }
private:
const phi::GPUContext& dev_ctx_;
rocsparse_spmat_descr descriptor_;
};
/************* DENSE MATRIX DESCRIPTOR ************/
template <typename T>
class RocSparseDnMatDescriptor {
public:
explicit RocSparseDnMatDescriptor(const phi::DenseTensor& x,
const phi::GPUContext& dev_ctx)
: dev_ctx_(dev_ctx) {
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
auto x_ndims = xdim_vec.size();
PADDLE_ENFORCE_GE(
x_ndims,
2,
phi::errors::InvalidArgument("the dim size of DenseTensor must be "
"greater than or eaqual to 2."));
int64_t M = xdim_vec[x_ndims - 2];
int64_t N = xdim_vec[x_ndims - 1];
int batch_size = 1;
for (int i = 0; i < x_ndims - 2; i++) {
batch_size *= xdim_vec[i];
}
const T* x_data = x.data<T>();
rocsparse_datatype ttype = GetGpuDataType<T>();
dev_ctx.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_create_dnmat_descr(&descriptor_,
M,
N,
N,
const_cast<T*>(x_data),
ttype,
rocsparse_order_row);
});
PADDLE_ENFORCE_EQ(
x.numel(),
batch_size * M * N,
phi::errors::InvalidArgument("The number of elements in DenseTensor "
"must equals to batch_size * M * N."));
if (batch_size > 1) {
// TODO(umiswing): Add batch sparse matmul support for ROCM after 5.2.0
PADDLE_THROW(phi::errors::Unimplemented(
"Batch Sparse matmul use 'rocsparse_dnmat_set_strided_batch', which "
"is supported from ROCM 5.2.0"));
}
VLOG(6) << "Create cusparseDnMatDescr_t " << &descriptor_;
}
~RocSparseDnMatDescriptor() {
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_destroy_dnmat_descr(descriptor_);
});
VLOG(6) << "Destroy rocsparse_dnmat_descr " << &descriptor_;
}
const rocsparse_dnmat_descr& descriptor() const { return descriptor_; }
private:
const phi::GPUContext& dev_ctx_;
rocsparse_dnmat_descr descriptor_;
};
/************* SPARSE*DENSE->DENSE MATMUL ************/
template <>
template <typename T, typename TensorType>
void SparseBlas<phi::GPUContext>::SPMM(bool transa,
bool transb,
T alpha,
const TensorType& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_out) const {
auto a_descriptor = RocSparseSpMatDescriptor<T>(mat_a, dev_ctx_);
auto b_descriptor = RocSparseDnMatDescriptor<T>(mat_b, dev_ctx_);
auto out_descriptor = RocSparseDnMatDescriptor<T>(*mat_out, dev_ctx_);
rocsparse_datatype ttype = GetGpuDataType<T>();
size_t buffer_size = 0;
// Query SpMM buffer
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_spmm(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
ttype,
GetSpMMAlgorithm(mat_a),
rocsparse_spmm_stage_buffer_size,
&buffer_size,
nullptr);
});
// Allocate buffer
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
void* tmp_buffer_ptr = tmp_buffer->ptr();
// Preprocess data
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_spmm(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
ttype,
GetSpMMAlgorithm(mat_a),
rocsparse_spmm_stage_preprocess,
&buffer_size,
tmp_buffer_ptr);
});
// Performs the actual SpMM computation
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_spmm(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
ttype,
GetSpMMAlgorithm(mat_a),
rocsparse_spmm_stage_compute,
&buffer_size,
tmp_buffer_ptr);
});
}
/************* DENSE*DENSE->SPARSE MATMUL ************/
#if HIP_VERSION >= 403
template <>
template <typename T, typename TensorType>
void SparseBlas<phi::GPUContext>::SDDMM(bool transa,
bool transb,
T alpha,
const phi::DenseTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
TensorType* mat_out) const {
auto a_descriptor = RocSparseDnMatDescriptor<T>(mat_a, dev_ctx_);
auto b_descriptor = RocSparseDnMatDescriptor<T>(mat_b, dev_ctx_);
auto out_descriptor = RocSparseSpMatDescriptor<T>(*mat_out, dev_ctx_);
rocsparse_datatype gpu_type = GetGpuDataType<T>();
size_t buffer_size = 0;
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_sddmm_buffer_size(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
rocsparse_sddmm_alg_default,
&buffer_size);
});
phi::Allocator::AllocationPtr tmp_buffer = phi::memory_utils::Alloc(
dev_ctx_.GetPlace(),
buffer_size,
phi::Stream(reinterpret_cast<phi::StreamId>(dev_ctx_.stream())));
void* tmp_buffer_ptr = tmp_buffer->ptr();
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_sddmm_preprocess(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
rocsparse_sddmm_alg_default,
tmp_buffer_ptr);
});
dev_ctx_.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_sddmm(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
out_descriptor.descriptor(),
gpu_type,
rocsparse_sddmm_alg_default,
tmp_buffer_ptr);
});
}
#endif
} // namespace sparse
} // namespace funcs
} // namespace phi
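Editor's sketch (not part of the commit): the matmul kernels further down in this diff drive the SPMM and SDDMM entry points above through GetSparseBlas. The helper names SpmmSketch and SddmmSketch are hypothetical, and the snippet assumes the same headers the matmul kernels include (paddle/phi/kernels/funcs/sparse/sparse_blas.h and the tensor headers); the argument order mirrors the kernel code in this commit, which also zeroes the dense output with phi::funcs::SetConstant before SPMM on HIP. SDDMM additionally requires ROCM >= 4.3 per the guard above.
// out{Dense} = x{SparseCsr} * y{Dense}
template <typename T>
void SpmmSketch(const phi::GPUContext& dev_ctx,
                const phi::SparseCsrTensor& x,
                const phi::DenseTensor& y,
                phi::DenseTensor* out) {
  auto sparse_blas =
      phi::funcs::sparse::GetSparseBlas<phi::GPUContext, T>(dev_ctx);
  sparse_blas.SPMM(
      false, false, static_cast<T>(1), x, y, static_cast<T>(0), out);
}
// dx_csr{SparseCsr} = dout{Dense} * y'{Dense}, computed only on dx_csr's
// sparsity pattern.
template <typename T>
void SddmmSketch(const phi::GPUContext& dev_ctx,
                 const phi::DenseTensor& dout,
                 const phi::DenseTensor& y,
                 phi::SparseCsrTensor* dx_csr) {
  auto sparse_blas =
      phi::funcs::sparse::GetSparseBlas<phi::GPUContext, T>(dev_ctx);
  sparse_blas.SDDMM(
      false, true, static_cast<T>(1), dout, y, static_cast<T>(0), dx_csr);
}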
......@@ -20,6 +20,7 @@ limitations under the License. */
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
......@@ -35,7 +36,7 @@ void MatmulCooDenseGradKernel(const Context& dev_ctx,
const DenseTensor& dout,
SparseCooTensor* dx,
DenseTensor* dy) {
#if CUDA_VERSION >= 11030
#if CUDA_VERSION >= 11030 || HIP_VERSION >= 403
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
// dx{SparseCoo} = dout{Dense} * y'{Dense}
......@@ -44,8 +45,13 @@ void MatmulCooDenseGradKernel(const Context& dev_ctx,
// which will increase some expenses.
EmptyLikeCooKernel<T, Context>(dev_ctx, x, dx);
SparseCsrTensor dx_csr = CooToCsr<T, Context>(dev_ctx, *dx);
#ifdef PADDLE_WITH_HIP
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, dx_csr.mutable_non_zero_elements(), static_cast<T>(0.0f));
#endif
sparse_blas.SDDMM(
false, true, static_cast<T>(1), dout, y, static_cast<T>(0), &dx_csr);
CsrToCooKernel<T, Context>(dev_ctx, dx_csr, dx);
}
......@@ -56,13 +62,29 @@ void MatmulCooDenseGradKernel(const Context& dev_ctx,
meta_dy.set_dtype(y.dtype());
dev_ctx.template Alloc<T>(dy);
#ifdef PADDLE_WITH_HIP
SparseCsrTensor x_csr = CooToCsr<T, Context>(dev_ctx, x);
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, dy, static_cast<T>(0.0f));
sparse_blas.SPMM(
true, false, static_cast<T>(1), x_csr, dout, static_cast<T>(0), dy);
#elif defined(PADDLE_WITH_CUDA)
sparse_blas.SPMM(
true, false, static_cast<T>(1), x, dout, static_cast<T>(0), dy);
#endif
}
#else
#ifdef PADDLE_WITH_CUDA
PADDLE_THROW(phi::errors::Unimplemented(
"backward of 'sparse.matmul' use cusparseSDDMM, which is supported from "
"CUDA 11.3"));
#elif defined(PADDLE_WITH_HIP)
PADDLE_THROW(
phi::errors::Unimplemented("backward of 'sparse.matmul' use "
"rocsparse_sddmm with transpose, which is "
"supported from "
"ROCM 4.3.0"));
#endif
#endif
}
......@@ -73,7 +95,7 @@ void MatmulCsrDenseGradKernel(const Context& dev_ctx,
const DenseTensor& dout,
SparseCsrTensor* dx,
DenseTensor* dy) {
#if CUDA_VERSION >= 11030
#if CUDA_VERSION >= 11030 || HIP_VERSION >= 403
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
// dx{SparseCsr} = dout{Dense} * y'{Dense}
......@@ -94,13 +116,26 @@ void MatmulCsrDenseGradKernel(const Context& dev_ctx,
dev_ctx.template Alloc<T>(dy);
#ifdef PADDLE_WITH_HIP
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, dy, static_cast<T>(0.0f));
#endif
sparse_blas.SPMM(
true, false, static_cast<T>(1), x, dout, static_cast<T>(0), dy);
}
#else
#ifdef PADDLE_WITH_CUDA
PADDLE_THROW(phi::errors::Unimplemented(
"backward of 'sparse.matmul' use cusparseSDDMM, which is supported from "
"CUDA 11.3"));
#elif defined(PADDLE_WITH_HIP)
PADDLE_THROW(
phi::errors::Unimplemented("backward of 'sparse.matmul' use "
"rocsparse_sddmm with transpose, which is "
"supported from "
"ROCM 4.3.0"));
#endif
#endif
}
......
......@@ -25,6 +25,7 @@ limitations under the License. */
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function_impl.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
......@@ -36,7 +37,7 @@ void MatmulKernelImpl(const Context& dev_ctx,
const TensorType& x,
const DenseTensor& y,
DenseTensor* out) {
#if CUDA_VERSION >= 11000
#if CUDA_VERSION >= 11000 || HIP_VERSION >= 402
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
std::vector<int64_t> ydim_vec = phi::vectorize(y.dims());
auto x_ndims = xdim_vec.size();
......@@ -80,13 +81,24 @@ void MatmulKernelImpl(const Context& dev_ctx,
dev_ctx.template Alloc<T>(out);
#ifdef PADDLE_WITH_HIP
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, out, static_cast<T>(0.0f));
#endif
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
sparse_blas.SPMM(
false, false, static_cast<T>(1), x, y, static_cast<T>(0), out);
#else
#ifdef PADDLE_WITH_CUDA
PADDLE_THROW(
phi::errors::Unimplemented("forward of 'sparse.matmul' use cusparseSpMM, "
"which is supported from CUDA 11.0"));
#elif defined(PADDLE_WITH_HIP)
PADDLE_THROW(phi::errors::Unimplemented(
"forward of 'sparse.matmul' use rocsparse_spmm, "
"which is supported from ROCM 4.2.0"));
#endif
#endif
}
......
......@@ -17,12 +17,16 @@ limitations under the License. */
#include <thrust/execution_policy.h>
#include <thrust/remove.h>
#ifdef PADDLE_WITH_HIP
#include "paddle/phi/backends/dynload/rocsparse.h"
#endif
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/core/visit_type.h"
#include "paddle/phi/kernels/cast_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/sparse/common_shape.h"
......@@ -214,55 +218,88 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx,
SparseCooTensor* out) {
const DDim& x_dims = x.dims();
const int64_t non_zero_num = x.cols().numel();
// rocsparse_csr2coo currently only supports indices of type 'rocsparse_int'
// (aka 'int').
#ifdef PADDLE_WITH_HIP
const auto& csr_crows = Cast<IntT>(dev_ctx, x.crows(), DataType::INT32);
const auto& csr_cols = Cast<IntT>(dev_ctx, x.cols(), DataType::INT32);
const int* csr_crows_data = csr_crows.template data<int>();
const int* csr_cols_data = csr_cols.template data<int>();
#else
const auto& csr_crows = x.crows();
const auto& csr_cols = x.cols();
const auto& csr_values = x.values();
const IntT* csr_crows_data = csr_crows.data<IntT>();
const IntT* csr_cols_data = csr_cols.data<IntT>();
#endif
const auto& csr_values = x.values();
const T* csr_values_data = csr_values.data<T>();
int64_t sparse_dim = 2;
if (x_dims.size() == 3) {
sparse_dim = 3;
}
int batchs = x_dims.size() == 2 ? 1 : x_dims[0];
int batches = x_dims.size() == 2 ? 1 : x_dims[0];
int rows = x_dims.size() == 2 ? x_dims[0] : x_dims[1];
#ifdef PADDLE_WITH_HIP
DenseTensor indices = phi::Empty<int>(dev_ctx, {sparse_dim, non_zero_num});
int* coo_indices = indices.data<int>();
int* coo_rows_data = coo_indices;
int* coo_cols_data = coo_rows_data + non_zero_num;
#else
DenseTensor indices = phi::Empty<IntT>(dev_ctx, {sparse_dim, non_zero_num});
DenseTensor values = phi::EmptyLike<T, GPUContext>(dev_ctx, csr_values);
DenseTensor offsets = phi::Empty<IntT>(dev_ctx, {batchs});
DenseTensor offsets = phi::Empty<IntT>(dev_ctx, {batches});
IntT* coo_indices = indices.data<IntT>();
IntT* batch_ptr = x_dims.size() == 2 ? nullptr : coo_indices;
IntT* coo_rows_data =
x_dims.size() == 2 ? coo_indices : batch_ptr + non_zero_num;
IntT* coo_cols_data = coo_rows_data + non_zero_num;
IntT* offsets_ptr = batchs == 1 ? nullptr : offsets.data<IntT>();
IntT* offsets_ptr = batches == 1 ? nullptr : offsets.data<IntT>();
#endif
DenseTensor values = phi::EmptyLike<T, GPUContext>(dev_ctx, csr_values);
T* coo_values_data = values.data<T>();
if (batchs > 1) {
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batchs, 1);
GetBatchSizes<IntT><<<config.block_per_grid.x, config.thread_per_block.x>>>(
csr_crows_data, rows, batchs, offsets_ptr);
if (batches > 1) {
#ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
PADDLE_THROW(
phi::errors::Unimplemented("'rocsparse_csr2coo' only supports batches "
"with a value of 1 currently."));
#else
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, batches, 1);
GetBatchSizes<IntT><<<config.block_per_grid.x, config.thread_per_block.x>>>(
csr_crows_data, rows, batches, offsets_ptr);
thrust::exclusive_scan(thrust::cuda::par.on(dev_ctx.stream()),
#endif
offsets_ptr,
offsets_ptr + batchs,
offsets_ptr + batches,
offsets_ptr);
#endif
}
#ifdef PADDLE_WITH_HIP
dev_ctx.CusparseCall([&](rocsparse_handle handle) {
phi::dynload::rocsparse_csr2coo(handle,
csr_crows_data,
non_zero_num,
rows,
coo_rows_data,
rocsparse_index_base_zero);
});
#else
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, rows, 1);
config.block_per_grid.y = batchs;
config.block_per_grid.y = batches;
ConvertCsrCrowsToCooRows<IntT>
<<<config.block_per_grid, config.thread_per_block.x>>>(
csr_crows_data, offsets_ptr, coo_rows_data, batch_ptr, rows);
#endif
phi::backends::gpu::GpuMemcpyAsync(coo_cols_data,
csr_cols_data,
#ifdef PADDLE_WITH_HIP
sizeof(int) * non_zero_num,
#else
sizeof(IntT) * non_zero_num,
#endif
gpuMemcpyDeviceToDevice,
dev_ctx.stream());
phi::backends::gpu::GpuMemcpyAsync(coo_values_data,
......@@ -271,6 +308,11 @@ void CsrToCooGPUKernel(const GPUContext& dev_ctx,
gpuMemcpyDeviceToDevice,
dev_ctx.stream());
#ifdef PADDLE_WITH_HIP
if (std::is_same<IntT, int64_t>::value)
indices = Cast<int>(dev_ctx, indices, DataType::INT64);
#endif
out->SetMember(indices, values, x_dims, true);
}
......