Unverified commit 346efe96 authored by zhouweiwei2014, committed by GitHub

add some kernels (csr*dense->dense, dense*dense->csr) of SparseTensor matmul (#42935)

* add some kernels (csr*dense->dense, dense*dense->csr) of SparseTensor matmul

* fix CI

* fix CI

* fix comment

* fix comment
Parent 19eb0eb8
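For reference, a minimal usage sketch of the Python APIs added by this PR (paddle.incubate.sparse.matmul and paddle.incubate.sparse.masked_matmul), assuming a Paddle build with CUDA >= 11.0 (masked_matmul additionally needs CUDA >= 11.3) and the eager-mode guard used in the new unit tests; shapes and values are illustrative only:

import paddle
from paddle.fluid.framework import _test_eager_guard

with _test_eager_guard():
    # Build a 3x4 CSR tensor from compressed rows, columns and values.
    crows = [0, 2, 3, 5]
    cols = [1, 3, 2, 0, 1]
    values = [1., 2., 3., 4., 5.]
    csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, [3, 4])

    # CSR @ DENSE -> DENSE
    dense = paddle.randn([4, 3])
    out = paddle.incubate.sparse.matmul(csr, dense)

    # DENSE @ DENSE * CSR_MASK -> CSR, reusing `csr` as the sparsity mask
    x = paddle.rand([3, 5])
    y = paddle.rand([5, 4])
    masked_out = paddle.incubate.sparse.masked_matmul(x, y, csr)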
......@@ -27,6 +27,7 @@
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
/**
* Implementation of GradNodeBase, Edge and GradTensorHolder.
......@@ -114,6 +115,10 @@ void GradNodeBase::SetGradInMeta(const paddle::experimental::Tensor& fwd_out,
phi::SparseCooTensor* coo_tensor =
static_cast<phi::SparseCooTensor*>(fwd_out.impl().get());
dense_tensor = coo_tensor->mutable_non_zero_elements();
} else if (phi::SparseCsrTensor::classof(fwd_out.impl().get())) {
phi::SparseCsrTensor* csr_tensor =
static_cast<phi::SparseCsrTensor*>(fwd_out.impl().get());
dense_tensor = csr_tensor->mutable_non_zero_elements();
} else {
VLOG(6) << "Unable to initialize the DenseTensorMeta of GradSlotMeta with "
"non-DenseTensor argument.";
......
......@@ -66,8 +66,17 @@ void GradTensorHolder::CopyValueFromTensor(
// Create new tensor->impl and fill it with 1.0
if (t.defined()) {
// Fill with 1.0; use full to support complex, since ones_like doesn't support it.
if (t.is_dense_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::full(t.shape(), 1, t.dtype(), t.place());
} else if (t.is_sparse_csr_tensor() || t.is_sparse_coo_tensor()) {
buffer_[slot_id][rank] =
paddle::experimental::sparse::full_like(t, 1, t.dtype());
} else {
PADDLE_THROW(paddle::platform::errors::Fatal(
"Only Support DENSE_TENSOR, SPARSE_COO_TENSOR, SPARSE_CSR_TENSOR "
"now."));
}
egr::EagerUtils::autograd_meta(&(buffer_[slot_id][rank]))
->SetStopGradient(false);
}
......
......@@ -31,7 +31,7 @@ class CusparseHandleHolder {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(&handle_));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(handle_, stream));
#endif
......@@ -41,7 +41,7 @@ class CusparseHandleHolder {
~CusparseHandleHolder() PADDLE_MAY_THROW {
#if defined(PADDLE_WITH_CUDA)
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle_));
#endif
#endif
......
......@@ -24,10 +24,6 @@ namespace dynload {
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
......
......@@ -29,8 +29,8 @@ namespace dynload {
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
......@@ -39,13 +39,7 @@ namespace dynload {
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
......@@ -59,11 +53,13 @@ CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);
CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
......@@ -72,8 +68,7 @@ CUSPARSE_ROUTINE_EACH_11020(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_R2(PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
#undef PLATFORM_DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
......
......@@ -26,10 +26,6 @@ void *cusparse_dso_handle;
CUSPARSE_ROUTINE_EACH(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_11020
CUSPARSE_ROUTINE_EACH_11020(DEFINE_WRAP);
#endif
#ifdef CUSPARSE_ROUTINE_EACH_R2
CUSPARSE_ROUTINE_EACH_R2(DEFINE_WRAP);
#endif
......
......@@ -30,19 +30,19 @@ extern void *cusparse_dso_handle;
struct DynLoad__##__name { \
template <typename... Args> \
cusparseStatus_t operator()(Args... args) { \
using cusparseFunc = decltype(&::__name); \
using Func = decltype(&::__name); \
std::call_once(cusparse_dso_flag, []() { \
cusparse_dso_handle = phi::dynload::GetCusparseDsoHandle(); \
}); \
static void *p_##__name = dlsym(cusparse_dso_handle, #__name); \
return reinterpret_cast<cusparseFunc>(p_##__name)(args...); \
return reinterpret_cast<Func>(p_##__name)(args...); \
} \
}; \
extern DynLoad__##__name __name
#if defined(PADDLE_WITH_CUDA)
// The generic APIs is supported from CUDA10.1
#if CUDA_VERSION >= 10010
// APIs available after CUDA 11.0
#if CUDA_VERSION >= 11000
#define CUSPARSE_ROUTINE_EACH(__macro) \
__macro(cusparseCreate); \
__macro(cusparseSetStream); \
......@@ -51,13 +51,7 @@ extern void *cusparse_dso_handle;
__macro(cusparseSnnz); \
__macro(cusparseDnnz); \
__macro(cusparseSetMatType); \
__macro(cusparseSetMatIndexBase);
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
// APIs available after CUDA 11.2
#if CUDA_VERSION >= 11020
#define CUSPARSE_ROUTINE_EACH_11020(__macro) \
__macro(cusparseSetMatIndexBase); \
__macro(cusparseCreateCsr); \
__macro(cusparseCreateCoo); \
__macro(cusparseCreateDnMat); \
......@@ -71,11 +65,13 @@ CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP);
__macro(cusparseDenseToSparse_analysis); \
__macro(cusparseDenseToSparse_convert); \
__macro(cusparseSparseToDense_bufferSize); \
__macro(cusparseSparseToDense);
__macro(cusparseSparseToDense); \
__macro(cusparseDnMatSetStridedBatch); \
__macro(cusparseCsrSetStridedBatch);
CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
// APIs available after CUDA 11.3
#if CUDA_VERSION >= 11030
#define CUSPARSE_ROUTINE_EACH_R2(__macro) \
__macro(cusparseSDDMM_bufferSize); \
......@@ -84,8 +80,7 @@ CUSPARSE_ROUTINE_EACH_11020(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
CUSPARSE_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP)
#endif
#endif
#endif
#endif
#undef DECLARE_DYNAMIC_LOAD_CUSPARSE_WRAP
......
......@@ -402,7 +402,10 @@ struct GPUContext::Impl {
void SetSolverHandle(solverHandle_t handle) { solver_handle_ = handle; }
sparseHandle_t GetSparseHandle() const {
sparseHandle_t GetSparseHandle() {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) phi::InitSparseHandle(&sparse_handle_, stream_);
});
PD_CHECK(sparse_handle_ != nullptr, "the gpu sparse handle is nullptr.");
return sparse_handle_;
}
......@@ -519,7 +522,12 @@ struct GPUContext::Impl {
}
inline void CusparseCall(
const std::function<void(sparseHandle_t)>& callback) const {
const std::function<void(sparseHandle_t)>& callback) {
std::call_once(flag_sparse_, [=]() {
if (!sparse_handle_) {
phi::InitSparseHandle(&sparse_handle_, stream_);
}
});
std::lock_guard<std::mutex> guard(sparse_mtx_);
callback(sparse_handle_);
}
......@@ -598,6 +606,7 @@ struct GPUContext::Impl {
sparseHandle_t sparse_handle_{nullptr};
DnnWorkspaceHandle* workspace_{nullptr};
std::once_flag flag_sparse_;
std::once_flag flag_blas_;
std::once_flag flag_blaslt_;
std::once_flag flag_dnn_;
......
......@@ -250,7 +250,7 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) {
// ROCM is not yet supported
#if defined(PADDLE_WITH_CUDA)
// The generic APIs are supported from CUDA 10.1
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseCreate(handle));
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseSetStream(*handle, stream));
#endif
......@@ -259,7 +259,7 @@ void InitSparseHandle(sparseHandle_t* handle, gpuStream_t stream) {
void DestroySparseHandle(sparseHandle_t handle) {
#ifdef PADDLE_WITH_CUDA
#if CUDA_VERSION >= 10010
#if CUDA_VERSION >= 11000
if (handle != nullptr) {
PADDLE_RETRY_CUDA_SUCCESS(dynload::cusparseDestroy(handle));
handle = nullptr;
......
......@@ -85,6 +85,10 @@ class SparseCsrTensor : public TensorBase,
/// \return The non-zero elements in the original dense tensor.
const DenseTensor& non_zero_elements() const { return non_zero_elements_; }
/// \brief Returns the total number of non-zero elements in the original
/// dense tensor.
int64_t nnz() const { return non_zero_elements_.numel(); }
/// \brief Return the number of elements contained in original dense tensor
/// \return The number of elements contained in original dense tensor
int64_t numel() const override { return product(dims_); }
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/device_context.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
namespace funcs {
namespace sparse {
template <typename DeviceContext>
class SparseBlas {
public:
explicit SparseBlas(const DeviceContext& dev_ctx) : dev_ctx_(dev_ctx) {}
// TODO(zhouwei25): implement "COO @ DENSE -> DENSE" of DSDMM
template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCooTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;
template <typename T>
void DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCsrTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const;
template <typename T>
void SDDMM(bool transa,
bool transb,
T alpha,
const phi::DenseTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::SparseCsrTensor* mat_c) const;
private:
const DeviceContext& dev_ctx_;
};
template <typename DeviceContext, typename T>
class SparseBlasT : private SparseBlas<DeviceContext> {
public:
using SparseBlas<DeviceContext>::SparseBlas;
template <typename... ARGS>
void DSDMM(ARGS... args) const {
Base()->template DSDMM<T>(args...);
}
template <typename... ARGS>
void SDDMM(ARGS... args) const {
Base()->template SDDMM<T>(args...);
}
private:
const SparseBlas<DeviceContext>* Base() const {
return static_cast<const SparseBlas<DeviceContext>*>(this);
}
};
template <typename DeviceContext, typename T>
inline SparseBlasT<DeviceContext, T> GetSparseBlas(
const DeviceContext& dev_ctx) {
return SparseBlasT<DeviceContext, T>(dev_ctx);
}
} // namespace sparse
} // namespace funcs
} // namespace phi
#if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 11000
#include "paddle/phi/kernels/funcs/sparse/sparse_blas_impl.cu.h"
#endif
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/memory/malloc.h"
#include "paddle/phi/backends/dynload/cusparse.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/core/visit_type.h"
namespace phi {
namespace funcs {
namespace sparse {
template <typename T>
cudaDataType_t GetGpuDataType() {
if (std::is_same<T, float>::value) {
return CUDA_R_32F;
} else if (std::is_same<T, double>::value) {
return CUDA_R_64F;
} else if (std::is_same<T, phi::dtype::float16>::value) {
return CUDA_R_16F;
  } else {
    // Guard unsupported element types instead of falling off the end of a
    // non-void function (assumed fix; the original returned nothing here).
    PADDLE_THROW(phi::errors::InvalidArgument(
        "Cannot get cudaDataType_t from an unsupported data type."));
  }
}
inline cusparseOperation_t GetTransposeOperation(const bool trans) {
if (trans) {
return CUSPARSE_OPERATION_TRANSPOSE;
} else {
return CUSPARSE_OPERATION_NON_TRANSPOSE;
}
}
template <typename T>
class CuSparseSpMatDescriptor {
public:
explicit CuSparseSpMatDescriptor(const phi::SparseCsrTensor& x,
const phi::GPUContext& dev_ctx)
: dev_ctx_(dev_ctx) {
PD_VISIT_INTEGRAL_TYPES(
x.non_zero_crows().dtype(), "CuSparseSpMatDescriptor", ([&] {
const data_t* crows_data = x.non_zero_crows().data<data_t>();
const data_t* cols_data = x.non_zero_cols().data<data_t>();
const T* values_data = x.non_zero_elements().data<T>();
int64_t nnz = x.nnz();
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
auto x_ndims = xdim_vec.size();
int64_t M = xdim_vec[x_ndims - 2];
int64_t N = xdim_vec[x_ndims - 1];
int batch_size = 1;
for (int i = 0; i < x_ndims - 2; i++) {
batch_size *= xdim_vec[i];
}
cudaDataType_t gpu_type = GetGpuDataType<T>();
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseCreateCsr(&descriptor_,
M,
N,
nnz,
const_cast<data_t*>(crows_data),
const_cast<data_t*>(cols_data),
const_cast<T*>(values_data),
CUSPARSE_INDEX_64I,
CUSPARSE_INDEX_64I,
CUSPARSE_INDEX_BASE_ZERO,
gpu_type);
});
PADDLE_ENFORCE_EQ(x.non_zero_crows().numel(), batch_size * (M + 1));
PADDLE_ENFORCE_EQ(x.non_zero_cols().numel(), x.nnz());
if (batch_size > 1) {
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseCsrSetStridedBatch(
descriptor_, batch_size, M + 1, nnz);
});
}
}));
VLOG(6) << "Create cusparseSpMatDescr_t " << &descriptor_;
}
~CuSparseSpMatDescriptor() {
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseDestroySpMat(descriptor_);
});
VLOG(6) << "Destroy cusparseSpMatDescr_t " << &descriptor_;
}
const cusparseSpMatDescr_t& descriptor() const { return descriptor_; }
private:
const phi::GPUContext& dev_ctx_;
cusparseSpMatDescr_t descriptor_;
};
template <typename T>
class CuSparseDnMatDescriptor {
public:
explicit CuSparseDnMatDescriptor(const phi::DenseTensor& x,
const phi::GPUContext& dev_ctx)
: dev_ctx_(dev_ctx) {
const T* x_data = x.data<T>();
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
auto x_ndims = xdim_vec.size();
int64_t M = xdim_vec[x_ndims - 2];
int64_t N = xdim_vec[x_ndims - 1];
int batch_size = 1;
for (int i = 0; i < x_ndims - 2; i++) {
batch_size *= xdim_vec[i];
}
cudaDataType_t gpu_type = GetGpuDataType<T>();
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseCreateDnMat(&descriptor_,
M,
N,
N,
const_cast<T*>(x_data),
gpu_type,
CUSPARSE_ORDER_ROW);
});
PADDLE_ENFORCE_EQ(x.numel(), batch_size * M * N);
if (batch_size > 1) {
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseDnMatSetStridedBatch(
descriptor_, batch_size, M * N);
});
}
VLOG(6) << "Create cusparseDnMatDescr_t " << &descriptor_;
}
~CuSparseDnMatDescriptor() {
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseDestroyDnMat(descriptor_);
});
VLOG(6) << "Destroy cusparseDnMatDescr_t " << &descriptor_;
}
const cusparseDnMatDescr_t& descriptor() const { return descriptor_; }
private:
const phi::GPUContext& dev_ctx_;
cusparseDnMatDescr_t descriptor_;
};
template <>
template <typename T>
void SparseBlas<phi::GPUContext>::DSDMM(bool transa,
bool transb,
T alpha,
const phi::SparseCsrTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::DenseTensor* mat_c) const {
cudaDataType_t gpu_type = GetGpuDataType<T>();
auto a_descriptor = CuSparseSpMatDescriptor<T>(mat_a, dev_ctx_);
auto b_descriptor = CuSparseDnMatDescriptor<T>(mat_b, dev_ctx_);
auto c_descriptor = CuSparseDnMatDescriptor<T>(*mat_c, dev_ctx_);
size_t buffer_size = 0;
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpMM_bufferSize(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
c_descriptor.descriptor(),
gpu_type,
CUSPARSE_SPMM_ALG_DEFAULT,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer =
paddle::memory::Alloc(dev_ctx_, buffer_size);
void* tmp_buffer_ptr = tmp_buffer->ptr();
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSpMM(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
c_descriptor.descriptor(),
gpu_type,
CUSPARSE_SPMM_ALG_DEFAULT,
tmp_buffer_ptr);
});
}
#if CUDA_VERSION >= 11030
template <>
template <typename T>
void SparseBlas<phi::GPUContext>::SDDMM(bool transa,
bool transb,
T alpha,
const phi::DenseTensor& mat_a,
const phi::DenseTensor& mat_b,
T beta,
phi::SparseCsrTensor* mat_c) const {
cudaDataType_t gpu_type = GetGpuDataType<T>();
auto a_descriptor = CuSparseDnMatDescriptor<T>(mat_a, dev_ctx_);
auto b_descriptor = CuSparseDnMatDescriptor<T>(mat_b, dev_ctx_);
auto c_descriptor = CuSparseSpMatDescriptor<T>(*mat_c, dev_ctx_);
size_t buffer_size = 0;
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSDDMM_bufferSize(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
c_descriptor.descriptor(),
gpu_type,
CUSPARSE_SDDMM_ALG_DEFAULT,
&buffer_size);
});
paddle::memory::allocation::AllocationPtr tmp_buffer =
paddle::memory::Alloc(dev_ctx_, buffer_size);
void* tmp_buffer_ptr = tmp_buffer->ptr();
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSDDMM_preprocess(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
c_descriptor.descriptor(),
gpu_type,
CUSPARSE_SDDMM_ALG_DEFAULT,
tmp_buffer_ptr);
});
dev_ctx_.CusparseCall([&](cusparseHandle_t handle) {
phi::dynload::cusparseSDDMM(handle,
GetTransposeOperation(transa),
GetTransposeOperation(transb),
&alpha,
a_descriptor.descriptor(),
b_descriptor.descriptor(),
&beta,
c_descriptor.descriptor(),
gpu_type,
CUSPARSE_SDDMM_ALG_DEFAULT,
tmp_buffer_ptr);
});
}
#endif
} // namespace sparse
} // namespace funcs
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/full_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
namespace phi {
template <typename T, typename Context>
void FullValue(const Context& dev_ctx, DenseTensor* tensor, T val) {
dev_ctx.template Alloc<T>(tensor);
auto t = phi::EigenVector<T>::Flatten(*tensor);
t.device(*dev_ctx.eigen_device()) = t.constant(val);
}
template <typename T, typename Context>
void CooFullLikeKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const Scalar& val,
DataType dtype,
SparseCooTensor* out) {
phi::Copy<Context>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_indices());
DenseTensor* values = out->mutable_non_zero_elements();
values->Resize(x.non_zero_elements().dims());
dev_ctx.template Alloc<T>(values);
FullValue<T, Context>(dev_ctx, values, val.to<T>());
out->set_dims(x.dims());
}
template <typename T, typename Context>
void CsrFullLikeKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const Scalar& val,
DataType dtype,
SparseCsrTensor* out) {
phi::Copy<Context>(dev_ctx,
x.non_zero_crows(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_crows());
phi::Copy<Context>(dev_ctx,
x.non_zero_cols(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_cols());
DenseTensor* values = out->mutable_non_zero_elements();
values->Resize(x.non_zero_elements().dims());
dev_ctx.template Alloc<T>(values);
FullValue<T, Context>(dev_ctx, values, val.to<T>());
out->set_dims(x.dims());
}
} // namespace phi
PD_REGISTER_KERNEL(coo_full_like,
CPU,
ALL_LAYOUT,
phi::CooFullLikeKernel,
float,
double,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::bfloat16,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}
PD_REGISTER_KERNEL(csr_full_like,
CPU,
ALL_LAYOUT,
phi::CsrFullLikeKernel,
float,
double,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::bfloat16,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/matmul_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace sparse {
// TODO(zhouwei25): implement CPU backward kernel of " CSR @ DENSE -> DENSE"
template <typename T, typename Context>
void CsrDenseMatmulGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
SparseCsrTensor* dx,
DenseTensor* dy) {
PADDLE_THROW(phi::errors::Unimplemented(
"The CPU backward kernel of sparse matmul is not supported yet."));
}
// TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
template <typename T, typename Context>
void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& dout,
DenseTensor* dx,
DenseTensor* dy) {
PADDLE_THROW(phi::errors::Unimplemented(
"The CPU backward kernel of masked sparse matmul is not supported yet."));
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(csr_dense_matmul_grad,
CPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulGradKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}
PD_REGISTER_KERNEL(csr_masked_matmul_grad,
CPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulGradKernel,
float,
double) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/matmul_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
namespace sparse {
// TODO(zhouwei25): implement CPU kernel of " CSR @ DENSE -> DENSE"
template <typename T, typename Context>
void CsrDenseMatmulKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
PADDLE_THROW(phi::errors::Unimplemented(
"The CPU kernel of sparse matmul is not supported yet."));
}
// TODO(zhouwei25): implement CPU kernel of " DENSE @ DENSE * CSR_MASK -> CSR"
template <typename T, typename Context>
void CsrMaskedMatmulKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& mask,
SparseCsrTensor* out) {
PADDLE_THROW(phi::errors::Unimplemented(
"The CPU kernel of masked sparse matmul is not supported yet."));
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(csr_dense_matmul,
CPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}
PD_REGISTER_KERNEL(csr_masked_matmul,
CPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulKernel,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/data_type.h"
#include "paddle/phi/common/scalar.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
template <typename T, typename Context>
void CooFullLikeKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const Scalar& val,
DataType dtype,
SparseCooTensor* out);
template <typename T, typename Context>
void CsrFullLikeKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const Scalar& val,
DataType dtype,
SparseCsrTensor* out);
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/sparse/full_kernel.h"
namespace phi {
template <typename InT, typename OutT = InT>
struct FullFuctor {
OutT value;
template <typename VType>
explicit inline FullFuctor(VType val) {
value = static_cast<OutT>(val);
}
__device__ __forceinline__ OutT operator()() const {
return static_cast<OutT>(value);
}
};
template <typename T, typename Context>
void CooFullLikeKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const Scalar& val,
DataType dtype,
SparseCooTensor* out) {
phi::Copy<Context>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_indices());
DenseTensor* values = out->mutable_non_zero_elements();
values->Resize(x.non_zero_elements().dims());
dev_ctx.template Alloc<T>(values);
std::vector<const DenseTensor*> inputs = {};
std::vector<DenseTensor*> outputs = {values};
int numel = values->numel();
if (numel > 0) {
phi::funcs::ElementwiseKernel<T>(
dev_ctx, inputs, &outputs, FullFuctor<T>(val.to<T>()));
}
out->set_dims(x.dims());
}
template <typename T, typename Context>
void CsrFullLikeKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const Scalar& val,
DataType dtype,
SparseCsrTensor* out) {
phi::Copy<Context>(dev_ctx,
x.non_zero_crows(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_crows());
phi::Copy<Context>(dev_ctx,
x.non_zero_cols(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_cols());
DenseTensor* values = out->mutable_non_zero_elements();
values->Resize(x.non_zero_elements().dims());
dev_ctx.template Alloc<T>(values);
std::vector<const DenseTensor*> inputs = {};
std::vector<DenseTensor*> outputs = {values};
int numel = values->numel();
if (numel > 0) {
phi::funcs::ElementwiseKernel<T>(
dev_ctx, inputs, &outputs, FullFuctor<T>(val.to<T>()));
}
out->set_dims(x.dims());
}
} // namespace phi
PD_REGISTER_KERNEL(coo_full_like,
GPU,
ALL_LAYOUT,
phi::CooFullLikeKernel,
float,
double,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::bfloat16,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_COO);
}
PD_REGISTER_KERNEL(csr_full_like,
GPU,
ALL_LAYOUT,
phi::CsrFullLikeKernel,
float,
double,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::bfloat16,
phi::dtype::float16,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/matmul_grad_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void CsrDenseMatmulGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
SparseCsrTensor* dx,
DenseTensor* dy) {
#if CUDA_VERSION >= 11030
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
// dx{SparseCsr} = dout{Dense} * y'{Dense}
if (dx) {
// InferMeta of SparseCsrTensor 'dx'
dx->set_dims(x.dims());
phi::Copy(dev_ctx,
x.non_zero_crows(),
dev_ctx.GetPlace(),
false,
dx->mutable_non_zero_crows());
phi::Copy(dev_ctx,
x.non_zero_cols(),
dev_ctx.GetPlace(),
false,
dx->mutable_non_zero_cols());
DenseTensor* values = dx->mutable_non_zero_elements();
values->Resize(x.non_zero_elements().dims());
dev_ctx.template Alloc<T>(values);
sparse_blas.SDDMM(
false, true, static_cast<T>(1), dout, y, static_cast<T>(0), dx);
}
// dy{Dense} = x'{SparseCsr} * dout{Dense}
if (dy) {
// InferMeta of DenseTensor 'dy'
MetaTensor meta_dy(dy);
meta_dy.set_dims(y.dims());
meta_dy.set_dtype(y.dtype());
dev_ctx.template Alloc<T>(dy);
sparse_blas.DSDMM(
true, false, static_cast<T>(1), x, dout, static_cast<T>(0), dy);
}
#else
PADDLE_THROW(phi::errors::Unimplemented(
"The backward of 'sparse.mm' uses cusparseSDDMM, which is only "
"supported from CUDA 11.3."));
#endif
}
template <typename T, typename Context>
void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& dout,
DenseTensor* dx,
DenseTensor* dy) {
#if CUDA_VERSION >= 11000
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
// dx{Dense} = dout{SparseCsr} * y'{Dense}
if (dx) {
// InferMeta of DenseTensor 'dx'
MetaTensor meta_dx(dx);
meta_dx.set_dims(x.dims());
meta_dx.set_dtype(x.dtype());
dev_ctx.template Alloc<T>(dx);
sparse_blas.DSDMM(
false, true, static_cast<T>(1), dout, y, static_cast<T>(0), dx);
}
// dy{Dense} = x'{Dense} * dout{SparseCsr}
// That is: dy'{Dense} = dout'{SparseCsr} * x{Dense}
if (dy) {
std::vector<int> trans_dim_vec = phi::vectorize<int>(y.dims());
size_t rank = trans_dim_vec.size();
std::swap(trans_dim_vec[rank - 1], trans_dim_vec[rank - 2]);
DenseTensor trans_dy = phi::Empty<T, Context>(dev_ctx, trans_dim_vec);
sparse_blas.DSDMM(
true, false, static_cast<T>(1), dout, x, static_cast<T>(0), &trans_dy);
// InferMeta of DenseTensor 'dy'
MetaTensor meta_dy(dy);
meta_dy.set_dims(y.dims());
meta_dy.set_dtype(y.dtype());
dev_ctx.template Alloc<T>(dy);
size_t y_ndim = y.dims().size();
std::vector<int> axis(y_ndim);
for (size_t i = 0; i < y_ndim; ++i) {
axis[i] = i;
}
std::swap(axis[y_ndim - 1], axis[y_ndim - 2]);
TransposeKernel<T, Context>(dev_ctx, trans_dy, axis, dy);
}
#endif
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(csr_dense_matmul_grad,
GPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulGradKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}
PD_REGISTER_KERNEL(csr_masked_matmul_grad,
GPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulGradKernel,
float,
double) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/ddim.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/sparse/sparse_blas.h"
#include "paddle/phi/kernels/sparse/matmul_kernel.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void CsrDenseMatmulKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out) {
#if CUDA_VERSION >= 11000
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
std::vector<int64_t> ydim_vec = phi::vectorize(y.dims());
auto x_ndims = xdim_vec.size();
auto y_ndims = ydim_vec.size();
PADDLE_ENFORCE_EQ(
x_ndims,
y_ndims,
phi::errors::PreconditionNotMet("The dims size of Input(x) and Input(y) "
"should be equal, but received X's "
"dimensions=%d, Y's dimensions=%d.",
x_ndims,
y_ndims));
PADDLE_ENFORCE_GE(
x_ndims,
2,
phi::errors::InvalidArgument("The dims size of Input(x) and "
"Input(y) must be greater than "
"or equal to 2."));
for (size_t i = 0; i < x_ndims - 2; ++i) {
PADDLE_ENFORCE_EQ(xdim_vec[i],
ydim_vec[i],
phi::errors::InvalidArgument(
"x.dim[%d] and y.dim[%d] must match.", i, i));
}
PADDLE_ENFORCE_GE(
xdim_vec[x_ndims - 1],
ydim_vec[y_ndims - 2],
phi::errors::PreconditionNotMet(
"The shape of Input(x) and Input(y) is not suitable for matmul "
"operation, x_dim[-1] must be equal to y_dim[-2]."));
// InferMeta of DenseTensor 'out'
std::vector<int64_t> out_dim_vec(ydim_vec);
out_dim_vec[y_ndims - 2] = xdim_vec[x_ndims - 2];
out_dim_vec[y_ndims - 1] = ydim_vec[y_ndims - 1];
MetaTensor meta_out(out);
meta_out.set_dims(phi::make_ddim(out_dim_vec));
meta_out.set_dtype(x.non_zero_elements().dtype());
dev_ctx.template Alloc<T>(out);
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
sparse_blas.DSDMM(
false, false, static_cast<T>(1), x, y, static_cast<T>(0), out);
#else
PADDLE_THROW(
phi::errors::Unimplemented("The forward of 'sparse.mm' uses cusparseSpMM, "
"which is supported from CUDA 11.0."));
#endif
}
template <typename T, typename Context>
void CsrMaskedMatmulKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& mask,
SparseCsrTensor* out) {
#if CUDA_VERSION >= 11030
std::vector<int64_t> xdim_vec = phi::vectorize(x.dims());
std::vector<int64_t> ydim_vec = phi::vectorize(y.dims());
std::vector<int64_t> maskdim_vec = phi::vectorize(mask.dims());
auto x_ndims = xdim_vec.size();
auto y_ndims = ydim_vec.size();
auto mask_ndims = maskdim_vec.size();
PADDLE_ENFORCE_EQ(
x_ndims,
y_ndims,
phi::errors::PreconditionNotMet("The dims size of Input(x) and Input(y) "
"should be equal, but received X's "
"dimensions=%d, Y's dimensions=%d.",
x_ndims,
y_ndims));
PADDLE_ENFORCE_EQ(x_ndims,
mask_ndims,
phi::errors::PreconditionNotMet(
"The dims size of Input(x) and Input(mask) "
"should be equal, but received X's "
"dimensions=%d, mask's dimensions=%d.",
x_ndims,
mask_ndims));
PADDLE_ENFORCE_GE(
x_ndims,
2,
phi::errors::InvalidArgument("The dims size of Input(x) and "
"Input(y) must be greater than "
"or equal to 2."));
for (size_t i = 0; i < x_ndims - 2; ++i) {
PADDLE_ENFORCE_EQ(xdim_vec[i],
ydim_vec[i],
phi::errors::InvalidArgument(
"x.dim[%d] and y.dim[%d] must match.", i, i));
PADDLE_ENFORCE_EQ(xdim_vec[i],
maskdim_vec[i],
phi::errors::InvalidArgument(
"x.dim[%d] and mask.dim[%d] must match.", i, i));
}
PADDLE_ENFORCE_GE(
xdim_vec[x_ndims - 1],
ydim_vec[y_ndims - 2],
phi::errors::PreconditionNotMet(
"The shape of Input(x) and Input(y) is not suitable for matmul "
"operation, x_dim[-1] must be equal to y_dim[-2]."));
PADDLE_ENFORCE_EQ(
maskdim_vec[mask_ndims - 2],
xdim_vec[x_ndims - 2],
phi::errors::PreconditionNotMet(
"The shape of Input(mask) and Input(x) is not suitable for matmul "
"operation, mask_dim[-2] must be equal to x_dim[-2]."));
PADDLE_ENFORCE_EQ(
maskdim_vec[mask_ndims - 1],
ydim_vec[y_ndims - 1],
phi::errors::PreconditionNotMet(
"The shape of Input(mask) and Input(y) is not suitable for matmul "
"operation, mask_dim[-1] must be equal to y_dim[-1]."));
// InferMeta of SparseCsrTensor 'out'
out->set_dims(mask.dims());
phi::Copy(dev_ctx,
mask.non_zero_crows(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_crows());
phi::Copy(dev_ctx,
mask.non_zero_cols(),
dev_ctx.GetPlace(),
false,
out->mutable_non_zero_cols());
DenseTensor* values = out->mutable_non_zero_elements();
values->Resize(mask.non_zero_elements().dims());
dev_ctx.template Alloc<T>(values);
auto sparse_blas = phi::funcs::sparse::GetSparseBlas<Context, T>(dev_ctx);
sparse_blas.SDDMM(
false, false, static_cast<T>(1), x, y, static_cast<T>(0), out);
#else
PADDLE_THROW(
phi::errors::Unimplemented("The forward of 'sparse.masked_mm' uses "
"cusparseSDDMM, which is supported from "
"CUDA 11.3."));
#endif
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(csr_dense_matmul,
GPU,
ALL_LAYOUT,
phi::sparse::CsrDenseMatmulKernel,
float,
double) {
kernel->InputAt(0).SetDataLayout(phi::DataLayout::SPARSE_CSR);
}
PD_REGISTER_KERNEL(csr_masked_matmul,
GPU,
ALL_LAYOUT,
phi::sparse::CsrMaskedMatmulKernel,
float,
double) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
namespace sparse {
// TODO(zhouwei25): implement Backward of " COO @ COO -> COO"
template <typename T, typename Context>
void CooCooMatmulGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const SparseCooTensor& y,
const SparseCooTensor& dout,
SparseCooTensor* dx,
SparseCooTensor* dy);
// TODO(zhouwei25): implement Backward of " COO @ DENSE -> DENSE"
template <typename T, typename Context>
void CooDenseMatmulGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
SparseCooTensor* dx,
DenseTensor* dy);
// TODO(zhouwei25): implement Backward of " CSR @ CSR -> CSR"
template <typename T, typename Context>
void CsrCsrMatmulGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const SparseCsrTensor& y,
const SparseCsrTensor& dout,
SparseCsrTensor* dx,
SparseCsrTensor* dy);
/* Backward of "CSR @ DENSE -> DENSE" */
template <typename T, typename Context>
void CsrDenseMatmulGradKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
const DenseTensor& dout,
SparseCsrTensor* dx,
DenseTensor* dy);
/* Backward of "DENSE @ DENSE * CSR_MASK -> CSR" */
template <typename T, typename Context>
void CsrMaskedMatmulGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& dout,
DenseTensor* dx,
DenseTensor* dy);
} // namespace sparse
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/sparse_coo_tensor.h"
#include "paddle/phi/core/sparse_csr_tensor.h"
namespace phi {
namespace sparse {
// TODO(zhouwei25): implement " COO @ COO -> COO"
template <typename T, typename Context>
void CooCooMatmulKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const SparseCooTensor& y,
SparseCooTensor* out);
// TODO(zhouwei25): implement " COO @ DENSE -> DENSE"
template <typename T, typename Context>
void CooDenseMatmulKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& y,
DenseTensor* out);
// TODO(zhouwei25): implement " CSR @ CSR -> CSR"
template <typename T, typename Context>
void CsrCsrMatmulKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const SparseCsrTensor& y,
SparseCsrTensor* out);
/* CSR @ DENSE -> DENSE */
template <typename T, typename Context>
void CsrDenseMatmulKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const DenseTensor& y,
DenseTensor* out);
/* DENSE @ DENSE * CSR_MASK -> CSR */
template <typename T, typename Context>
void CsrMaskedMatmulKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
const SparseCsrTensor& mask,
SparseCsrTensor* out);
} // namespace sparse
} // namespace phi
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid.framework import _test_eager_guard
import numpy as np
import scipy
import scipy.sparse as sp
import unittest
import os
import re
np.random.seed(2022)
def get_cuda_version():
result = os.popen("nvcc --version").read()
regex = r'release (\S+),'
match = re.search(regex, result)
if match:
num = str(match.group(1))
integer, decimal = num.split('.')
return int(integer) * 1000 + int(float(decimal) * 10)
else:
return -1
@unittest.skipIf(
not paddle.is_compiled_with_cuda() or get_cuda_version() < 11000,
"paddle is not compiled with CUDA or CUDA version is lower than 11.0")
class TestCsrDenseMatmul2D(unittest.TestCase):
# x: csr, y: dense, out: dense
def test_matmul(self):
with _test_eager_guard():
mask = np.random.rand(10, 12) < 0.2
np_x = np.random.rand(10, 12) * mask
np_csr = sp.csr_matrix(np_x)
np_dense = np.random.rand(12, 6)
np_out = np_csr @ np_dense
np_out_grad = np.ones([10, 6])
# dx(csr) = dout(dense) * y'(dense) * mask
np_csr_grad = sp.csr_matrix(
np.matmul(np_out_grad, np_dense.transpose(1, 0)) * mask)
# dy(dense) = x'(csr) * dout(dense)
np_dense_grad = np_csr.transpose() @ np_out_grad
csr = paddle.to_tensor(np_x, stop_gradient=False).to_sparse_csr()
dense = paddle.to_tensor(np_dense, stop_gradient=False)
out = paddle.incubate.sparse.matmul(csr, dense)
self.assertTrue(np.allclose(np_out, out.numpy()))
if get_cuda_version() >= 11030:
out.backward()
self.assertTrue(
np.allclose(np_csr_grad.indptr,
csr.grad.crows().numpy()))
self.assertTrue(
np.allclose(np_csr_grad.indices,
csr.grad.cols().numpy()))
self.assertTrue(
np.allclose(np_csr_grad.data,
csr.grad.values().numpy()))
self.assertTrue(np.allclose(np_dense_grad, dense.grad.numpy()))
@unittest.skipIf(
not paddle.is_compiled_with_cuda() or get_cuda_version() < 11030,
"paddle is not compiled with CUDA or CUDA version is lower than 11.3")
class TestCsrMaskedMatmul2D(unittest.TestCase):
# x: dense, y: dense, out: csr
def test_matmul(self):
with _test_eager_guard():
np_mask = np.random.rand(10, 6) < 0.2
np_x = np.random.rand(10, 12)
np_y = np.random.rand(12, 6)
np_out = sp.csr_matrix(np.matmul(np_x, np_y) * np_mask)
np_out_grad = sp.csr_matrix(np.ones([10, 6]) * np_mask)
# dx(dense) = dout(csr) * y'(dense)
np_x_grad = np_out_grad @ np_y.transpose(1, 0)
# dy(dense) = x'(dense) * dout(csr) -> dy'(dense) = dout'(csr) * x(dense)
np_y_grad = (np_out_grad.transpose() @ np_x).transpose(1, 0)
x = paddle.to_tensor(np_x, stop_gradient=False)
y = paddle.to_tensor(np_y, stop_gradient=False)
mask = paddle.to_tensor(np.ones([10, 6]) * np_mask).to_sparse_csr()
out = paddle.incubate.sparse.masked_matmul(x, y, mask)
self.assertTrue(np.allclose(np_out.indptr, out.crows().numpy()))
self.assertTrue(np.allclose(np_out.indices, out.cols().numpy()))
self.assertTrue(np.allclose(np_out.data, out.values().numpy()))
out.backward()
self.assertTrue(np.allclose(out.is_sparse_csr(), True))
self.assertTrue(np.allclose(np_x_grad, x.grad.numpy()))
self.assertTrue(np.allclose(np_y_grad, y.grad.numpy()))
#TODO(zhouwei25): support unit test of batch 'paddle.sparse.mm/masked_mm'
if __name__ == "__main__":
unittest.main()
......@@ -28,10 +28,10 @@ from .tensor import segment_mean
from .tensor import segment_max
from .tensor import segment_min
from .passes import fuse_resnet_unit_pass
import paddle.incubate.autograd
import paddle.incubate.autotune
import paddle.incubate.sparse
from . import autograd #noqa: F401
from . import autotune #noqa: F401
from . import sparse #noqa: F401
from . import nn #noqa: F401
from . import asp #noqa: F401
......
......@@ -19,6 +19,9 @@ from .unary import sqrt
from .unary import sin
from .unary import tanh
from .binary import matmul
from .binary import masked_matmul
from . import nn
__all__ = [
......@@ -27,4 +30,6 @@ __all__ = [
'sqrt',
'sin',
'tanh',
'matmul',
'masked_matmul',
]
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle.common_ops_import import dygraph_only
from paddle import _C_ops
__all__ = []
@dygraph_only
def matmul(x, y, name=None):
"""
Warning:
This API is only supported from ``CUDA 11.0``.
Applies matrix multiplication of two Tensors.
The supported input/output Tensor layouts are as follows:
Note:
x[SparseCsrTensor] @ y[SparseCsrTensor] -> out[SparseCsrTensor]
x[SparseCsrTensor] @ y[DenseTensor] -> out[DenseTensor]
x[SparseCooTensor] @ y[SparseCooTensor] -> out[SparseCooTensor]
x[SparseCooTensor] @ y[DenseTensor] -> out[DenseTensor]
It supports backward propagation.
The inputs `x` and `y` must be at least 2-D. Automatic broadcasting of Tensor is not supported.
The shape of `x` should be `[*, M, K]` and the shape of `y` should be `[*, K, N]` , where `*`
is zero or more batch dimensions.
Args:
x (Tensor): The input tensor. It can be SparseCooTensor/SparseCsrTensor. The data type can be float32 or float64.
y (Tensor): The input tensor. It can be SparseCooTensor/SparseCsrTensor/DenseTensor. The data type can be float32 or float64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: Its layout is determined by that of `x` and `y` .
Examples:
.. code-block:: python
import paddle
from paddle.fluid.framework import _test_eager_guard
paddle.seed(100)
# csr @ dense -> dense
with _test_eager_guard():
crows = [0, 2, 3, 5]
cols = [1, 3, 2, 0, 1]
values = [1., 2., 3., 4., 5.]
dense_shape = [3, 4]
csr = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape)
# Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
# crows=[0, 2, 3, 5],
# cols=[1, 3, 2, 0, 1],
# values=[1., 2., 3., 4., 5.])
dense = paddle.randn([4, 3])
out = paddle.incubate.sparse.matmul(csr, dense)
# Tensor(shape=[3, 3], dtype=float32, place=Place(gpu:0), stop_gradient=True,
# [[-1.94294846 , -3.33990622 , 0.62359387 ],
# [-4.12815523 , 3.46535444 , -3.27413893 ],
# [-0.15209436 , -19.23207283, -3.35593438 ]])
"""
return _C_ops.final_state_sparse_matmul(x, y)
@dygraph_only
def masked_matmul(x, y, mask, name=None):
"""
Warning:
This API is only supported from ``CUDA 11.3``.
Applies matrix multiplication of two Dense Tensors.
The supported input/output Tensor layouts are as follows:
Note:
x[DenseTensor] @ y[DenseTensor] * mask[SparseCooTensor] -> out[SparseCooTensor]
x[DenseTensor] @ y[DenseTensor] * mask[SparseCsrTensor] -> out[SparseCsrTensor]
It supports backward propagation.
The inputs `x` and `y` must be at least 2-D. Automatic broadcasting of Tensor is not supported.
The shape of `x` should be `[*, M, K]` , the shape of `y` should be `[*, K, N]` , and the shape of `mask` should be `[*, M, N]` ,
where `*` is zero or more batch dimensions.
Args:
x (Tensor): The input tensor. It is DenseTensor. The data type can be float32 or float64.
y (Tensor): The input tensor. It is DenseTensor. The data type can be float32 or float64.
mask (Tensor): The mask tensor, which can be SparseCooTensor/SparseCsrTensor. It specifies the sparse coordinates. The data type can be float32 or float64.
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`.
Returns:
Tensor: SparseCooTensor or SparseCsrTensor, whose layout is determined by that of `mask` .
Examples:
.. code-block:: python
import paddle
from paddle.fluid.framework import _test_eager_guard
paddle.seed(100)
# dense @ dense * csr_mask -> csr
with _test_eager_guard():
crows = [0, 2, 3, 5]
cols = [1, 3, 2, 0, 1]
values = [1., 2., 3., 4., 5.]
dense_shape = [3, 4]
mask = paddle.incubate.sparse.sparse_csr_tensor(crows, cols, values, dense_shape)
# Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
# crows=[0, 2, 3, 5],
# cols=[1, 3, 2, 0, 1],
# values=[1., 2., 3., 4., 5.])
x = paddle.rand([3, 5])
y = paddle.rand([5, 4])
out = paddle.incubate.sparse.masked_matmul(x, y, mask)
# Tensor(shape=[3, 4], dtype=paddle.float32, place=Place(gpu:0), stop_gradient=True,
# crows=[0, 2, 3, 5],
# cols=[1, 3, 2, 0, 1],
# values=[0.98986477, 0.97800624, 1.14591956, 0.68561077, 0.94714981])
"""
return _C_ops.final_state_sparse_masked_matmul(x, y, mask)
......@@ -88,6 +88,34 @@
layout : x
backward : values_grad
- api: full_like
args : (Tensor x, Scalar value, DataType dtype=DataType::UNDEFINED)
output : Tensor(out)
kernel :
func : coo_full_like{sparse_coo -> sparse_coo},
csr_full_like{sparse_csr -> sparse_csr}
layout : x
data_type : dtype
- api: masked_matmul
args : (Tensor x, Tensor y, Tensor mask)
output : Tensor(out)
kernel :
func : csr_masked_matmul{dense, dense, sparse_csr -> sparse_csr}
layout : x
backward: masked_matmul_grad
- api: matmul
args : (Tensor x, Tensor y)
output : Tensor(out)
kernel :
func : csr_dense_matmul{sparse_csr, dense -> dense},
csr_csr_matmul{sparse_csr, sparse_csr -> sparse_csr},
coo_dense_matmul{sparse_coo, dense -> dense},
coo_coo_matmul{sparse_coo, sparse_coo -> sparse_coo}
layout : x
backward: matmul_grad
- api: maxpool
args : (Tensor x, int[] kernel_sizes, int[] paddings, int[] dilations, int[] strides)
output : Tensor(out), Tensor(rulebook)
......
......@@ -25,6 +25,20 @@
output : Tensor(x_grad)
invoke : to_dense_impl(out_grad)
- backward_api : masked_matmul_grad
forward : masked_matmul(Tensor x, Tensor y, Tensor mask) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad)
output : Tensor(x_grad), Tensor(y_grad)
kernel :
func : csr_masked_matmul_grad{dense, dense, sparse_csr -> dense, dense}
- backward_api : matmul_grad
forward : matmul(Tensor x, Tensor y) -> Tensor(out)
args : (Tensor x, Tensor y, Tensor out_grad)
output : Tensor(x_grad), Tensor(y_grad)
kernel :
func : csr_dense_matmul_grad{sparse_csr, dense, dense -> sparse_csr, dense}
- backward_api : relu_grad
forward : relu(Tensor x) -> Tensor(out)
args : (Tensor out, Tensor out_grad)
......