未验证 提交 5d3fd4fe 编写于 作者: Z zhangkaihuo 提交者: GitHub

Sparse conv and pool support indices as template (#41137)

上级 66d1b1f6
...@@ -45,6 +45,7 @@ PD_REGISTER_KERNEL(empty, ...@@ -45,6 +45,7 @@ PD_REGISTER_KERNEL(empty,
phi::EmptyKernel, phi::EmptyKernel,
float, float,
double, double,
int8_t,
uint8_t, uint8_t,
int16_t, int16_t,
int, int,
...@@ -61,6 +62,7 @@ PD_REGISTER_KERNEL(empty_like, ...@@ -61,6 +62,7 @@ PD_REGISTER_KERNEL(empty_like,
phi::EmptyLikeKernel, phi::EmptyLikeKernel,
float, float,
double, double,
int8_t,
uint8_t, uint8_t,
int16_t, int16_t,
int, int,
...@@ -80,6 +82,7 @@ PD_REGISTER_KERNEL(empty, ...@@ -80,6 +82,7 @@ PD_REGISTER_KERNEL(empty,
phi::EmptyKernel, phi::EmptyKernel,
float, float,
double, double,
int8_t,
uint8_t, uint8_t,
int16_t, int16_t,
int, int,
...@@ -95,6 +98,7 @@ PD_REGISTER_KERNEL(empty_like, ...@@ -95,6 +98,7 @@ PD_REGISTER_KERNEL(empty_like,
phi::EmptyLikeKernel, phi::EmptyLikeKernel,
float, float,
double, double,
int8_t,
uint8_t, uint8_t,
int16_t, int16_t,
int, int,
......
...@@ -33,28 +33,30 @@ struct Dims4D { ...@@ -33,28 +33,30 @@ struct Dims4D {
}; };
// Judge whether the current position x is in (lower, upper) // Judge whether the current position x is in (lower, upper)
inline HOSTDEVICE bool Check(const int& x, template <typename IntT = int>
inline HOSTDEVICE bool Check(const IntT& x,
const int& kx, const int& kx,
const int& pad, const int& pad,
const int& stride, const int& stride,
const int dilation, const int dilation,
const int kdim, const int kdim,
const int xdim) { const int xdim) {
const int lower = x - dilation * kx + pad; const IntT lower = x - dilation * kx + pad;
const int uper = x + (kdim - kx - 1) * dilation - pad; const IntT uper = x + (kdim - kx - 1) * dilation - pad;
return (lower >= 0 && lower % stride == 0 && uper < xdim); return (lower >= 0 && lower % stride == 0 && uper < xdim);
} }
// Check whether the current position(x, y, z) is legal: // Check whether the current position(x, y, z) is legal:
// Judge the minimum and maximum values at each latitude // Judge the minimum and maximum values at each latitude
template <typename IntT = int>
inline HOSTDEVICE bool Check(const Dims4D& dims, inline HOSTDEVICE bool Check(const Dims4D& dims,
const Dims4D& kernel_dims, const Dims4D& kernel_dims,
const Dims4D& paddings, const Dims4D& paddings,
const Dims4D& dilations, const Dims4D& dilations,
const Dims4D& strides, const Dims4D& strides,
const int x, const IntT x,
const int y, const IntT y,
const int z, const IntT z,
const int kx, const int kx,
const int ky, const int ky,
const int kz) { const int kz) {
...@@ -67,22 +69,22 @@ inline HOSTDEVICE bool Check(const Dims4D& dims, ...@@ -67,22 +69,22 @@ inline HOSTDEVICE bool Check(const Dims4D& dims,
return (x_valid && y_valid && z_valid); return (x_valid && y_valid && z_valid);
} }
template <typename Dim> template <typename Dim, typename IntT = int>
inline HOSTDEVICE int PointToIndex(const int& batch, inline HOSTDEVICE IntT PointToIndex(const IntT& batch,
const int& x, const IntT& x,
const int& y, const IntT& y,
const int& z, const IntT& z,
const Dim& dims) { const Dim& dims) {
return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] + return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] +
y * dims[3] + x; y * dims[3] + x;
} }
// TODO(zhangkaihuo): use division and multiply to optimize // TODO(zhangkaihuo): use division and multiply to optimize
// modulo operation // modulo operation
template <typename Dim> template <typename Dim, typename IntT = int>
inline HOSTDEVICE void IndexToPoint( inline HOSTDEVICE void IndexToPoint(
const int index, const Dim& dims, int* batch, int* x, int* y, int* z) { const IntT index, const Dim& dims, IntT* batch, IntT* x, IntT* y, IntT* z) {
int n = index; IntT n = index;
*x = n % dims[3]; *x = n % dims[3];
n /= dims[3]; n /= dims[3];
*y = n % dims[2]; *y = n % dims[2];
...@@ -176,8 +178,9 @@ inline const std::vector<int> PoolResetKernel( ...@@ -176,8 +178,9 @@ inline const std::vector<int> PoolResetKernel(
return res; return res;
} }
inline void PrefixSum(const int* counter, int* offsets, const int n) { template <typename T>
int offset = 0; inline void PrefixSum(const T* counter, T* offsets, const int n) {
T offset = 0;
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
offsets[i] = offset; offsets[i] = offset;
offset += counter[i]; offset += counter[i];
......
...@@ -49,8 +49,8 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dGrad( ...@@ -49,8 +49,8 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dGrad(
const int groups, const int groups,
const bool subm) { const bool subm) {
SparseCooTensor x_grad; SparseCooTensor x_grad;
DenseTensor kernel_grad = phi::Empty<Context>( DenseTensor kernel_grad;
dev_ctx, DenseTensorMeta(kernel.dtype(), {1}, kernel.layout()));
// TODO(zhangkaihuo): call InferMeta func here // TODO(zhangkaihuo): call InferMeta func here
Conv3dGradKernel<T, Context>(dev_ctx, Conv3dGradKernel<T, Context>(dev_ctx,
x, x,
......
...@@ -45,11 +45,7 @@ SparseCooTensor Conv3d(const Context& dev_ctx, ...@@ -45,11 +45,7 @@ SparseCooTensor Conv3d(const Context& dev_ctx,
const int groups, const int groups,
const bool subm, const bool subm,
DenseTensor* rulebook) { DenseTensor* rulebook) {
DenseTensor indices = phi::Empty<Context>( SparseCooTensor coo;
dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
DenseTensor values =
phi::Empty<Context>(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
SparseCooTensor coo(indices, values, x.dims());
Conv3dKernel<T, Context>(dev_ctx, Conv3dKernel<T, Context>(dev_ctx,
x, x,
kernel, kernel,
......
...@@ -31,7 +31,7 @@ using Dims4D = phi::funcs::sparse::Dims4D; ...@@ -31,7 +31,7 @@ using Dims4D = phi::funcs::sparse::Dims4D;
// such as: kernel(3, 3, 3), kernel_size = 27 // such as: kernel(3, 3, 3), kernel_size = 27
// counter_per_weight: (kernel_size) // counter_per_weight: (kernel_size)
// TODO(zhangkaihuo): optimize performance with multithreading // TODO(zhangkaihuo): optimize performance with multithreading
template <typename T, typename Context> template <typename T, typename Context, typename IntT = int>
void ProductRuleBook(const Context& dev_ctx, void ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
...@@ -44,7 +44,7 @@ void ProductRuleBook(const Context& dev_ctx, ...@@ -44,7 +44,7 @@ void ProductRuleBook(const Context& dev_ctx,
DenseTensor* counter_per_kernel) { DenseTensor* counter_per_kernel) {
const int64_t non_zero_num = x.nnz(); const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices(); const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>(); const IntT* indices_ptr = non_zero_indices.data<IntT>();
int* counter_ptr = counter_per_kernel->data<int>(); int* counter_ptr = counter_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
memset(counter_ptr, 0, kernel_size * sizeof(int)); memset(counter_ptr, 0, kernel_size * sizeof(int));
...@@ -60,33 +60,33 @@ void ProductRuleBook(const Context& dev_ctx, ...@@ -60,33 +60,33 @@ void ProductRuleBook(const Context& dev_ctx,
const Dims4D c_strides(1, strides[2], strides[1], strides[0]); const Dims4D c_strides(1, strides[2], strides[1], strides[0]);
const Dims4D c_dilations(1, dilations[2], dilations[1], dilations[0]); const Dims4D c_dilations(1, dilations[2], dilations[1], dilations[0]);
std::set<int> hash_in; std::set<IntT> hash_in;
if (subm) { if (subm) {
for (int i = 0; i < non_zero_num; i++) { for (int i = 0; i < non_zero_num; i++) {
int batch = indices_ptr[i]; IntT batch = indices_ptr[i];
int in_z = indices_ptr[i + non_zero_num]; IntT in_z = indices_ptr[i + non_zero_num];
int in_y = indices_ptr[i + 2 * non_zero_num]; IntT in_y = indices_ptr[i + 2 * non_zero_num];
int in_x = indices_ptr[i + 3 * non_zero_num]; IntT in_x = indices_ptr[i + 3 * non_zero_num];
int index = phi::funcs::sparse::PointToIndex<DDim>( IntT index = phi::funcs::sparse::PointToIndex<DDim>(
batch, in_x, in_y, in_z, x_dims); batch, in_x, in_y, in_z, x_dims);
hash_in.insert(index); hash_in.insert(index);
} }
} }
auto f_calc_rulebook = [&](int* rulebook_ptr) { auto f_calc_rulebook = [&](IntT* rulebook_ptr) {
int kernel_index = 0, rulebook_index = 0; int kernel_index = 0, rulebook_index = 0;
for (int kz = 0; kz < kernel_sizes[0]; kz++) { for (int kz = 0; kz < kernel_sizes[0]; kz++) {
for (int ky = 0; ky < kernel_sizes[1]; ky++) { for (int ky = 0; ky < kernel_sizes[1]; ky++) {
for (int kx = 0; kx < kernel_sizes[2]; kx++) { for (int kx = 0; kx < kernel_sizes[2]; kx++) {
++kernel_index; ++kernel_index;
for (int64_t i = 0; i < non_zero_num; i++) { for (int64_t i = 0; i < non_zero_num; i++) {
int batch = indices_ptr[i]; IntT batch = indices_ptr[i];
int in_z = indices_ptr[i + non_zero_num]; IntT in_z = indices_ptr[i + non_zero_num];
int in_y = indices_ptr[i + 2 * non_zero_num]; IntT in_y = indices_ptr[i + 2 * non_zero_num];
int in_x = indices_ptr[i + 3 * non_zero_num]; IntT in_x = indices_ptr[i + 3 * non_zero_num];
int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0]; IntT out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0];
int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1]; IntT out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1];
int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2]; IntT out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2];
if (phi::funcs::sparse::Check(c_x_dims, if (phi::funcs::sparse::Check(c_x_dims,
c_kernel_dims, c_kernel_dims,
c_paddings, c_paddings,
...@@ -99,7 +99,7 @@ void ProductRuleBook(const Context& dev_ctx, ...@@ -99,7 +99,7 @@ void ProductRuleBook(const Context& dev_ctx,
ky, ky,
kz)) { kz)) {
if (subm) { if (subm) {
int out_index = phi::funcs::sparse::PointToIndex<DDim>( IntT out_index = phi::funcs::sparse::PointToIndex<DDim>(
batch, out_x, out_y, out_z, out_dims); batch, out_x, out_y, out_z, out_dims);
if (hash_in.find(out_index) == hash_in.end()) { if (hash_in.find(out_index) == hash_in.end()) {
continue; continue;
...@@ -126,15 +126,16 @@ void ProductRuleBook(const Context& dev_ctx, ...@@ -126,15 +126,16 @@ void ProductRuleBook(const Context& dev_ctx,
f_calc_rulebook(nullptr); f_calc_rulebook(nullptr);
// alloc the rulebook // alloc the rulebook
DenseTensorMeta rulebook_meta( *rulebook = phi::Empty(
DataType::INT32, {3, rulebook_len}, DataLayout::NCHW); dev_ctx,
rulebook->set_meta(rulebook_meta); DenseTensorMeta(paddle::experimental::CppTypeToDataType<IntT>::Type(),
dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); {3, rulebook_len},
int* rulebook_ptr = rulebook->data<int>(); DataLayout::NCHW));
IntT* rulebook_ptr = rulebook->data<IntT>();
f_calc_rulebook(rulebook_ptr); f_calc_rulebook(rulebook_ptr);
} }
template <typename T, typename Context> template <typename T, typename Context, typename IntT = int>
void UpdateRulebookAndOutIndex(const Context& dev_ctx, void UpdateRulebookAndOutIndex(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const int kernel_size, const int kernel_size,
...@@ -142,9 +143,9 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, ...@@ -142,9 +143,9 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
const DDim& out_dims, const DDim& out_dims,
DenseTensor* rulebook, DenseTensor* rulebook,
SparseCooTensor* out) { SparseCooTensor* out) {
std::set<int> out_indexs; std::set<IntT> out_indexs;
int n = rulebook->dims()[1]; int n = rulebook->dims()[1];
int* rulebook_ptr = rulebook->data<int>(); IntT* rulebook_ptr = rulebook->data<IntT>();
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
out_indexs.insert(rulebook_ptr[i + n * 2]); out_indexs.insert(rulebook_ptr[i + n * 2]);
} }
...@@ -152,17 +153,19 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, ...@@ -152,17 +153,19 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
int out_non_zero_num = out_indexs.size(); int out_non_zero_num = out_indexs.size();
const int64_t sparse_dim = 4; const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta( DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); paddle::experimental::CppTypeToDataType<IntT>::Type(),
{sparse_dim, out_non_zero_num},
DataLayout::NCHW);
DenseTensorMeta values_meta(x.dtype(), DenseTensorMeta values_meta(x.dtype(),
{out_non_zero_num, out_channels}, {out_non_zero_num, out_channels},
x.non_zero_elements().layout()); x.non_zero_elements().layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
int* out_indices_ptr = out_indices.data<int>(); IntT* out_indices_ptr = out_indices.data<IntT>();
int i = 0; int i = 0;
for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) { for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) {
const int index = *it; const IntT index = *it;
int batch, x, y, z; IntT batch, x, y, z;
phi::funcs::sparse::IndexToPoint<DDim>(index, out_dims, &batch, &x, &y, &z); phi::funcs::sparse::IndexToPoint<DDim>(index, out_dims, &batch, &x, &y, &z);
out_indices_ptr[i] = batch; out_indices_ptr[i] = batch;
out_indices_ptr[i + out_non_zero_num] = z; out_indices_ptr[i + out_non_zero_num] = z;
...@@ -170,7 +173,7 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, ...@@ -170,7 +173,7 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
out_indices_ptr[i + out_non_zero_num * 3] = x; out_indices_ptr[i + out_non_zero_num * 3] = x;
} }
for (i = 0; i < n; i++) { for (i = 0; i < n; i++) {
int out_index = rulebook_ptr[i + n * 2]; IntT out_index = rulebook_ptr[i + n * 2];
rulebook_ptr[i + n * 2] = rulebook_ptr[i + n * 2] =
std::distance(out_indexs.begin(), out_indexs.find(out_index)); std::distance(out_indexs.begin(), out_indexs.find(out_index));
} }
...@@ -178,20 +181,20 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx, ...@@ -178,20 +181,20 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
out->SetMember(out_indices, out_values, out_dims, true); out->SetMember(out_indices, out_values, out_dims, true);
} }
template <typename T> template <typename T, typename IntT = int>
void Gather( void Gather(
const T* x, const int* indexs, const int n, const int channels, T* out) { const T* x, const IntT* indexs, const int n, const int channels, T* out) {
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
int real_i = indexs[i]; IntT real_i = indexs[i];
memcpy(out + i * channels, x + real_i * channels, channels * sizeof(T)); memcpy(out + i * channels, x + real_i * channels, channels * sizeof(T));
} }
} }
template <typename T> template <typename T, typename IntT = int>
void Scatter( void Scatter(
const T* x, const int* indexs, const int n, const int channels, T* out) { const T* x, const IntT* indexs, const int n, const int channels, T* out) {
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
int real_i = indexs[i]; IntT real_i = indexs[i];
for (int j = 0; j < channels; j++) { for (int j = 0; j < channels; j++) {
out[real_i * channels + j] += x[i * channels + j]; out[real_i * channels + j] += x[i * channels + j];
} }
......
...@@ -18,6 +18,8 @@ limitations under the License. */ ...@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -29,24 +31,24 @@ namespace sparse { ...@@ -29,24 +31,24 @@ namespace sparse {
//] //]
// x_grad = out_grad * transpose(kenrel) // x_grad = out_grad * transpose(kenrel)
// kernel_grad = transpose(x) * out_grad // kernel_grad = transpose(x) * out_grad
template <typename T, typename Context> template <typename T, typename IntT = int>
void Conv3dGradKernel(const Context& dev_ctx, void Conv3dGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& kernel, const DenseTensor& kernel,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
const int groups, const int groups,
const bool subm, const bool subm,
SparseCooTensor* x_grad, SparseCooTensor* x_grad,
DenseTensor* kernel_grad) { DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims(); const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3]; const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4]; const int out_channels = kernel_dims[4];
const int* rulebook_ptr = rulebook.data<int>(); const IntT* rulebook_ptr = rulebook.data<IntT>();
const int rulebook_len = rulebook.dims()[1]; const int rulebook_len = rulebook.dims()[1];
...@@ -66,32 +68,30 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -66,32 +68,30 @@ void Conv3dGradKernel(const Context& dev_ctx,
T* in_features_ptr = in_features.data<T>(); T* in_features_ptr = in_features.data<T>();
T* d_x_features_ptr = d_x_features.data<T>(); T* d_x_features_ptr = d_x_features.data<T>();
T* out_grad_features_ptr = out_grad_features.data<T>(); T* out_grad_features_ptr = out_grad_features.data<T>();
kernel_grad->Resize(kernel_dims); *kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel);
dev_ctx.Alloc(
kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T));
T* d_kernel_ptr = kernel_grad->data<T>(); T* d_kernel_ptr = kernel_grad->data<T>();
memset(d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel()); memset(d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel());
int half_kernel_size = kernel_size / 2; int half_kernel_size = kernel_size / 2;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx); auto blas = phi::funcs::GetBlas<CPUContext, T>(dev_ctx);
DenseTensor x_grad_indices = DenseTensor x_grad_indices =
phi::EmptyLike<int>(dev_ctx, x.non_zero_indices()); phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements()); DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
T* x_grad_values_ptr = x_grad_values.data<T>(); T* x_grad_values_ptr = x_grad_values.data<T>();
memset(x_grad_values_ptr, 0, sizeof(T) * x_grad_values.numel()); memset(x_grad_values_ptr, 0, sizeof(T) * x_grad_values.numel());
memset(d_x_features_ptr, 0, sizeof(T) * d_x_features.numel()); memset(d_x_features_ptr, 0, sizeof(T) * d_x_features.numel());
phi::Copy<Context>(dev_ctx, phi::Copy<CPUContext>(dev_ctx,
x.non_zero_indices(), x.non_zero_indices(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
false, false,
&x_grad_indices); &x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0); std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0);
for (int i = 0; i < rulebook_len; i++) { for (int i = 0; i < rulebook_len; i++) {
counter[rulebook_ptr[i]] += 1; counter[rulebook_ptr[i]] += 1;
} }
int offset = 0, max_count = 0; IntT offset = 0, max_count = 0;
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset; offsets[i] = offset;
offset += counter[i]; offset += counter[i];
...@@ -102,30 +102,31 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -102,30 +102,31 @@ void Conv3dGradKernel(const Context& dev_ctx,
offsets[kernel_size] = offset; offsets[kernel_size] = offset;
if (subm) { if (subm) {
phi::funcs::sparse::SubmPreProcess<T, Context>(dev_ctx, phi::funcs::sparse::SubmPreProcess<T, CPUContext>(
x, dev_ctx,
kernel, x,
out_grad.non_zero_elements(), kernel,
in_channels, out_grad.non_zero_elements(),
out_channels, in_channels,
half_kernel_size, out_channels,
kernel_grad, half_kernel_size,
&x_grad_values); kernel_grad,
&x_grad_values);
if (max_count == 0) { if (max_count == 0) {
return; return;
} }
} }
Gather<T>(x.non_zero_elements().data<T>(), Gather<T, IntT>(x.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len, rulebook_ptr + rulebook_len,
rulebook_len, rulebook_len,
in_channels, in_channels,
in_features_ptr); in_features_ptr);
Gather<T>(out_grad.non_zero_elements().data<T>(), Gather<T, IntT>(out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2, rulebook_ptr + rulebook_len * 2,
rulebook_len, rulebook_len,
out_channels, out_channels,
out_grad_features_ptr); out_grad_features_ptr);
const T* kernel_ptr = kernel.data<T>(); const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
...@@ -170,11 +171,41 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -170,11 +171,41 @@ void Conv3dGradKernel(const Context& dev_ctx,
} }
// 4. scatter // 4. scatter
Scatter<T>(d_x_features_ptr, Scatter<T, IntT>(d_x_features_ptr,
rulebook.data<int>() + rulebook_len, rulebook.data<IntT>() + rulebook_len,
rulebook_len, rulebook_len,
in_channels, in_channels,
x_grad_values_ptr); x_grad_values_ptr);
}
template <typename T, typename Context>
void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] {
Conv3dGradCPUKernel<T, data_t>(dev_ctx,
x,
kernel,
rulebook,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
x_grad,
kernel_grad);
}));
} }
} // namespace sparse } // namespace sparse
......
...@@ -17,6 +17,8 @@ limitations under the License. */ ...@@ -17,6 +17,8 @@ limitations under the License. */
#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h" #include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -25,17 +27,17 @@ namespace sparse { ...@@ -25,17 +27,17 @@ namespace sparse {
* kernel: (D, H, W, C, OC) * kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC) * out: (N, D, H, W, OC)
**/ **/
template <typename T, typename Context> template <typename T, typename IntT = int>
void Conv3dKernel(const Context& dev_ctx, void Conv3dCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& kernel, const DenseTensor& kernel,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
const int groups, const int groups,
const bool subm, const bool subm,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook) { DenseTensor* rulebook) {
// update padding and dilation // update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1 // Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight) // if x.layout != NDHWC then transpose(x), transpose(weight)
...@@ -66,18 +68,18 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -66,18 +68,18 @@ void Conv3dKernel(const Context& dev_ctx,
DataType::INT32, {kernel_size}, DataLayout::NCHW); DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
ProductRuleBook<T, Context>(dev_ctx, ProductRuleBook<T, CPUContext, IntT>(dev_ctx,
x, x,
kernel_sizes, kernel_sizes,
subm_paddings, subm_paddings,
dilations, dilations,
subm_strides, subm_strides,
out_dims, out_dims,
subm, subm,
rulebook, rulebook,
&counter_per_kernel); &counter_per_kernel);
UpdateRulebookAndOutIndex<T>( UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out); dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out);
int n = rulebook->dims()[1]; int n = rulebook->dims()[1];
...@@ -95,14 +97,14 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -95,14 +97,14 @@ void Conv3dKernel(const Context& dev_ctx,
T* in_features_ptr = in_features.data<T>(); T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>(); T* out_features_ptr = out_features.data<T>();
Gather<T>(x.non_zero_elements().data<T>(), Gather<T, IntT>(x.non_zero_elements().data<T>(),
rulebook->data<int>() + n, rulebook->data<IntT>() + n,
n, n,
in_channels, in_channels,
in_features_ptr); in_features_ptr);
// 3. call gemm for every werght // 3. call gemm for every werght
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx); auto blas = phi::funcs::GetBlas<CPUContext, T>(dev_ctx);
std::vector<int> offsets(kernel_size + 1); std::vector<int> offsets(kernel_size + 1);
int offset = 0; int offset = 0;
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
...@@ -139,11 +141,37 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -139,11 +141,37 @@ void Conv3dKernel(const Context& dev_ctx,
// 4. scatter // 4. scatter
T* out_values_ptr = out->mutable_non_zero_elements()->data<T>(); T* out_values_ptr = out->mutable_non_zero_elements()->data<T>();
memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels); memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels);
Scatter<T>(out_features_ptr, Scatter<T, IntT>(out_features_ptr,
rulebook->data<int>() + n * 2, rulebook->data<IntT>() + n * 2,
n, n,
out_channels, out_channels,
out_values_ptr); out_values_ptr);
}
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] {
Conv3dCPUKernel<T, data_t>(dev_ctx,
x,
kernel,
paddings,
dilations,
strides,
groups,
subm,
out,
rulebook);
}));
} }
} // namespace sparse } // namespace sparse
......
...@@ -14,24 +14,28 @@ limitations under the License. */ ...@@ -14,24 +14,28 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
template <typename T, typename Context> template <typename T, typename IntT = int>
void MaxPoolGradKernel(const Context& dev_ctx, void MaxPoolGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const SparseCooTensor& out, const SparseCooTensor& out,
const DenseTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
DenseTensor* x_grad) { SparseCooTensor* x_grad) {
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const int channels = x.dims()[4]; const int channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1]; int rulebook_len = rulebook.dims()[1];
const int* rulebook_ptr = rulebook.data<int>(); const IntT* rulebook_ptr = rulebook.data<IntT>();
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0); std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0);
for (int i = 0; i < rulebook_len; i++) { for (int i = 0; i < rulebook_len; i++) {
counter[rulebook_ptr[i]] += 1; counter[rulebook_ptr[i]] += 1;
...@@ -40,15 +44,25 @@ void MaxPoolGradKernel(const Context& dev_ctx, ...@@ -40,15 +44,25 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const T* in_features_ptr = x.non_zero_elements().data<T>(); const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>(); const T* out_features_ptr = out.non_zero_elements().data<T>();
const T* out_grad_ptr = out_grad.data<T>(); const T* out_grad_ptr = out_grad.non_zero_elements().data<T>();
T* x_grad_ptr = x_grad->data<T>(); // TODO(zhangkaihuo): call phi::sparse::EmptyLike
DenseTensor x_grad_indices =
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
T* x_grad_ptr = x_grad_values.data<T>();
memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel()); memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel());
phi::Copy<CPUContext>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
phi::funcs::MaxPoolGrad<T> grad_functor; phi::funcs::MaxPoolGrad<T> grad_functor;
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
for (int j = 0; j < counter[i]; j++) { for (int j = 0; j < counter[i]; j++) {
int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
for (int c = 0; c < channels; c++) { for (int c = 0; c < channels; c++) {
grad_functor.compute(in_features_ptr[in_i * channels + c], grad_functor.compute(in_features_ptr[in_i * channels + c],
out_features_ptr[out_i * channels + c], out_features_ptr[out_i * channels + c],
...@@ -60,6 +74,21 @@ void MaxPoolGradKernel(const Context& dev_ctx, ...@@ -60,6 +74,21 @@ void MaxPoolGradKernel(const Context& dev_ctx,
} }
} }
template <typename T, typename Context>
void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
SparseCooTensor* x_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] {
MaxPoolGradCPUKernel<T, data_t>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad);
}));
}
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
......
...@@ -19,6 +19,8 @@ limitations under the License. */ ...@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h" #include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -27,15 +29,15 @@ namespace sparse { ...@@ -27,15 +29,15 @@ namespace sparse {
* kernel: (D, H, W, C, OC) * kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC) * out: (N, D, H, W, OC)
**/ **/
template <typename T, typename Context> template <typename T, typename IntT = int>
void MaxPoolKernel(const Context& dev_ctx, void MaxPoolCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook) { DenseTensor* rulebook) {
const auto& x_dims = x.dims(); const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes = const std::vector<int>& real_kernel_sizes =
...@@ -51,22 +53,22 @@ void MaxPoolKernel(const Context& dev_ctx, ...@@ -51,22 +53,22 @@ void MaxPoolKernel(const Context& dev_ctx,
const T* in_features_ptr = x.non_zero_elements().data<T>(); const T* in_features_ptr = x.non_zero_elements().data<T>();
// 1. product rule book // 1. product rule book
ProductRuleBook<T, Context>(dev_ctx, ProductRuleBook<T, CPUContext, IntT>(dev_ctx,
x, x,
real_kernel_sizes, real_kernel_sizes,
paddings, paddings,
dilations, dilations,
strides, strides,
out_dims, out_dims,
false, false,
rulebook, rulebook,
&counter_per_kernel); &counter_per_kernel);
UpdateRulebookAndOutIndex<T>( UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out); dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out);
int rulebook_len = rulebook->dims()[1]; int rulebook_len = rulebook->dims()[1];
const int* rulebook_ptr = rulebook->data<int>(); const IntT* rulebook_ptr = rulebook->data<IntT>();
const int* counter_ptr = counter_per_kernel.data<int>(); const int* counter_ptr = counter_per_kernel.data<int>();
std::vector<int> offsets(kernel_size + 1); std::vector<int> offsets(kernel_size + 1);
...@@ -78,8 +80,8 @@ void MaxPoolKernel(const Context& dev_ctx, ...@@ -78,8 +80,8 @@ void MaxPoolKernel(const Context& dev_ctx,
phi::funcs::MaxPool<T> max_pool_functor; phi::funcs::MaxPool<T> max_pool_functor;
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
for (int j = 0; j < counter_ptr[i]; j++) { for (int j = 0; j < counter_ptr[i]; j++) {
int in_i = rulebook_ptr[rulebook_len + offsets[i] + j]; IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j]; IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
if (!out_flags[out_i]) { if (!out_flags[out_i]) {
out_flags[out_i] = true; out_flags[out_i] = true;
memcpy(&out_features_ptr[out_i * in_channels], memcpy(&out_features_ptr[out_i * in_channels],
...@@ -95,6 +97,28 @@ void MaxPoolKernel(const Context& dev_ctx, ...@@ -95,6 +97,28 @@ void MaxPoolKernel(const Context& dev_ctx,
} }
} }
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] {
MaxPoolCPUKernel<T, data_t>(dev_ctx,
x,
kernel_sizes,
paddings,
dilations,
strides,
out,
rulebook);
}));
}
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
......
...@@ -98,21 +98,21 @@ __global__ void ScatterKernel(const T* input, ...@@ -98,21 +98,21 @@ __global__ void ScatterKernel(const T* input,
} }
} }
template <typename Context> template <typename Context, typename IntT = int>
inline int* SortedAndUniqueIndex(const Context& dev_ctx, inline IntT* SortedAndUniqueIndex(const Context& dev_ctx,
const int* rulebook_ptr, const IntT* rulebook_ptr,
const int len, const int len,
DenseTensor* out_index, DenseTensor* out_index,
DenseTensor* unique_key, DenseTensor* unique_key,
DenseTensor* unique_value) { DenseTensor* unique_value) {
phi::IndexKernel<int, kps::IdentityFunctor<int>>( phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, out_index, kps::IdentityFunctor<int>()); dev_ctx, out_index, kps::IdentityFunctor<int>());
phi::IndexKernel<int, kps::IdentityFunctor<int>>( phi::IndexKernel<int, kps::IdentityFunctor<int>>(
dev_ctx, unique_value, kps::IdentityFunctor<int>()); dev_ctx, unique_value, kps::IdentityFunctor<int>());
phi::backends::gpu::GpuMemcpyAsync(unique_key->data<int>(), phi::backends::gpu::GpuMemcpyAsync(unique_key->data<IntT>(),
rulebook_ptr, rulebook_ptr,
sizeof(int) * len, sizeof(IntT) * len,
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToDevice, hipMemcpyDeviceToDevice,
#else #else
...@@ -126,19 +126,19 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx, ...@@ -126,19 +126,19 @@ inline int* SortedAndUniqueIndex(const Context& dev_ctx,
#else #else
thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()), thrust::sort_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif #endif
unique_key->data<int>(), unique_key->data<IntT>(),
unique_key->data<int>() + len, unique_key->data<IntT>() + len,
out_index->data<int>()); out_index->data<int>());
// 4. unique // 4. unique
thrust::pair<int*, int*> new_end = thrust::pair<IntT*, int*> new_end =
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()), thrust::unique_by_key(thrust::hip::par.on(dev_ctx.stream()),
#else #else
thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()), thrust::unique_by_key(thrust::cuda::par.on(dev_ctx.stream()),
#endif #endif
unique_key->data<int>(), unique_key->data<IntT>(),
unique_key->data<int>() + len, unique_key->data<IntT>() + len,
unique_value->data<int>()); unique_value->data<int>());
return new_end.first; return new_end.first;
} }
...@@ -159,7 +159,7 @@ __global__ void SetFlagAndUpdateCounterKernel(const int* indexs, ...@@ -159,7 +159,7 @@ __global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
for (int i = tid; i < n; i += gridDim.x * blockDim.x) { for (int i = tid; i < n; i += gridDim.x * blockDim.x) {
int index = indexs[i]; int index = indexs[i];
int kernel_index = rulebook_ptr[index]; T kernel_index = rulebook_ptr[index];
rulebook_ptr[index + rulebook_len] = -1; rulebook_ptr[index + rulebook_len] = -1;
rulebook_ptr[index + 2 * rulebook_len] = -1; rulebook_ptr[index + 2 * rulebook_len] = -1;
rulebook_ptr[index] = -1; rulebook_ptr[index] = -1;
...@@ -183,18 +183,18 @@ __global__ void SetFlagAndUpdateCounterKernel(const int* indexs, ...@@ -183,18 +183,18 @@ __global__ void SetFlagAndUpdateCounterKernel(const int* indexs,
* rulebook_out_indexs: the output index in rulebook * rulebook_out_indexs: the output index in rulebook
**/ **/
template <typename T> template <typename T>
__global__ void UpdateIndexKernel(const int* unique_keys, __global__ void UpdateIndexKernel(const T* unique_keys,
const int* unique_values, const int* unique_values,
const int* out_indexs, const int* out_indexs,
const int non_zero_num, const int64_t non_zero_num,
const int rulebook_len, const int rulebook_len,
const Dims4D out_dims, const Dims4D out_dims,
T* out_indices, T* out_indices,
T* rulebook_out_indexs) { T* rulebook_out_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x + blockIdx.x * blockDim.x;
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
const int index = unique_keys[i]; const T index = unique_keys[i];
int batch, x, y, z; T batch, x, y, z;
phi::funcs::sparse::IndexToPoint<Dims4D>( phi::funcs::sparse::IndexToPoint<Dims4D>(
index, out_dims, &batch, &x, &y, &z); index, out_dims, &batch, &x, &y, &z);
// get out indices // get out indices
...@@ -207,7 +207,7 @@ __global__ void UpdateIndexKernel(const int* unique_keys, ...@@ -207,7 +207,7 @@ __global__ void UpdateIndexKernel(const int* unique_keys,
int start = unique_values[i]; int start = unique_values[i];
int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1]; int end = i == non_zero_num - 1 ? rulebook_len : unique_values[i + 1];
// max(end-start) = kernel_size // max(end-start) = kernel_size
for (int j = start; j < end; j++) { for (T j = start; j < end; j++) {
rulebook_out_indexs[out_indexs[j]] = i; rulebook_out_indexs[out_indexs[j]] = i;
} }
} }
...@@ -215,7 +215,7 @@ __global__ void UpdateIndexKernel(const int* unique_keys, ...@@ -215,7 +215,7 @@ __global__ void UpdateIndexKernel(const int* unique_keys,
// brief: calculation the distance between start and end // brief: calculation the distance between start and end
template <typename T> template <typename T>
__global__ void DistanceKernel(const T* start, const T* end, int* distance) { __global__ void DistanceKernel(const T* start, const T* end, T* distance) {
if (threadIdx.x == 0) { if (threadIdx.x == 0) {
*distance = end - start; *distance = end - start;
} }
...@@ -249,7 +249,7 @@ __global__ void ProductRuleBookKernel(const T* x_indices, ...@@ -249,7 +249,7 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
const bool subm, const bool subm,
T* rulebook, T* rulebook,
int* counter, int* counter,
int* in_indexs) { T* in_indexs) {
int tid = threadIdx.x + blockIdx.x * blockDim.x; int tid = threadIdx.x + blockIdx.x * blockDim.x;
extern __shared__ int counter_buf[]; // kernel_size extern __shared__ int counter_buf[]; // kernel_size
const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1]; const int kernel_size = kernel_dims[3] * kernel_dims[2] * kernel_dims[1];
...@@ -261,10 +261,10 @@ __global__ void ProductRuleBookKernel(const T* x_indices, ...@@ -261,10 +261,10 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) { for (int i = tid; i < non_zero_num; i += gridDim.x * blockDim.x) {
int kernel_index = 0; int kernel_index = 0;
int batch = x_indices[i]; T batch = x_indices[i];
int in_z = x_indices[i + non_zero_num]; T in_z = x_indices[i + non_zero_num];
int in_y = x_indices[i + 2 * non_zero_num]; T in_y = x_indices[i + 2 * non_zero_num];
int in_x = x_indices[i + 3 * non_zero_num]; T in_x = x_indices[i + 3 * non_zero_num];
if (subm) { if (subm) {
in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims); in_indexs[i] = PointToIndex(batch, in_x, in_y, in_z, x_dims);
} }
...@@ -283,9 +283,9 @@ __global__ void ProductRuleBookKernel(const T* x_indices, ...@@ -283,9 +283,9 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
kx, kx,
ky, ky,
kz)) { kz)) {
int out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1]; T out_z = (in_z + paddings[1] - kz * dilations[1]) / strides[1];
int out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2]; T out_y = (in_y + paddings[2] - ky * dilations[2]) / strides[2];
int out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3]; T out_x = (in_x + paddings[3] - kx * dilations[3]) / strides[3];
in_i = i; in_i = i;
out_index = phi::funcs::sparse::PointToIndex<Dims4D>( out_index = phi::funcs::sparse::PointToIndex<Dims4D>(
batch, out_x, out_y, out_z, out_dims); batch, out_x, out_y, out_z, out_dims);
...@@ -321,7 +321,7 @@ __global__ void ProductRuleBookKernel(const T* x_indices, ...@@ -321,7 +321,7 @@ __global__ void ProductRuleBookKernel(const T* x_indices,
// 5. update the out_index by unique_key, uniqe_value and the index of // 5. update the out_index by unique_key, uniqe_value and the index of
// unique_value: // unique_value:
// the new out_index: 0, 2, 3, 2, 3, 0, 1 // the new out_index: 0, 2, 3, 2, 3, 0, 1
template <typename T, typename Context> template <typename T, typename Context, typename IntT = int>
int ProductRuleBook(const Context& dev_ctx, int ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
...@@ -334,26 +334,26 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -334,26 +334,26 @@ int ProductRuleBook(const Context& dev_ctx,
DenseTensor* counter_per_kernel, DenseTensor* counter_per_kernel,
DenseTensor* offsets_per_kernel, DenseTensor* offsets_per_kernel,
DenseTensor* out_index, DenseTensor* out_index,
DenseTensor* unique_key,
DenseTensor* unique_value, DenseTensor* unique_value,
SparseCooTensor* out, SparseCooTensor* out,
std::vector<int>* h_counter, std::vector<int>* h_counter,
std::vector<int>* h_offsets) { std::vector<int>* h_offsets) {
// TODO(zhangkaihuo): use PD_DISPATCH_INTEGRAL_TYPES for secondary dispatch
auto indices_dtype = paddle::experimental::CppTypeToDataType<IntT>::Type();
const int64_t non_zero_num = x.nnz(); const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices(); const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>(); const IntT* indices_ptr = non_zero_indices.data<IntT>();
DenseTensor in_indexs = phi::Empty<Context>( DenseTensor in_indexs = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); dev_ctx, DenseTensorMeta(indices_dtype, {x.nnz()}, DataLayout::NCHW));
int* counter_ptr = counter_per_kernel->data<int>(); int* counter_ptr = counter_per_kernel->data<int>();
int* offsets_ptr = offsets_per_kernel->data<int>(); int* offsets_ptr = offsets_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const int rulebook_rows = 3; const int rulebook_rows = 3;
const int rulebook_cols = kernel_size * non_zero_num; const int rulebook_cols = kernel_size * non_zero_num;
DenseTensorMeta rulebook_meta( DenseTensorMeta rulebook_meta(
DataType::INT32, {rulebook_rows, rulebook_cols}, DataLayout::NCHW); indices_dtype, {rulebook_rows, rulebook_cols}, DataLayout::NCHW);
rulebook->set_meta(rulebook_meta); *rulebook = phi::Empty(dev_ctx, std::move(rulebook_meta));
dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int)); IntT* rulebook_ptr = rulebook->data<IntT>();
int* rulebook_ptr = rulebook->data<int>();
const auto x_dims = x.dims(); const auto x_dims = x.dims();
Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]); Dims4D d_x_dims(x_dims[0], x_dims[3], x_dims[2], x_dims[1]);
...@@ -369,39 +369,39 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -369,39 +369,39 @@ int ProductRuleBook(const Context& dev_ctx,
auto config = auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1); phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, non_zero_num, 1);
ProductRuleBookKernel<int><<<config.block_per_grid.x, ProductRuleBookKernel<IntT><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
kernel_size * sizeof(int), kernel_size * sizeof(int),
dev_ctx.stream()>>>(indices_ptr, dev_ctx.stream()>>>(indices_ptr,
d_x_dims, d_x_dims,
d_kernel_dims, d_kernel_dims,
d_out_dims, d_out_dims,
non_zero_num, non_zero_num,
d_paddings, d_paddings,
d_dilations, d_dilations,
d_strides, d_strides,
subm, subm,
rulebook_ptr, rulebook_ptr,
counter_ptr, counter_ptr,
in_indexs.data<int>()); in_indexs.data<IntT>());
// 2. remove -1 // 2. remove -1
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else #else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif #endif
rulebook_ptr, rulebook_ptr,
rulebook_ptr + rulebook_rows * rulebook_cols, rulebook_ptr + rulebook_rows * rulebook_cols,
-1); -1);
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>( DistanceKernel<IntT><<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1); rulebook_ptr, last, rulebook_ptr + 3 * kernel_size * non_zero_num - 1);
int rulebook_len = 0; IntT rulebook_len = 0;
phi::backends::gpu::GpuMemcpyAsync( phi::backends::gpu::GpuMemcpyAsync(
&rulebook_len, &rulebook_len,
rulebook_ptr + 3 * kernel_size * non_zero_num - 1, rulebook_ptr + 3 * kernel_size * non_zero_num - 1,
sizeof(int), sizeof(IntT),
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost, hipMemcpyDeviceToHost,
#else #else
...@@ -418,11 +418,10 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -418,11 +418,10 @@ int ProductRuleBook(const Context& dev_ctx,
// and then the intermediate output index is subtracted from the input index // and then the intermediate output index is subtracted from the input index
// to obain the rulebook. // to obain the rulebook.
// get difference // get difference
int32_t* A_key_ptr = rulebook_ptr + 2 * rulebook_len; IntT* A_key_ptr = rulebook_ptr + 2 * rulebook_len;
int32_t* B_key_ptr = in_indexs.data<int>(); IntT* B_key_ptr = in_indexs.data<IntT>();
DenseTensor A_val = phi::Empty<Context>( DenseTensorMeta val_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW);
dev_ctx, DenseTensor A_val = phi::Empty<Context>(dev_ctx, std::move(val_meta));
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
DenseTensor B_val = phi::Empty<Context>( DenseTensor B_val = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW)); dev_ctx, DenseTensorMeta(DataType::INT32, {x.nnz()}, DataLayout::NCHW));
phi::IndexKernel<int, kps::IdentityFunctor<int>>( phi::IndexKernel<int, kps::IdentityFunctor<int>>(
...@@ -431,10 +430,8 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -431,10 +430,8 @@ int ProductRuleBook(const Context& dev_ctx,
dev_ctx, &B_val, kps::IdentityFunctor<int>()); dev_ctx, &B_val, kps::IdentityFunctor<int>());
DenseTensor key_result = phi::Empty<Context>( DenseTensor key_result = phi::Empty<Context>(
dev_ctx, dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len + 1}, DataLayout::NCHW)); DenseTensorMeta(indices_dtype, {rulebook_len + 1}, DataLayout::NCHW));
DenseTensor val_result = phi::Empty<Context>( DenseTensor val_result = phi::Empty<Context>(dev_ctx, std::move(val_meta));
dev_ctx,
DenseTensorMeta(DataType::INT32, {rulebook_len}, DataLayout::NCHW));
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()), thrust::exclusive_scan(thrust::hip::par.on(dev_ctx.stream()),
...@@ -457,7 +454,7 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -457,7 +454,7 @@ int ProductRuleBook(const Context& dev_ctx,
dev_ctx.stream()); dev_ctx.stream());
dev_ctx.Wait(); dev_ctx.Wait();
thrust::pair<int*, int*> end; thrust::pair<IntT*, int*> end;
// Because set_diff does not support duplicate data, set_diff is performed // Because set_diff does not support duplicate data, set_diff is performed
// separately for each segment of data. // separately for each segment of data.
// TODO(zhangkaihuo): Using hashtable here may get better performance, // TODO(zhangkaihuo): Using hashtable here may get better performance,
...@@ -465,7 +462,7 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -465,7 +462,7 @@ int ProductRuleBook(const Context& dev_ctx,
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
int start = offsets[i]; int start = offsets[i];
int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1]; int stop = i == kernel_size - 1 ? rulebook_len : offsets[i + 1];
int* key_result_start = (i == 0 ? key_result.data<int>() : end.first); IntT* key_result_start = (i == 0 ? key_result.data<IntT>() : end.first);
int* val_result_start = i == 0 ? val_result.data<int>() : end.second; int* val_result_start = i == 0 ? val_result.data<int>() : end.second;
end = end =
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
...@@ -483,14 +480,14 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -483,14 +480,14 @@ int ProductRuleBook(const Context& dev_ctx,
val_result_start); val_result_start);
} }
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>( DistanceKernel<IntT><<<1, 1, 0, dev_ctx.stream()>>>(
key_result.data<int>(), key_result.data<IntT>(),
end.first, end.first,
key_result.data<int>() + rulebook_len); key_result.data<IntT>() + rulebook_len);
int len = 0; IntT len = 0;
phi::backends::gpu::GpuMemcpyAsync(&len, phi::backends::gpu::GpuMemcpyAsync(&len,
key_result.data<int>() + rulebook_len, key_result.data<IntT>() + rulebook_len,
sizeof(int), sizeof(IntT),
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost, hipMemcpyDeviceToHost,
#else #else
...@@ -500,10 +497,10 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -500,10 +497,10 @@ int ProductRuleBook(const Context& dev_ctx,
dev_ctx.Wait(); dev_ctx.Wait();
// set the diff value = -1, and update counter // set the diff value = -1, and update counter
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1); auto config = phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, len, 1);
SetFlagAndUpdateCounterKernel<int><<<config.block_per_grid.x, SetFlagAndUpdateCounterKernel<IntT><<<config.block_per_grid.x,
config.thread_per_block, config.thread_per_block,
kernel_size * sizeof(int), kernel_size * sizeof(int),
dev_ctx.stream()>>>( dev_ctx.stream()>>>(
val_result.data<int>(), val_result.data<int>(),
len, len,
rulebook_len, rulebook_len,
...@@ -512,18 +509,18 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -512,18 +509,18 @@ int ProductRuleBook(const Context& dev_ctx,
counter_ptr); counter_ptr);
// remove -1 // remove -1
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
int* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()), IntT* last = thrust::remove(thrust::hip::par.on(dev_ctx.stream()),
#else #else
int* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()), IntT* last = thrust::remove(thrust::cuda::par.on(dev_ctx.stream()),
#endif #endif
rulebook_ptr, rulebook_ptr,
rulebook_ptr + 3 * rulebook_len, rulebook_ptr + 3 * rulebook_len,
-1); -1);
DistanceKernel<int><<<1, 1, 0, dev_ctx.stream()>>>( DistanceKernel<IntT><<<1, 1, 0, dev_ctx.stream()>>>(
rulebook_ptr, last, key_result.data<int>() + rulebook_len); rulebook_ptr, last, key_result.data<IntT>() + rulebook_len);
phi::backends::gpu::GpuMemcpyAsync(&rulebook_len, phi::backends::gpu::GpuMemcpyAsync(&rulebook_len,
key_result.data<int>() + rulebook_len, key_result.data<IntT>() + rulebook_len,
sizeof(int), sizeof(IntT),
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost, hipMemcpyDeviceToHost,
#else #else
...@@ -566,42 +563,47 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -566,42 +563,47 @@ int ProductRuleBook(const Context& dev_ctx,
cudaMemcpyDeviceToHost, cudaMemcpyDeviceToHost,
dev_ctx.stream()); dev_ctx.stream());
#endif #endif
rulebook->Resize({rulebook_rows, rulebook_len}); rulebook->Resize({rulebook_rows, static_cast<int>(rulebook_len)});
// 3. sorted or merge the out index // 3. sorted or merge the out index
out_index->ResizeAndAllocate({rulebook_len}); out_index->ResizeAndAllocate({static_cast<int>(rulebook_len)});
unique_value->ResizeAndAllocate({rulebook_len}); unique_value->ResizeAndAllocate({static_cast<int>(rulebook_len)});
unique_key->ResizeAndAllocate({rulebook_len}); DenseTensor unique_key = phi::Empty(
dev_ctx,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<IntT>::Type(),
{static_cast<int>(rulebook_len)},
DataLayout::NCHW));
int* out_index_ptr = out_index->data<int>(); int* out_index_ptr = out_index->data<int>();
int* unique_value_ptr = unique_value->data<int>(); int* unique_value_ptr = unique_value->data<int>();
int* unique_key_ptr = unique_key->data<int>(); IntT* unique_key_ptr = unique_key.data<IntT>();
int* new_end = SortedAndUniqueIndex(dev_ctx, IntT* new_end =
rulebook_ptr + 2 * rulebook_len, SortedAndUniqueIndex<Context, IntT>(dev_ctx,
rulebook_len, rulebook_ptr + 2 * rulebook_len,
out_index, rulebook_len,
unique_key, out_index,
unique_value); &unique_key,
unique_value);
// thrust::distance doesn't support stream parameters // thrust::distance doesn't support stream parameters
// const int out_non_zero_num = thrust::distance(unique_key_ptr, // const int out_non_zero_num = thrust::distance(unique_key_ptr,
// new_end.first); // new_end.first);
DistanceKernel<int><<<1, 1>>>( DistanceKernel<IntT><<<1, 1>>>(
unique_key_ptr, unique_key_ptr,
new_end, new_end,
rulebook_ptr + rulebook_rows * rulebook_cols - 1); rulebook_ptr + rulebook_rows * rulebook_cols - 1);
int out_non_zero_num = 0; IntT out_non_zero_num = 0;
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
phi::backends::gpu::GpuMemcpyAsync( phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num, &out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1, rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int), sizeof(IntT),
hipMemcpyDeviceToHost, hipMemcpyDeviceToHost,
dev_ctx.stream()); dev_ctx.stream());
#else #else
phi::backends::gpu::GpuMemcpyAsync( phi::backends::gpu::GpuMemcpyAsync(
&out_non_zero_num, &out_non_zero_num,
rulebook_ptr + rulebook_rows * rulebook_cols - 1, rulebook_ptr + rulebook_rows * rulebook_cols - 1,
sizeof(int), sizeof(IntT),
cudaMemcpyDeviceToHost, cudaMemcpyDeviceToHost,
dev_ctx.stream()); dev_ctx.stream());
#endif #endif
...@@ -610,28 +612,29 @@ int ProductRuleBook(const Context& dev_ctx, ...@@ -610,28 +612,29 @@ int ProductRuleBook(const Context& dev_ctx,
// 5. update out_indices and rulebook by unique_value_ptr // 5. update out_indices and rulebook by unique_value_ptr
const int64_t sparse_dim = 4; const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta( DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW); indices_dtype, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
DenseTensorMeta values_meta(x.dtype(), DenseTensorMeta values_meta(x.dtype(),
{out_non_zero_num, kernel_sizes[4]}, {out_non_zero_num, kernel_sizes[4]},
x.non_zero_elements().layout()); x.non_zero_elements().layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta)); phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta)); phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
int* out_indices_ptr = out_indices.data<int>(); IntT* out_indices_ptr = out_indices.data<IntT>();
config = config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1); phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_non_zero_num, 1);
UpdateIndexKernel<int><<<config.block_per_grid.x, UpdateIndexKernel<IntT><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>(unique_key_ptr, dev_ctx.stream()>>>(
unique_value_ptr, unique_key_ptr,
out_index_ptr, unique_value_ptr,
out_non_zero_num, out_index_ptr,
rulebook_len, out_non_zero_num,
d_out_dims, rulebook_len,
out_indices_ptr, d_out_dims,
rulebook_ptr + 2 * rulebook_len); out_indices_ptr,
rulebook_ptr + 2 * rulebook_len);
out->SetMember(out_indices, out_values, out_dims, true); out->SetMember(out_indices, out_values, out_dims, true);
return rulebook_len; return rulebook_len;
} }
......
...@@ -24,6 +24,8 @@ limitations under the License. */ ...@@ -24,6 +24,8 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h" #include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
...@@ -35,24 +37,24 @@ namespace sparse { ...@@ -35,24 +37,24 @@ namespace sparse {
//] //]
// x_grad = out_grad * transpose(kenrel) // x_grad = out_grad * transpose(kenrel)
// kernel_grad = transpose(x) * out_grad // kernel_grad = transpose(x) * out_grad
template <typename T, typename Context> template <typename T, typename IntT>
void Conv3dGradKernel(const Context& dev_ctx, void Conv3dGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& kernel, const DenseTensor& kernel,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const SparseCooTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
const int groups, const int groups,
const bool subm, const bool subm,
SparseCooTensor* x_grad, SparseCooTensor* x_grad,
DenseTensor* kernel_grad) { DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims(); const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3]; const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4]; const int out_channels = kernel_dims[4];
const int* rulebook_ptr = rulebook.data<int>(); const IntT* rulebook_ptr = rulebook.data<IntT>();
const int rulebook_len = rulebook.dims()[1]; const int rulebook_len = rulebook.dims()[1];
...@@ -74,29 +76,29 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -74,29 +76,29 @@ void Conv3dGradKernel(const Context& dev_ctx,
T* out_grad_features_ptr = out_grad_features.data<T>(); T* out_grad_features_ptr = out_grad_features.data<T>();
*kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel); *kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel);
T* d_kernel_ptr = kernel_grad->data<T>(); T* d_kernel_ptr = kernel_grad->data<T>();
phi::funcs::SetConstant<Context, T> set_zero; phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, kernel_grad, static_cast<T>(0.0f)); set_zero(dev_ctx, kernel_grad, static_cast<T>(0.0f));
int half_kernel_size = kernel_size / 2; int half_kernel_size = kernel_size / 2;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx); auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
DenseTensor x_grad_indices = DenseTensor x_grad_indices =
phi::EmptyLike<int>(dev_ctx, x.non_zero_indices()); phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements()); DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
T* x_grad_values_ptr = x_grad_values.data<T>(); T* x_grad_values_ptr = x_grad_values.data<T>();
set_zero(dev_ctx, &x_grad_values, static_cast<T>(0.0f)); set_zero(dev_ctx, &x_grad_values, static_cast<T>(0.0f));
set_zero(dev_ctx, &d_x_features, static_cast<T>(0.0f)); set_zero(dev_ctx, &d_x_features, static_cast<T>(0.0f));
phi::Copy<Context>(dev_ctx, phi::Copy<GPUContext>(dev_ctx,
x.non_zero_indices(), x.non_zero_indices(),
dev_ctx.GetPlace(), dev_ctx.GetPlace(),
false, false,
&x_grad_indices); &x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true); x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0), std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0),
h_counter(rulebook_len, 0); h_counter(rulebook_len, 0);
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
rulebook_ptr, rulebook_ptr,
rulebook_len * sizeof(int), rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost, hipMemcpyDeviceToHost,
#else #else
...@@ -109,7 +111,7 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -109,7 +111,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
for (int i = 0; i < rulebook_len; i++) { for (int i = 0; i < rulebook_len; i++) {
counter[h_counter[i]] += 1; counter[h_counter[i]] += 1;
} }
int offset = 0, max_count = 0; IntT offset = 0, max_count = 0;
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset; offsets[i] = offset;
offset += counter[i]; offset += counter[i];
...@@ -120,15 +122,16 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -120,15 +122,16 @@ void Conv3dGradKernel(const Context& dev_ctx,
offsets[kernel_size] = offset; offsets[kernel_size] = offset;
if (subm) { if (subm) {
phi::funcs::sparse::SubmPreProcess<T, Context>(dev_ctx, phi::funcs::sparse::SubmPreProcess<T, GPUContext>(
x, dev_ctx,
kernel, x,
out_grad.non_zero_elements(), kernel,
in_channels, out_grad.non_zero_elements(),
out_channels, in_channels,
half_kernel_size, out_channels,
kernel_grad, half_kernel_size,
&x_grad_values); kernel_grad,
&x_grad_values);
if (max_count == 0) { if (max_count == 0) {
return; return;
} }
...@@ -136,21 +139,21 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -136,21 +139,21 @@ void Conv3dGradKernel(const Context& dev_ctx,
auto config = phi::backends::gpu::GetGpuLaunchConfig1D( auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * in_channels, 1); dev_ctx, rulebook_len * in_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x, GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(), dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len, rulebook_ptr + rulebook_len,
in_features_ptr, in_features_ptr,
rulebook_len, rulebook_len,
in_channels); in_channels);
config = phi::backends::gpu::GetGpuLaunchConfig1D( config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * out_channels, 1); dev_ctx, rulebook_len * out_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x, GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>( dev_ctx.stream()>>>(
out_grad.non_zero_elements().data<T>(), out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2, rulebook_ptr + rulebook_len * 2,
out_grad_features_ptr, out_grad_features_ptr,
...@@ -203,15 +206,19 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -203,15 +206,19 @@ void Conv3dGradKernel(const Context& dev_ctx,
// x_grad->ResizeAndAllocate(x.non_zero_elements().dims()); // x_grad->ResizeAndAllocate(x.non_zero_elements().dims());
DenseTensorMeta index_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW); DenseTensorMeta index_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW);
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_key = phi::Empty(
dev_ctx,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<IntT>::Type(),
{rulebook_len},
DataLayout::NCHW));
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
SortedAndUniqueIndex(dev_ctx, SortedAndUniqueIndex<GPUContext, IntT>(dev_ctx,
rulebook_ptr + rulebook_len, rulebook_ptr + rulebook_len,
rulebook_len, rulebook_len,
&out_index, &out_index,
&unique_key, &unique_key,
&unique_value); &unique_value);
config = phi::backends::gpu::GetGpuLaunchConfig1D( config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * in_channels, 1); dev_ctx, rulebook_len * in_channels, 1);
...@@ -229,6 +236,36 @@ void Conv3dGradKernel(const Context& dev_ctx, ...@@ -229,6 +236,36 @@ void Conv3dGradKernel(const Context& dev_ctx,
subm); subm);
} }
template <typename T, typename Context>
void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dGradGPUKernel", ([&] {
Conv3dGradGPUKernel<T, data_t>(dev_ctx,
x,
kernel,
rulebook,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
x_grad,
kernel_grad);
}));
}
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
......
...@@ -19,29 +19,25 @@ limitations under the License. */ ...@@ -19,29 +19,25 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_kernel.h" #include "paddle/phi/kernels/sparse/convolution_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
/** template <typename T, typename IntT>
* x: (N, D, H, W, C) void Conv3dGPUKernel(const GPUContext& dev_ctx,
* kernel: (D, H, W, C, OC) const SparseCooTensor& x,
* out: (N, D, H, W, OC) const DenseTensor& kernel,
**/ const std::vector<int>& paddings,
template <typename T, typename Context> const std::vector<int>& dilations,
void Conv3dKernel(const Context& dev_ctx, const std::vector<int>& strides,
const SparseCooTensor& x, const int groups,
const DenseTensor& kernel, const bool subm,
const std::vector<int>& paddings, SparseCooTensor* out,
const std::vector<int>& dilations, DenseTensor* rulebook) {
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
// update padding and dilation // update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1 // Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight) // if x.layout != NDHWC then transpose(x), transpose(weight)
const auto& x_dims = x.dims(); const auto& x_dims = x.dims();
const auto& kernel_dims = kernel.dims(); const auto& kernel_dims = kernel.dims();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2]; int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
...@@ -67,7 +63,6 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -67,7 +63,6 @@ void Conv3dKernel(const Context& dev_ctx,
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta)); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta));
DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
std::vector<int> subm_paddings(paddings), subm_strides(strides); std::vector<int> subm_paddings(paddings), subm_strides(strides);
...@@ -75,28 +70,26 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -75,28 +70,26 @@ void Conv3dKernel(const Context& dev_ctx,
phi::funcs::sparse::ResetSubmKernelSizeAndStrides( phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
kernel.dims(), &subm_paddings, &subm_strides); kernel.dims(), &subm_paddings, &subm_strides);
} }
int n = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
int n = ProductRuleBook<T, Context>(dev_ctx, x,
x, kernel_sizes,
kernel_sizes, subm_paddings,
subm_paddings, dilations,
dilations, subm_strides,
subm_strides, out_dims,
out_dims, subm,
subm, rulebook,
rulebook, &counter_per_kernel,
&counter_per_kernel, &offsets_per_kernel,
&offsets_per_kernel, &out_index,
&out_index, &unique_value,
&unique_key, out,
&unique_value, &h_counter,
out, &offsets);
&h_counter,
&offsets);
const int* counter_ptr = counter_per_kernel.data<int>(); const int* counter_ptr = counter_per_kernel.data<int>();
const int* offsets_ptr = counter_per_kernel.data<int>(); const int* offsets_ptr = counter_per_kernel.data<int>();
const int* rulebook_ptr = rulebook->data<int>(); const IntT* rulebook_ptr = rulebook->data<IntT>();
// 2. gather // 2. gather
DenseTensorMeta in_features_meta( DenseTensorMeta in_features_meta(
...@@ -109,22 +102,22 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -109,22 +102,22 @@ void Conv3dKernel(const Context& dev_ctx,
phi::Empty(dev_ctx, std::move(out_features_meta)); phi::Empty(dev_ctx, std::move(out_features_meta));
T* in_features_ptr = in_features.data<T>(); T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>(); T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<Context, T> set_zero; phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f)); set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
auto config = auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1); phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x, GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(), dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + n, rulebook_ptr + n,
in_features_ptr, in_features_ptr,
n, n,
in_channels); in_channels);
// 3. call gemm for every werght // 3. call gemm for every werght
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx); auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_non_zero_elements(); auto* out_values = out->mutable_non_zero_elements();
T* out_values_ptr = out_values->data<T>(); T* out_values_ptr = out_values->data<T>();
...@@ -168,6 +161,36 @@ void Conv3dKernel(const Context& dev_ctx, ...@@ -168,6 +161,36 @@ void Conv3dKernel(const Context& dev_ctx,
out_channels, out_channels,
out_values_ptr); out_values_ptr);
} }
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] {
Conv3dGPUKernel<T, data_t>(dev_ctx,
x,
kernel,
paddings,
dilations,
strides,
groups,
subm,
out,
rulebook);
}));
}
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
......
...@@ -12,24 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,24 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h" #include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h" #include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h" #include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h" #include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
template <typename T> template <typename T, typename IntT = int>
__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
const T* out_features_ptr, const T* out_features_ptr,
const T* out_grad_ptr, const T* out_grad_ptr,
const int* rulebook_ptr, const IntT* rulebook_ptr,
const int n, const int n,
const int rulebook_len, const int rulebook_len,
const int channels, const int channels,
...@@ -38,8 +42,8 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, ...@@ -38,8 +42,8 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) {
int real_i = i / channels; int real_i = i / channels;
int c = i - real_i * channels; int c = i - real_i * channels;
int in_i = rulebook_ptr[real_i]; IntT in_i = rulebook_ptr[real_i];
int out_i = rulebook_ptr[real_i + rulebook_len]; IntT out_i = rulebook_ptr[real_i + rulebook_len];
grad_functor.compute(in_features_ptr[in_i * channels + c], grad_functor.compute(in_features_ptr[in_i * channels + c],
out_features_ptr[out_i * channels + c], out_features_ptr[out_i * channels + c],
out_grad_ptr[out_i * channels + c], out_grad_ptr[out_i * channels + c],
...@@ -48,23 +52,23 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr, ...@@ -48,23 +52,23 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
} }
} }
template <typename T, typename Context> template <typename T, typename IntT = int>
void MaxPoolGradKernel(const Context& dev_ctx, void MaxPoolGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const SparseCooTensor& out, const SparseCooTensor& out,
const DenseTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
DenseTensor* x_grad) { SparseCooTensor* x_grad) {
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const int in_channels = x.dims()[4]; const int in_channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1]; int rulebook_len = rulebook.dims()[1];
const int* rulebook_ptr = rulebook.data<int>(); const IntT* rulebook_ptr = rulebook.data<IntT>();
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0), std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0),
h_counter(kernel_size); h_counter(kernel_size);
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0], phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
rulebook_ptr, rulebook_ptr,
rulebook_len * sizeof(int), rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP #ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost, hipMemcpyDeviceToHost,
#else #else
...@@ -80,10 +84,20 @@ void MaxPoolGradKernel(const Context& dev_ctx, ...@@ -80,10 +84,20 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const T* in_features_ptr = x.non_zero_elements().data<T>(); const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>(); const T* out_features_ptr = out.non_zero_elements().data<T>();
const T* out_grad_ptr = out_grad.data<T>(); const T* out_grad_ptr = out_grad.non_zero_elements().data<T>();
T* x_grad_ptr = x_grad->data<T>(); // TODO(zhangkaihuo): call phi::sparse::EmptyLike
phi::funcs::SetConstant<Context, T> set_zero; DenseTensor x_grad_indices =
set_zero(dev_ctx, x_grad, static_cast<T>(0.0f)); phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
T* x_grad_ptr = x_grad_values.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &x_grad_values, static_cast<T>(0.0f));
phi::Copy<GPUContext>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
for (int i = 0; i < kernel_size; i++) { for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0) { if (counter[i] <= 0) {
...@@ -92,10 +106,10 @@ void MaxPoolGradKernel(const Context& dev_ctx, ...@@ -92,10 +106,10 @@ void MaxPoolGradKernel(const Context& dev_ctx,
auto config = phi::backends::gpu::GetGpuLaunchConfig1D( auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1); dev_ctx, counter[i] * in_channels, 1);
MaxPoolGradCudaKernel<T><<<config.block_per_grid.x, MaxPoolGradCudaKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>( dev_ctx.stream()>>>(
in_features_ptr, in_features_ptr,
out_features_ptr, out_features_ptr,
out_grad_ptr, out_grad_ptr,
...@@ -107,6 +121,21 @@ void MaxPoolGradKernel(const Context& dev_ctx, ...@@ -107,6 +121,21 @@ void MaxPoolGradKernel(const Context& dev_ctx,
} }
} }
template <typename T, typename Context>
void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
SparseCooTensor* x_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] {
MaxPoolGradGPUKernel<T, data_t>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad);
}));
}
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
......
...@@ -12,19 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,19 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h" #include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h" #include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h" #include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi { namespace phi {
namespace sparse { namespace sparse {
template <typename T> template <typename T, typename IntT = int>
__global__ void MaxPoolCudaKernel(const T* in_features_ptr, __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
const int* rulebook_ptr, const IntT* rulebook_ptr,
const int n, const int n,
const int rulebook_len, const int rulebook_len,
const int channels, const int channels,
...@@ -33,8 +36,8 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr, ...@@ -33,8 +36,8 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) { CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) {
int real_i = i / channels; int real_i = i / channels;
int channel_i = i - real_i * channels; int channel_i = i - real_i * channels;
int in_i = rulebook_ptr[real_i]; IntT in_i = rulebook_ptr[real_i];
int out_i = rulebook_ptr[real_i + rulebook_len]; IntT out_i = rulebook_ptr[real_i + rulebook_len];
max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i], max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i],
&out_features_ptr[out_i * channels + channel_i]); &out_features_ptr[out_i * channels + channel_i]);
} }
...@@ -45,15 +48,15 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr, ...@@ -45,15 +48,15 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
* kernel: (D, H, W, C, OC) * kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC) * out: (N, D, H, W, OC)
**/ **/
template <typename T, typename Context> template <typename T, typename IntT = int>
void MaxPoolKernel(const Context& dev_ctx, void MaxPoolGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings, const std::vector<int>& paddings,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
SparseCooTensor* out, SparseCooTensor* out,
DenseTensor* rulebook) { DenseTensor* rulebook) {
const auto& x_dims = x.dims(); const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2]; int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes = const std::vector<int>& real_kernel_sizes =
...@@ -70,29 +73,27 @@ void MaxPoolKernel(const Context& dev_ctx, ...@@ -70,29 +73,27 @@ void MaxPoolKernel(const Context& dev_ctx,
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta)); DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW); DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta)); DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
// 1. product rulebook // 1. product rulebook
int rulebook_len = ProductRuleBook<T, Context>(dev_ctx, int rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
x, x,
real_kernel_sizes, real_kernel_sizes,
paddings, paddings,
dilations, dilations,
strides, strides,
out_dims, out_dims,
false, false,
rulebook, rulebook,
&counter_per_kernel, &counter_per_kernel,
&offsets_per_kernel, &offsets_per_kernel,
&out_index, &out_index,
&unique_key, &unique_value,
&unique_value, out,
out, &counter,
&counter, &offsets);
&offsets);
const IntT* rulebook_ptr = rulebook->data<IntT>();
const int* rulebook_ptr = rulebook->data<int>();
T* out_features_ptr = out->mutable_non_zero_elements()->data<T>(); T* out_features_ptr = out->mutable_non_zero_elements()->data<T>();
const T* in_features_ptr = x.non_zero_elements().data<T>(); const T* in_features_ptr = x.non_zero_elements().data<T>();
...@@ -113,10 +114,10 @@ void MaxPoolKernel(const Context& dev_ctx, ...@@ -113,10 +114,10 @@ void MaxPoolKernel(const Context& dev_ctx,
auto config = phi::backends::gpu::GetGpuLaunchConfig1D( auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1); dev_ctx, counter[i] * in_channels, 1);
MaxPoolCudaKernel<T><<<config.block_per_grid.x, MaxPoolCudaKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x, config.thread_per_block.x,
0, 0,
dev_ctx.stream()>>>( dev_ctx.stream()>>>(
in_features_ptr, in_features_ptr,
rulebook_ptr + offsets[i] + rulebook_len, rulebook_ptr + offsets[i] + rulebook_len,
counter[i], counter[i],
...@@ -126,6 +127,28 @@ void MaxPoolKernel(const Context& dev_ctx, ...@@ -126,6 +127,28 @@ void MaxPoolKernel(const Context& dev_ctx,
} }
} }
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] {
MaxPoolGPUKernel<T, data_t>(dev_ctx,
x,
kernel_sizes,
paddings,
dilations,
strides,
out,
rulebook);
}));
}
} // namespace sparse } // namespace sparse
} // namespace phi } // namespace phi
......
...@@ -26,20 +26,18 @@ void MaxPoolGradKernel(const Context& dev_ctx, ...@@ -26,20 +26,18 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const SparseCooTensor& out, const SparseCooTensor& out,
const DenseTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes, const std::vector<int>& kernel_sizes,
DenseTensor* x_grad); SparseCooTensor* x_grad);
template <typename T, typename Context> template <typename T, typename Context>
DenseTensor MaxPoolGrad(const Context& dev_ctx, SparseCooTensor MaxPoolGrad(const Context& dev_ctx,
const SparseCooTensor& x, const SparseCooTensor& x,
const DenseTensor& rulebook, const DenseTensor& rulebook,
const SparseCooTensor& out, const SparseCooTensor& out,
const DenseTensor& out_grad, const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes) { const std::vector<int>& kernel_sizes) {
DenseTensor x_grad = phi::Empty<Context>( SparseCooTensor x_grad;
dev_ctx,
DenseTensorMeta(x.dtype(), x.non_zero_elements().dims(), x.layout()));
MaxPoolGradKernel<T, Context>( MaxPoolGradKernel<T, Context>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad); dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad);
return x_grad; return x_grad;
......
...@@ -39,11 +39,7 @@ SparseCooTensor MaxPool(const Context& dev_ctx, ...@@ -39,11 +39,7 @@ SparseCooTensor MaxPool(const Context& dev_ctx,
const std::vector<int>& dilations, const std::vector<int>& dilations,
const std::vector<int>& strides, const std::vector<int>& strides,
DenseTensor* rulebook) { DenseTensor* rulebook) {
DenseTensor indices = phi::Empty<Context>( SparseCooTensor coo;
dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
DenseTensor values =
phi::Empty<Context>(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
SparseCooTensor coo(indices, values, x.dims());
MaxPoolKernel<T, Context>( MaxPoolKernel<T, Context>(
dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook); dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook);
return coo; return coo;
......
...@@ -48,13 +48,13 @@ std::vector<T2> cast(const std::vector<T1>& in) { ...@@ -48,13 +48,13 @@ std::vector<T2> cast(const std::vector<T1>& in) {
return out; return out;
} }
template <typename T> template <typename T, typename IntT = int>
void TestConv3dBase(const std::vector<int>& indices, void TestConv3dBase(const std::vector<IntT>& indices,
const std::vector<T>& features, const std::vector<T>& features,
const DDim& x_dims, const DDim& x_dims,
const std::vector<T>& kernel, const std::vector<T>& kernel,
const DDim& kernel_dims, const DDim& kernel_dims,
const std::vector<int>& correct_out_indices, const std::vector<IntT>& correct_out_indices,
const std::vector<T>& correct_out_features, const std::vector<T>& correct_out_features,
const DDim& correct_out_dims, const DDim& correct_out_dims,
const int non_zero_num, const int non_zero_num,
...@@ -80,11 +80,13 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -80,11 +80,13 @@ void TestConv3dBase(const std::vector<int>& indices,
const int in_channels = kernel_dims[3]; const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4]; const int out_channels = kernel_dims[4];
auto indices_dtype = paddle::experimental::CppTypeToDataType<IntT>::Type();
DenseTensor indices_tensor = phi::Empty( DenseTensor indices_tensor = phi::Empty(
dev_ctx_cpu, dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
memcpy( memcpy(indices_tensor.data<IntT>(),
indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int)); indices.data(),
indices.size() * sizeof(IntT));
DenseTensor features_tensor = phi::Empty( DenseTensor features_tensor = phi::Empty(
dev_ctx_cpu, dev_ctx_cpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(), DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
...@@ -111,7 +113,7 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -111,7 +113,7 @@ void TestConv3dBase(const std::vector<int>& indices,
if (!std::is_same<T, phi::dtype::float16>::value) { if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook = phi::Empty( DenseTensor rulebook = phi::Empty(
dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
SparseCooTensor out = sparse::Conv3d<T>(dev_ctx_cpu, SparseCooTensor out = sparse::Conv3d<T>(dev_ctx_cpu,
x_tensor, x_tensor,
kernel_tensor, kernel_tensor,
...@@ -129,8 +131,8 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -129,8 +131,8 @@ void TestConv3dBase(const std::vector<int>& indices,
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz());
int cmp_indices = memcmp(correct_out_indices.data(), int cmp_indices = memcmp(correct_out_indices.data(),
out.non_zero_indices().data<int>(), out.non_zero_indices().data<IntT>(),
correct_out_indices.size() * sizeof(int)); correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices, 0); ASSERT_EQ(cmp_indices, 0);
f_verify(out.non_zero_elements().data<T>(), correct_out_features); f_verify(out.non_zero_elements().data<T>(), correct_out_features);
...@@ -172,7 +174,7 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -172,7 +174,7 @@ void TestConv3dBase(const std::vector<int>& indices,
DenseTensor d_indices_tensor = phi::Empty( DenseTensor d_indices_tensor = phi::Empty(
dev_ctx_gpu, dev_ctx_gpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
phi::Copy( phi::Copy(
dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
...@@ -195,7 +197,7 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -195,7 +197,7 @@ void TestConv3dBase(const std::vector<int>& indices,
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor); dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
DenseTensor d_rulebook = phi::Empty( DenseTensor d_rulebook = phi::Empty(
dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW)); dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
SparseCooTensor d_out = sparse::Conv3d<T>(dev_ctx_gpu, SparseCooTensor d_out = sparse::Conv3d<T>(dev_ctx_gpu,
d_x_tensor, d_x_tensor,
d_kernel_tensor, d_kernel_tensor,
...@@ -214,7 +216,7 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -214,7 +216,7 @@ void TestConv3dBase(const std::vector<int>& indices,
DenseTensor h_indices_tensor = phi::Empty( DenseTensor h_indices_tensor = phi::Empty(
dev_ctx_cpu, dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW));
phi::Copy(dev_ctx_gpu, phi::Copy(dev_ctx_gpu,
d_out.non_zero_indices(), d_out.non_zero_indices(),
phi::CPUPlace(), phi::CPUPlace(),
...@@ -222,8 +224,8 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -222,8 +224,8 @@ void TestConv3dBase(const std::vector<int>& indices,
&h_indices_tensor); &h_indices_tensor);
int cmp_indices2 = memcmp(correct_out_indices.data(), int cmp_indices2 = memcmp(correct_out_indices.data(),
h_indices_tensor.data<int>(), h_indices_tensor.data<IntT>(),
correct_out_indices.size() * sizeof(int)); correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices2, 0); ASSERT_EQ(cmp_indices2, 0);
DenseTensor h_features_tensor = DenseTensor h_features_tensor =
...@@ -264,12 +266,13 @@ void TestConv3dBase(const std::vector<int>& indices, ...@@ -264,12 +266,13 @@ void TestConv3dBase(const std::vector<int>& indices,
#endif #endif
} }
void TestConv3d(const std::vector<int>& indices, template <typename IntT = int>
void TestConv3d(const std::vector<IntT>& indices,
const std::vector<float>& features, const std::vector<float>& features,
const DDim& x_dims, const DDim& x_dims,
const std::vector<float>& kernel, const std::vector<float>& kernel,
const DDim& kernel_dims, const DDim& kernel_dims,
const std::vector<int>& correct_out_indices, const std::vector<IntT>& correct_out_indices,
const std::vector<float>& correct_out_features, const std::vector<float>& correct_out_features,
const DDim& correct_out_dims, const DDim& correct_out_dims,
const int non_zero_num, const int non_zero_num,
...@@ -282,41 +285,41 @@ void TestConv3d(const std::vector<int>& indices, ...@@ -282,41 +285,41 @@ void TestConv3d(const std::vector<int>& indices,
const std::vector<float> kernel_grad = {}, const std::vector<float> kernel_grad = {},
const bool subm = false) { const bool subm = false) {
// test float // test float
TestConv3dBase<float>(indices, TestConv3dBase<float, IntT>(indices,
features, features,
x_dims, x_dims,
kernel, kernel,
kernel_dims, kernel_dims,
correct_out_indices, correct_out_indices,
correct_out_features, correct_out_features,
correct_out_dims, correct_out_dims,
non_zero_num, non_zero_num,
paddings, paddings,
strides, strides,
dilations, dilations,
diff, diff,
backward, backward,
features_grad, features_grad,
kernel_grad, kernel_grad,
subm); subm);
// test double // test double
TestConv3dBase<double>(indices, TestConv3dBase<double, IntT>(indices,
cast<float, double>(features), cast<float, double>(features),
x_dims, x_dims,
cast<float, double>(kernel), cast<float, double>(kernel),
kernel_dims, kernel_dims,
correct_out_indices, correct_out_indices,
cast<float, double>(correct_out_features), cast<float, double>(correct_out_features),
correct_out_dims, correct_out_dims,
non_zero_num, non_zero_num,
paddings, paddings,
strides, strides,
dilations, dilations,
diff, diff,
backward, backward,
cast<float, double>(features_grad), cast<float, double>(features_grad),
cast<float, double>(kernel_grad), cast<float, double>(kernel_grad),
subm); subm);
} }
TEST(DEV_API, sparse_conv3d) { TEST(DEV_API, sparse_conv3d) {
...@@ -616,6 +619,51 @@ TEST(DEV_API, sparse_conv2d) { ...@@ -616,6 +619,51 @@ TEST(DEV_API, sparse_conv2d) {
dilations); dilations);
} }
TEST(DEV_API, sparse_conv2d_int64) {
const int in_channels = 1;
const int out_channels = 1;
DDim x_dims = {1, 1, 5, 5, in_channels};
DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
DDim out_dims = {1, 1, 3, 3, out_channels};
std::vector<int> paddings = {0, 0, 0};
std::vector<int> strides = {1, 1, 1};
std::vector<int> dilations = {1, 1, 1};
const int non_zero_num = 3;
std::vector<int64_t> indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4};
std::vector<float> features = {-0.79394531, -0.3125, -0.55029297};
// 3*3*3=27
std::vector<float> kernel = {0.65820312,
0.75048828,
0.21411133,
0.17370605,
0.85546875,
0.53076172,
0.28833008,
0.71044922,
0.00659943};
std::vector<int64_t> out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 2, 2, 2, 1, 2, 0, 1, 2};
std::vector<float> out_features = {
-0.17004, -0.71338, -0.00206, -0.22205, -0.09009};
TestConv3d<int64_t>(indices_flatten,
features,
x_dims,
kernel,
kernel_dims,
out_indices_flatten,
out_features,
out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
TEST(DEV_API, sparse_conv3d_backward) { TEST(DEV_API, sparse_conv3d_backward) {
const int in_channels = 1; const int in_channels = 1;
const int out_channels = 1; const int out_channels = 1;
......
...@@ -36,11 +36,11 @@ std::vector<T2> cast(const std::vector<T1>& in) { ...@@ -36,11 +36,11 @@ std::vector<T2> cast(const std::vector<T1>& in) {
} }
return out; return out;
} }
template <typename T> template <typename T, typename IntT = int>
void TestMaxPoolBase(const std::vector<int>& indices, void TestMaxPoolBase(const std::vector<IntT>& indices,
const std::vector<T>& features, const std::vector<T>& features,
const DDim& x_dims, const DDim& x_dims,
const std::vector<int>& correct_out_indices, const std::vector<IntT>& correct_out_indices,
const std::vector<T>& correct_out_features, const std::vector<T>& correct_out_features,
const DDim& correct_out_dims, const DDim& correct_out_dims,
const int non_zero_num, const int non_zero_num,
...@@ -65,11 +65,13 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -65,11 +65,13 @@ void TestMaxPoolBase(const std::vector<int>& indices,
const int in_channels = x_dims[4]; const int in_channels = x_dims[4];
const int out_channels = in_channels; const int out_channels = in_channels;
auto indices_dtype = paddle::experimental::CppTypeToDataType<IntT>::Type();
DenseTensor indices_tensor = phi::Empty( DenseTensor indices_tensor = phi::Empty(
dev_ctx_cpu, dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
memcpy( memcpy(indices_tensor.data<IntT>(),
indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int)); indices.data(),
indices.size() * sizeof(IntT));
DenseTensor features_tensor = phi::Empty( DenseTensor features_tensor = phi::Empty(
dev_ctx_cpu, dev_ctx_cpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(), DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
...@@ -88,8 +90,7 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -88,8 +90,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
}; };
if (!std::is_same<T, phi::dtype::float16>::value) { if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook = phi::Empty( DenseTensor rulebook;
dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
SparseCooTensor out = sparse::MaxPool<T>(dev_ctx_cpu, SparseCooTensor out = sparse::MaxPool<T>(dev_ctx_cpu,
x_tensor, x_tensor,
kernel_sizes, kernel_sizes,
...@@ -105,20 +106,16 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -105,20 +106,16 @@ void TestMaxPoolBase(const std::vector<int>& indices,
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz()); ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz());
int cmp_indices = memcmp(correct_out_indices.data(), int cmp_indices = memcmp(correct_out_indices.data(),
out.non_zero_indices().data<int>(), out.non_zero_indices().data<IntT>(),
correct_out_indices.size() * sizeof(int)); correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices, 0); ASSERT_EQ(cmp_indices, 0);
f_verify(out.non_zero_elements().data<T>(), correct_out_features); f_verify(out.non_zero_elements().data<T>(), correct_out_features);
if (backward) { if (backward) {
DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_cpu, SparseCooTensor x_grad = sparse::MaxPoolGrad<T>(
x_tensor, dev_ctx_cpu, x_tensor, rulebook, out, out, kernel_sizes);
rulebook, f_verify(x_grad.non_zero_elements().data<T>(), features_grad);
out,
out.non_zero_elements(),
kernel_sizes);
f_verify(x_grad.data<T>(), features_grad);
} }
} }
...@@ -142,7 +139,7 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -142,7 +139,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
DenseTensor d_indices_tensor = phi::Empty( DenseTensor d_indices_tensor = phi::Empty(
dev_ctx_gpu, dev_ctx_gpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW)); DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
phi::Copy( phi::Copy(
dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor); dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
...@@ -153,8 +150,7 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -153,8 +150,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims); SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
DenseTensor d_rulebook = phi::Empty( DenseTensor d_rulebook;
dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
SparseCooTensor d_out = sparse::MaxPool<T>(dev_ctx_gpu, SparseCooTensor d_out = sparse::MaxPool<T>(dev_ctx_gpu,
d_x_tensor, d_x_tensor,
kernel_sizes, kernel_sizes,
...@@ -171,7 +167,7 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -171,7 +167,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
DenseTensor h_indices_tensor = phi::Empty( DenseTensor h_indices_tensor = phi::Empty(
dev_ctx_cpu, dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW)); DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW));
phi::Copy(dev_ctx_gpu, phi::Copy(dev_ctx_gpu,
d_out.non_zero_indices(), d_out.non_zero_indices(),
phi::CPUPlace(), phi::CPUPlace(),
...@@ -179,8 +175,8 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -179,8 +175,8 @@ void TestMaxPoolBase(const std::vector<int>& indices,
&h_indices_tensor); &h_indices_tensor);
int cmp_indices2 = memcmp(correct_out_indices.data(), int cmp_indices2 = memcmp(correct_out_indices.data(),
h_indices_tensor.data<int>(), h_indices_tensor.data<IntT>(),
correct_out_indices.size() * sizeof(int)); correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices2, 0); ASSERT_EQ(cmp_indices2, 0);
DenseTensor h_features_tensor = DenseTensor h_features_tensor =
...@@ -194,23 +190,25 @@ void TestMaxPoolBase(const std::vector<int>& indices, ...@@ -194,23 +190,25 @@ void TestMaxPoolBase(const std::vector<int>& indices,
f_verify(h_features_tensor.data<T>(), correct_out_features); f_verify(h_features_tensor.data<T>(), correct_out_features);
if (backward) { if (backward) {
DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_gpu, SparseCooTensor x_grad = sparse::MaxPoolGrad<T>(
d_x_tensor, dev_ctx_gpu, d_x_tensor, d_rulebook, d_out, d_out, kernel_sizes);
d_rulebook, DenseTensor h_features_grad =
d_out, phi::EmptyLike<T>(dev_ctx_cpu, x_grad.non_zero_elements());
d_out.non_zero_elements(), phi::Copy(dev_ctx_gpu,
kernel_sizes); x_grad.non_zero_elements(),
DenseTensor h_features_grad = phi::EmptyLike<T>(dev_ctx_cpu, x_grad); phi::CPUPlace(),
phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad); true,
&h_features_grad);
f_verify(h_features_grad.data<T>(), features_grad); f_verify(h_features_grad.data<T>(), features_grad);
} }
#endif #endif
} }
void TestMaxPool(const std::vector<int>& indices, template <typename IntT = int>
void TestMaxPool(const std::vector<IntT>& indices,
const std::vector<float>& features, const std::vector<float>& features,
const DDim& x_dims, const DDim& x_dims,
const std::vector<int>& correct_out_indices, const std::vector<IntT>& correct_out_indices,
const std::vector<float>& correct_out_features, const std::vector<float>& correct_out_features,
const DDim& correct_out_dims, const DDim& correct_out_dims,
const int non_zero_num, const int non_zero_num,
...@@ -222,35 +220,35 @@ void TestMaxPool(const std::vector<int>& indices, ...@@ -222,35 +220,35 @@ void TestMaxPool(const std::vector<int>& indices,
const bool backward = false, const bool backward = false,
const std::vector<float> features_grad = {}) { const std::vector<float> features_grad = {}) {
// test float // test float
TestMaxPoolBase<float>(indices, TestMaxPoolBase<float, IntT>(indices,
features, features,
x_dims, x_dims,
correct_out_indices, correct_out_indices,
correct_out_features, correct_out_features,
correct_out_dims, correct_out_dims,
non_zero_num, non_zero_num,
kernel_sizes, kernel_sizes,
paddings, paddings,
strides, strides,
dilations, dilations,
diff, diff,
backward, backward,
features_grad); features_grad);
// test double // test double
TestMaxPoolBase<double>(indices, TestMaxPoolBase<double, IntT>(indices,
cast<float, double>(features), cast<float, double>(features),
x_dims, x_dims,
correct_out_indices, correct_out_indices,
cast<float, double>(correct_out_features), cast<float, double>(correct_out_features),
correct_out_dims, correct_out_dims,
non_zero_num, non_zero_num,
kernel_sizes, kernel_sizes,
paddings, paddings,
strides, strides,
dilations, dilations,
diff, diff,
backward, backward,
cast<float, double>(features_grad)); cast<float, double>(features_grad));
} }
TEST(DEV_API, sparse_maxpool) { TEST(DEV_API, sparse_maxpool) {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册