Unverified commit 5d3fd4fe authored by Z zhangkaihuo and committed by GitHub

Sparse conv and pool support indices as template (#41137)

Parent 66d1b1f6
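This diff templates the CPU and GPU sparse conv/pool kernels on the integer type of the COO indices (IntT) and, at each public kernel entry point, dispatches on x.non_zero_indices().dtype() via PD_DISPATCH_INTEGRAL_TYPES. Below is a minimal, self-contained sketch of that runtime-dtype-to-template dispatch pattern; the DataType enum, DispatchIntegral helper, and ConvImpl are hypothetical stand-ins, not Paddle's actual API.

```cpp
// Hypothetical sketch: map a runtime index dtype to a compile-time IntT and
// forward to an implementation templated on that type, as the kernels below do.
#include <cstdint>
#include <iostream>
#include <stdexcept>
#include <vector>

enum class DataType { INT32, INT64 };  // stand-in for phi::DataType

template <typename IntT>
void ConvImpl(const std::vector<IntT>& indices) {
  // A real kernel would build the rulebook, gather, gemm and scatter here.
  std::cout << "running with " << sizeof(IntT) * 8 << "-bit indices, nnz = "
            << indices.size() << "\n";
}

// Simplified analogue of PD_DISPATCH_INTEGRAL_TYPES.
template <typename Functor>
void DispatchIntegral(DataType dtype, Functor f) {
  switch (dtype) {
    case DataType::INT32: f(int32_t{}); break;
    case DataType::INT64: f(int64_t{}); break;
    default: throw std::runtime_error("unsupported index dtype");
  }
}

int main() {
  std::vector<int64_t> indices = {0, 1, 4};
  DispatchIntegral(DataType::INT64, [&](auto tag) {
    using IntT = decltype(tag);
    std::vector<IntT> typed(indices.begin(), indices.end());
    ConvImpl<IntT>(typed);  // e.g. Conv3dCPUKernel<T, IntT>(...) in this diff
  });
  return 0;
}
```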
......@@ -45,6 +45,7 @@ PD_REGISTER_KERNEL(empty,
phi::EmptyKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
......@@ -61,6 +62,7 @@ PD_REGISTER_KERNEL(empty_like,
phi::EmptyLikeKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
......@@ -80,6 +82,7 @@ PD_REGISTER_KERNEL(empty,
phi::EmptyKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
......@@ -95,6 +98,7 @@ PD_REGISTER_KERNEL(empty_like,
phi::EmptyLikeKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
......
......@@ -33,28 +33,30 @@ struct Dims4D {
};
// Check whether the current position x lies within (lower, upper)
inline HOSTDEVICE bool Check(const int& x,
template <typename IntT = int>
inline HOSTDEVICE bool Check(const IntT& x,
const int& kx,
const int& pad,
const int& stride,
const int dilation,
const int kdim,
const int xdim) {
const int lower = x - dilation * kx + pad;
const int uper = x + (kdim - kx - 1) * dilation - pad;
const IntT lower = x - dilation * kx + pad;
const IntT uper = x + (kdim - kx - 1) * dilation - pad;
return (lower >= 0 && lower % stride == 0 && uper < xdim);
}
// Check whether the current position (x, y, z) is legal:
// check the minimum and maximum values along each dimension
template <typename IntT = int>
inline HOSTDEVICE bool Check(const Dims4D& dims,
const Dims4D& kernel_dims,
const Dims4D& paddings,
const Dims4D& dilations,
const Dims4D& strides,
const int x,
const int y,
const int z,
const IntT x,
const IntT y,
const IntT z,
const int kx,
const int ky,
const int kz) {
......@@ -67,22 +69,22 @@ inline HOSTDEVICE bool Check(const Dims4D& dims,
return (x_valid && y_valid && z_valid);
}
template <typename Dim>
inline HOSTDEVICE int PointToIndex(const int& batch,
const int& x,
const int& y,
const int& z,
const Dim& dims) {
template <typename Dim, typename IntT = int>
inline HOSTDEVICE IntT PointToIndex(const IntT& batch,
const IntT& x,
const IntT& y,
const IntT& z,
const Dim& dims) {
return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] +
y * dims[3] + x;
}
// TODO(zhangkaihuo): use division and multiply to optimize
// modulo operation
template <typename Dim>
template <typename Dim, typename IntT = int>
inline HOSTDEVICE void IndexToPoint(
const int index, const Dim& dims, int* batch, int* x, int* y, int* z) {
int n = index;
const IntT index, const Dim& dims, IntT* batch, IntT* x, IntT* y, IntT* z) {
IntT n = index;
*x = n % dims[3];
n /= dims[3];
*y = n % dims[2];
......@@ -176,8 +178,9 @@ inline const std::vector<int> PoolResetKernel(
return res;
}
inline void PrefixSum(const int* counter, int* offsets, const int n) {
int offset = 0;
template <typename T>
inline void PrefixSum(const T* counter, T* offsets, const int n) {
T offset = 0;
for (int i = 0; i < n; i++) {
offsets[i] = offset;
offset += counter[i];
......
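For reference, the helpers above flatten a (batch, x, y, z) point into a linear index over dims = {N, D, H, W} and back, and PrefixSum turns per-kernel-position counters into start offsets. A standalone sketch of the same arithmetic (not the phi code itself), with a round-trip check:

```cpp
// Standalone sketch of PointToIndex / IndexToPoint / PrefixSum semantics.
#include <array>
#include <cassert>
#include <cstdint>
#include <vector>

template <typename IntT = int>
IntT PointToIndex(IntT batch, IntT x, IntT y, IntT z,
                  const std::array<IntT, 4>& dims) {  // dims = {N, D, H, W}
  return batch * dims[1] * dims[2] * dims[3] + z * dims[2] * dims[3] +
         y * dims[3] + x;
}

template <typename IntT = int>
void IndexToPoint(IntT index, const std::array<IntT, 4>& dims, IntT* batch,
                  IntT* x, IntT* y, IntT* z) {
  IntT n = index;
  *x = n % dims[3]; n /= dims[3];
  *y = n % dims[2]; n /= dims[2];
  *z = n % dims[1]; n /= dims[1];
  *batch = n;
}

template <typename T>
void PrefixSum(const T* counter, T* offsets, const int n) {
  T offset = 0;
  for (int i = 0; i < n; i++) {
    offsets[i] = offset;
    offset += counter[i];
  }
  offsets[n] = offset;  // total in the last slot (offsets holds n + 1 entries here)
}

int main() {
  std::array<int64_t, 4> dims = {2, 4, 5, 6};
  int64_t idx = PointToIndex<int64_t>(1, 3, 2, 1, dims);
  int64_t b, x, y, z;
  IndexToPoint<int64_t>(idx, dims, &b, &x, &y, &z);
  assert(b == 1 && x == 3 && y == 2 && z == 1);  // round trip succeeds

  std::vector<int64_t> counter = {2, 0, 3}, offsets(4);
  PrefixSum(counter.data(), offsets.data(), 3);  // offsets = {0, 2, 2, 5}
  return 0;
}
```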
......@@ -49,8 +49,8 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dGrad(
const int groups,
const bool subm) {
SparseCooTensor x_grad;
DenseTensor kernel_grad = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(kernel.dtype(), {1}, kernel.layout()));
DenseTensor kernel_grad;
// TODO(zhangkaihuo): call InferMeta func here
Conv3dGradKernel<T, Context>(dev_ctx,
x,
......
......@@ -45,11 +45,7 @@ SparseCooTensor Conv3d(const Context& dev_ctx,
const int groups,
const bool subm,
DenseTensor* rulebook) {
DenseTensor indices = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
DenseTensor values =
phi::Empty<Context>(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
SparseCooTensor coo(indices, values, x.dims());
SparseCooTensor coo;
Conv3dKernel<T, Context>(dev_ctx,
x,
kernel,
......
......@@ -31,7 +31,7 @@ using Dims4D = phi::funcs::sparse::Dims4D;
// such as: kernel(3, 3, 3), kernel_size = 27
// counter_per_weight: (kernel_size)
// TODO(zhangkaihuo): optimize performance with multithreading
template <typename T, typename Context>
template <typename T, typename Context, typename IntT = int>
void ProductRuleBook(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
......@@ -44,7 +44,7 @@ void ProductRuleBook(const Context& dev_ctx,
DenseTensor* counter_per_kernel) {
const int64_t non_zero_num = x.nnz();
const auto& non_zero_indices = x.non_zero_indices();
const int* indices_ptr = non_zero_indices.data<int>();
const IntT* indices_ptr = non_zero_indices.data<IntT>();
int* counter_ptr = counter_per_kernel->data<int>();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
memset(counter_ptr, 0, kernel_size * sizeof(int));
......@@ -60,33 +60,33 @@ void ProductRuleBook(const Context& dev_ctx,
const Dims4D c_strides(1, strides[2], strides[1], strides[0]);
const Dims4D c_dilations(1, dilations[2], dilations[1], dilations[0]);
std::set<int> hash_in;
std::set<IntT> hash_in;
if (subm) {
for (int i = 0; i < non_zero_num; i++) {
int batch = indices_ptr[i];
int in_z = indices_ptr[i + non_zero_num];
int in_y = indices_ptr[i + 2 * non_zero_num];
int in_x = indices_ptr[i + 3 * non_zero_num];
int index = phi::funcs::sparse::PointToIndex<DDim>(
IntT batch = indices_ptr[i];
IntT in_z = indices_ptr[i + non_zero_num];
IntT in_y = indices_ptr[i + 2 * non_zero_num];
IntT in_x = indices_ptr[i + 3 * non_zero_num];
IntT index = phi::funcs::sparse::PointToIndex<DDim>(
batch, in_x, in_y, in_z, x_dims);
hash_in.insert(index);
}
}
auto f_calc_rulebook = [&](int* rulebook_ptr) {
auto f_calc_rulebook = [&](IntT* rulebook_ptr) {
int kernel_index = 0, rulebook_index = 0;
for (int kz = 0; kz < kernel_sizes[0]; kz++) {
for (int ky = 0; ky < kernel_sizes[1]; ky++) {
for (int kx = 0; kx < kernel_sizes[2]; kx++) {
++kernel_index;
for (int64_t i = 0; i < non_zero_num; i++) {
int batch = indices_ptr[i];
int in_z = indices_ptr[i + non_zero_num];
int in_y = indices_ptr[i + 2 * non_zero_num];
int in_x = indices_ptr[i + 3 * non_zero_num];
int out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0];
int out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1];
int out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2];
IntT batch = indices_ptr[i];
IntT in_z = indices_ptr[i + non_zero_num];
IntT in_y = indices_ptr[i + 2 * non_zero_num];
IntT in_x = indices_ptr[i + 3 * non_zero_num];
IntT out_z = (in_z + paddings[0] - kz * dilations[0]) / strides[0];
IntT out_y = (in_y + paddings[1] - ky * dilations[1]) / strides[1];
IntT out_x = (in_x + paddings[2] - kx * dilations[2]) / strides[2];
if (phi::funcs::sparse::Check(c_x_dims,
c_kernel_dims,
c_paddings,
......@@ -99,7 +99,7 @@ void ProductRuleBook(const Context& dev_ctx,
ky,
kz)) {
if (subm) {
int out_index = phi::funcs::sparse::PointToIndex<DDim>(
IntT out_index = phi::funcs::sparse::PointToIndex<DDim>(
batch, out_x, out_y, out_z, out_dims);
if (hash_in.find(out_index) == hash_in.end()) {
continue;
......@@ -126,15 +126,16 @@ void ProductRuleBook(const Context& dev_ctx,
f_calc_rulebook(nullptr);
// alloc the rulebook
DenseTensorMeta rulebook_meta(
DataType::INT32, {3, rulebook_len}, DataLayout::NCHW);
rulebook->set_meta(rulebook_meta);
dev_ctx.Alloc(rulebook, rulebook->dtype(), rulebook->numel() * sizeof(int));
int* rulebook_ptr = rulebook->data<int>();
*rulebook = phi::Empty(
dev_ctx,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<IntT>::Type(),
{3, rulebook_len},
DataLayout::NCHW));
IntT* rulebook_ptr = rulebook->data<IntT>();
f_calc_rulebook(rulebook_ptr);
}
template <typename T, typename Context>
template <typename T, typename Context, typename IntT = int>
void UpdateRulebookAndOutIndex(const Context& dev_ctx,
const SparseCooTensor& x,
const int kernel_size,
......@@ -142,9 +143,9 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
const DDim& out_dims,
DenseTensor* rulebook,
SparseCooTensor* out) {
std::set<int> out_indexs;
std::set<IntT> out_indexs;
int n = rulebook->dims()[1];
int* rulebook_ptr = rulebook->data<int>();
IntT* rulebook_ptr = rulebook->data<IntT>();
for (int i = 0; i < n; i++) {
out_indexs.insert(rulebook_ptr[i + n * 2]);
}
......@@ -152,17 +153,19 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
int out_non_zero_num = out_indexs.size();
const int64_t sparse_dim = 4;
DenseTensorMeta indices_meta(
DataType::INT32, {sparse_dim, out_non_zero_num}, DataLayout::NCHW);
paddle::experimental::CppTypeToDataType<IntT>::Type(),
{sparse_dim, out_non_zero_num},
DataLayout::NCHW);
DenseTensorMeta values_meta(x.dtype(),
{out_non_zero_num, out_channels},
x.non_zero_elements().layout());
phi::DenseTensor out_indices = phi::Empty(dev_ctx, std::move(indices_meta));
phi::DenseTensor out_values = phi::Empty(dev_ctx, std::move(values_meta));
int* out_indices_ptr = out_indices.data<int>();
IntT* out_indices_ptr = out_indices.data<IntT>();
int i = 0;
for (auto it = out_indexs.begin(); it != out_indexs.end(); it++, i++) {
const int index = *it;
int batch, x, y, z;
const IntT index = *it;
IntT batch, x, y, z;
phi::funcs::sparse::IndexToPoint<DDim>(index, out_dims, &batch, &x, &y, &z);
out_indices_ptr[i] = batch;
out_indices_ptr[i + out_non_zero_num] = z;
......@@ -170,7 +173,7 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
out_indices_ptr[i + out_non_zero_num * 3] = x;
}
for (i = 0; i < n; i++) {
int out_index = rulebook_ptr[i + n * 2];
IntT out_index = rulebook_ptr[i + n * 2];
rulebook_ptr[i + n * 2] =
std::distance(out_indexs.begin(), out_indexs.find(out_index));
}
......@@ -178,20 +181,20 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
out->SetMember(out_indices, out_values, out_dims, true);
}
template <typename T>
template <typename T, typename IntT = int>
void Gather(
const T* x, const int* indexs, const int n, const int channels, T* out) {
const T* x, const IntT* indexs, const int n, const int channels, T* out) {
for (int i = 0; i < n; i++) {
int real_i = indexs[i];
IntT real_i = indexs[i];
memcpy(out + i * channels, x + real_i * channels, channels * sizeof(T));
}
}
template <typename T>
template <typename T, typename IntT = int>
void Scatter(
const T* x, const int* indexs, const int n, const int channels, T* out) {
const T* x, const IntT* indexs, const int n, const int channels, T* out) {
for (int i = 0; i < n; i++) {
int real_i = indexs[i];
IntT real_i = indexs[i];
for (int j = 0; j < channels; j++) {
out[real_i * channels + j] += x[i * channels + j];
}
......
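The templated Gather/Scatter above copy feature rows selected by a rulebook column into a contiguous buffer for GEMM and accumulate per-rule rows back into the output. A minimal standalone sketch of that row-copy pattern (the rulebook column and shapes here are made up for illustration):

```cpp
// Sketch of the Gather (copy selected rows) / Scatter (accumulate rows back)
// pattern used by the sparse conv kernels; channels is the row width.
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

template <typename T, typename IntT = int>
void Gather(const T* x, const IntT* indexs, int n, int channels, T* out) {
  for (int i = 0; i < n; i++) {
    std::memcpy(out + i * channels, x + indexs[i] * channels,
                channels * sizeof(T));
  }
}

template <typename T, typename IntT = int>
void Scatter(const T* x, const IntT* indexs, int n, int channels, T* out) {
  for (int i = 0; i < n; i++) {
    for (int j = 0; j < channels; j++) {
      out[indexs[i] * channels + j] += x[i * channels + j];  // rows may repeat
    }
  }
}

int main() {
  const int channels = 2;
  std::vector<float> features = {1, 2, 3, 4, 5, 6};  // 3 rows of 2 channels
  std::vector<int64_t> rule = {2, 0, 2};              // row 2 selected twice
  std::vector<float> gathered(rule.size() * channels);
  Gather(features.data(), rule.data(), 3, channels, gathered.data());

  std::vector<float> out(features.size(), 0.f);
  Scatter(gathered.data(), rule.data(), 3, channels, out.data());
  std::cout << out[4] << "\n";  // row 2 accumulates two contributions: 5 + 5 = 10
  return 0;
}
```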
......@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
......@@ -29,24 +31,24 @@ namespace sparse {
//]
// x_grad = out_grad * transpose(kernel)
// kernel_grad = transpose(x) * out_grad
template <typename T, typename Context>
void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
template <typename T, typename IntT = int>
void Conv3dGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
const int* rulebook_ptr = rulebook.data<int>();
const IntT* rulebook_ptr = rulebook.data<IntT>();
const int rulebook_len = rulebook.dims()[1];
......@@ -66,32 +68,30 @@ void Conv3dGradKernel(const Context& dev_ctx,
T* in_features_ptr = in_features.data<T>();
T* d_x_features_ptr = d_x_features.data<T>();
T* out_grad_features_ptr = out_grad_features.data<T>();
kernel_grad->Resize(kernel_dims);
dev_ctx.Alloc(
kernel_grad, kernel_grad->dtype(), kernel_grad->numel() * sizeof(T));
*kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel);
T* d_kernel_ptr = kernel_grad->data<T>();
memset(d_kernel_ptr, 0, sizeof(T) * kernel_grad->numel());
int half_kernel_size = kernel_size / 2;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto blas = phi::funcs::GetBlas<CPUContext, T>(dev_ctx);
DenseTensor x_grad_indices =
phi::EmptyLike<int>(dev_ctx, x.non_zero_indices());
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
T* x_grad_values_ptr = x_grad_values.data<T>();
memset(x_grad_values_ptr, 0, sizeof(T) * x_grad_values.numel());
memset(d_x_features_ptr, 0, sizeof(T) * d_x_features.numel());
phi::Copy<Context>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
phi::Copy<CPUContext>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0);
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0);
for (int i = 0; i < rulebook_len; i++) {
counter[rulebook_ptr[i]] += 1;
}
int offset = 0, max_count = 0;
IntT offset = 0, max_count = 0;
for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset;
offset += counter[i];
......@@ -102,30 +102,31 @@ void Conv3dGradKernel(const Context& dev_ctx,
offsets[kernel_size] = offset;
if (subm) {
phi::funcs::sparse::SubmPreProcess<T, Context>(dev_ctx,
x,
kernel,
out_grad.non_zero_elements(),
in_channels,
out_channels,
half_kernel_size,
kernel_grad,
&x_grad_values);
phi::funcs::sparse::SubmPreProcess<T, CPUContext>(
dev_ctx,
x,
kernel,
out_grad.non_zero_elements(),
in_channels,
out_channels,
half_kernel_size,
kernel_grad,
&x_grad_values);
if (max_count == 0) {
return;
}
}
Gather<T>(x.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len,
rulebook_len,
in_channels,
in_features_ptr);
Gather<T>(out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2,
rulebook_len,
out_channels,
out_grad_features_ptr);
Gather<T, IntT>(x.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len,
rulebook_len,
in_channels,
in_features_ptr);
Gather<T, IntT>(out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2,
rulebook_len,
out_channels,
out_grad_features_ptr);
const T* kernel_ptr = kernel.data<T>();
for (int i = 0; i < kernel_size; i++) {
......@@ -170,11 +171,41 @@ void Conv3dGradKernel(const Context& dev_ctx,
}
// 4. scatter
Scatter<T>(d_x_features_ptr,
rulebook.data<int>() + rulebook_len,
rulebook_len,
in_channels,
x_grad_values_ptr);
Scatter<T, IntT>(d_x_features_ptr,
rulebook.data<IntT>() + rulebook_len,
rulebook_len,
in_channels,
x_grad_values_ptr);
}
template <typename T, typename Context>
void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dGradCPUKernel", ([&] {
Conv3dGradCPUKernel<T, data_t>(dev_ctx,
x,
kernel,
rulebook,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
x_grad,
kernel_grad);
}));
}
} // namespace sparse
......
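As the comment in this file notes, the backward pass computes x_grad = out_grad * transpose(kernel) and kernel_grad = transpose(x) * out_grad per kernel offset over the gathered rows. A hedged, BLAS-free sketch of those two products with made-up shapes (n gathered rows, in_c input channels, out_c output channels):

```cpp
// Naive sketch of the two backward GEMMs per kernel offset; phi uses its Blas
// wrapper instead. Shapes: x (n, in_c), kernel slice (in_c, out_c),
// out_grad (n, out_c).
#include <vector>

using Mat = std::vector<std::vector<float>>;

// C = op(A) * op(B), where op transposes when the flag is set.
Mat MatMul(const Mat& a, const Mat& b, bool trans_a, bool trans_b) {
  size_t m = trans_a ? a[0].size() : a.size();
  size_t k = trans_a ? a.size() : a[0].size();
  size_t n = trans_b ? b.size() : b[0].size();
  Mat c(m, std::vector<float>(n, 0.f));
  for (size_t i = 0; i < m; ++i)
    for (size_t j = 0; j < n; ++j)
      for (size_t p = 0; p < k; ++p)
        c[i][j] += (trans_a ? a[p][i] : a[i][p]) * (trans_b ? b[j][p] : b[p][j]);
  return c;
}

int main() {
  Mat x = {{1, 2}, {3, 4}};                 // gathered input rows
  Mat kernel = {{0.5f, 1.f}, {2.f, 0.f}};   // one kernel offset slice
  Mat out_grad = {{1, 0}, {0, 1}};          // gradient of gathered output rows
  Mat x_grad = MatMul(out_grad, kernel, /*trans_a=*/false, /*trans_b=*/true);
  Mat kernel_grad = MatMul(x, out_grad, /*trans_a=*/true, /*trans_b=*/false);
  (void)x_grad;
  (void)kernel_grad;
  return 0;
}
```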
......@@ -17,6 +17,8 @@ limitations under the License. */
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
......@@ -25,17 +27,17 @@ namespace sparse {
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
template <typename T, typename IntT = int>
void Conv3dCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
// update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight)
......@@ -66,18 +68,18 @@ void Conv3dKernel(const Context& dev_ctx,
DataType::INT32, {kernel_size}, DataLayout::NCHW);
DenseTensor counter_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
ProductRuleBook<T, Context>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
rulebook,
&counter_per_kernel);
UpdateRulebookAndOutIndex<T>(
ProductRuleBook<T, CPUContext, IntT>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
rulebook,
&counter_per_kernel);
UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, out_channels, out_dims, rulebook, out);
int n = rulebook->dims()[1];
......@@ -95,14 +97,14 @@ void Conv3dKernel(const Context& dev_ctx,
T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>();
Gather<T>(x.non_zero_elements().data<T>(),
rulebook->data<int>() + n,
n,
in_channels,
in_features_ptr);
Gather<T, IntT>(x.non_zero_elements().data<T>(),
rulebook->data<IntT>() + n,
n,
in_channels,
in_features_ptr);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto blas = phi::funcs::GetBlas<CPUContext, T>(dev_ctx);
std::vector<int> offsets(kernel_size + 1);
int offset = 0;
for (int i = 0; i < kernel_size; i++) {
......@@ -139,11 +141,37 @@ void Conv3dKernel(const Context& dev_ctx,
// 4. scatter
T* out_values_ptr = out->mutable_non_zero_elements()->data<T>();
memset(out_values_ptr, 0, sizeof(T) * out->nnz() * out_channels);
Scatter<T>(out_features_ptr,
rulebook->data<int>() + n * 2,
n,
out_channels,
out_values_ptr);
Scatter<T, IntT>(out_features_ptr,
rulebook->data<IntT>() + n * 2,
n,
out_channels,
out_values_ptr);
}
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dCPUKernel", ([&] {
Conv3dCPUKernel<T, data_t>(dev_ctx,
x,
kernel,
paddings,
dilations,
strides,
groups,
subm,
out,
rulebook);
}));
}
} // namespace sparse
......
......@@ -14,24 +14,28 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const DenseTensor& out_grad,
const std::vector<int>& kernel_sizes,
DenseTensor* x_grad) {
template <typename T, typename IntT = int>
void MaxPoolGradCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
SparseCooTensor* x_grad) {
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const int channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1];
const int* rulebook_ptr = rulebook.data<int>();
const IntT* rulebook_ptr = rulebook.data<IntT>();
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0);
for (int i = 0; i < rulebook_len; i++) {
counter[rulebook_ptr[i]] += 1;
......@@ -40,15 +44,25 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>();
const T* out_grad_ptr = out_grad.data<T>();
T* x_grad_ptr = x_grad->data<T>();
const T* out_grad_ptr = out_grad.non_zero_elements().data<T>();
// TODO(zhangkaihuo): call phi::sparse::EmptyLike
DenseTensor x_grad_indices =
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
T* x_grad_ptr = x_grad_values.data<T>();
memset(x_grad_ptr, 0, sizeof(T) * x_grad->numel());
phi::Copy<CPUContext>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
phi::funcs::MaxPoolGrad<T> grad_functor;
for (int i = 0; i < kernel_size; i++) {
for (int j = 0; j < counter[i]; j++) {
int in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
for (int c = 0; c < channels; c++) {
grad_functor.compute(in_features_ptr[in_i * channels + c],
out_features_ptr[out_i * channels + c],
......@@ -60,6 +74,21 @@ void MaxPoolGradKernel(const Context& dev_ctx,
}
}
template <typename T, typename Context>
void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
SparseCooTensor* x_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolGradCPUKernel", ([&] {
MaxPoolGradCPUKernel<T, data_t>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad);
}));
}
} // namespace sparse
} // namespace phi
......
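The loop above routes gradients through the rulebook pairs (in_i, out_i); per channel, an input position only receives gradient where its value equals the pooled maximum. A standalone sketch of that rule, assuming the MaxPoolGrad functor behaves this way:

```cpp
// Hedged sketch of the max-pool backward rule applied per rulebook pair:
// the input gets the output gradient only if it produced the max.
#include <iostream>
#include <vector>

template <typename T>
void MaxPoolGradPair(T in_val, T out_val, T out_grad, T* in_grad) {
  *in_grad += (in_val == out_val) ? out_grad : static_cast<T>(0);
}

int main() {
  std::vector<float> in = {1.f, 3.f, 2.f};   // inputs covered by one output
  float out = 3.f, out_grad = 0.5f;          // pooled max and its gradient
  std::vector<float> in_grad(in.size(), 0.f);
  for (size_t i = 0; i < in.size(); ++i) {
    MaxPoolGradPair(in[i], out, out_grad, &in_grad[i]);
  }
  std::cout << in_grad[1] << "\n";           // only the max position gets 0.5
  return 0;
}
```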
......@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
......@@ -27,15 +29,15 @@ namespace sparse {
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
template <typename T, typename IntT = int>
void MaxPoolCPUKernel(const CPUContext& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes =
......@@ -51,22 +53,22 @@ void MaxPoolKernel(const Context& dev_ctx,
const T* in_features_ptr = x.non_zero_elements().data<T>();
// 1. product rule book
ProductRuleBook<T, Context>(dev_ctx,
x,
real_kernel_sizes,
paddings,
dilations,
strides,
out_dims,
false,
rulebook,
&counter_per_kernel);
UpdateRulebookAndOutIndex<T>(
ProductRuleBook<T, CPUContext, IntT>(dev_ctx,
x,
real_kernel_sizes,
paddings,
dilations,
strides,
out_dims,
false,
rulebook,
&counter_per_kernel);
UpdateRulebookAndOutIndex<T, CPUContext, IntT>(
dev_ctx, x, kernel_size, in_channels, out_dims, rulebook, out);
int rulebook_len = rulebook->dims()[1];
const int* rulebook_ptr = rulebook->data<int>();
const IntT* rulebook_ptr = rulebook->data<IntT>();
const int* counter_ptr = counter_per_kernel.data<int>();
std::vector<int> offsets(kernel_size + 1);
......@@ -78,8 +80,8 @@ void MaxPoolKernel(const Context& dev_ctx,
phi::funcs::MaxPool<T> max_pool_functor;
for (int i = 0; i < kernel_size; i++) {
for (int j = 0; j < counter_ptr[i]; j++) {
int in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
int out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
IntT in_i = rulebook_ptr[rulebook_len + offsets[i] + j];
IntT out_i = rulebook_ptr[rulebook_len * 2 + offsets[i] + j];
if (!out_flags[out_i]) {
out_flags[out_i] = true;
memcpy(&out_features_ptr[out_i * in_channels],
......@@ -95,6 +97,28 @@ void MaxPoolKernel(const Context& dev_ctx,
}
}
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolCPUKernel", ([&] {
MaxPoolCPUKernel<T, data_t>(dev_ctx,
x,
kernel_sizes,
paddings,
dilations,
strides,
out,
rulebook);
}));
}
} // namespace sparse
} // namespace phi
......
......@@ -24,6 +24,8 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
......@@ -35,24 +37,24 @@ namespace sparse {
//]
// x_grad = out_grad * transpose(kernel)
// kernel_grad = transpose(x) * out_grad
template <typename T, typename Context>
void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
template <typename T, typename IntT>
void Conv3dGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
const auto& kernel_dims = kernel.dims();
const int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
const int* rulebook_ptr = rulebook.data<int>();
const IntT* rulebook_ptr = rulebook.data<IntT>();
const int rulebook_len = rulebook.dims()[1];
......@@ -74,29 +76,29 @@ void Conv3dGradKernel(const Context& dev_ctx,
T* out_grad_features_ptr = out_grad_features.data<T>();
*kernel_grad = phi::EmptyLike<T>(dev_ctx, kernel);
T* d_kernel_ptr = kernel_grad->data<T>();
phi::funcs::SetConstant<Context, T> set_zero;
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, kernel_grad, static_cast<T>(0.0f));
int half_kernel_size = kernel_size / 2;
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
DenseTensor x_grad_indices =
phi::EmptyLike<int>(dev_ctx, x.non_zero_indices());
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
T* x_grad_values_ptr = x_grad_values.data<T>();
set_zero(dev_ctx, &x_grad_values, static_cast<T>(0.0f));
set_zero(dev_ctx, &d_x_features, static_cast<T>(0.0f));
phi::Copy<Context>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
phi::Copy<GPUContext>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0),
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0),
h_counter(rulebook_len, 0);
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
rulebook_ptr,
rulebook_len * sizeof(int),
rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
......@@ -109,7 +111,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
for (int i = 0; i < rulebook_len; i++) {
counter[h_counter[i]] += 1;
}
int offset = 0, max_count = 0;
IntT offset = 0, max_count = 0;
for (int i = 0; i < kernel_size; i++) {
offsets[i] = offset;
offset += counter[i];
......@@ -120,15 +122,16 @@ void Conv3dGradKernel(const Context& dev_ctx,
offsets[kernel_size] = offset;
if (subm) {
phi::funcs::sparse::SubmPreProcess<T, Context>(dev_ctx,
x,
kernel,
out_grad.non_zero_elements(),
in_channels,
out_channels,
half_kernel_size,
kernel_grad,
&x_grad_values);
phi::funcs::sparse::SubmPreProcess<T, GPUContext>(
dev_ctx,
x,
kernel,
out_grad.non_zero_elements(),
in_channels,
out_channels,
half_kernel_size,
kernel_grad,
&x_grad_values);
if (max_count == 0) {
return;
}
......@@ -136,21 +139,21 @@ void Conv3dGradKernel(const Context& dev_ctx,
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * in_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len,
in_features_ptr,
rulebook_len,
in_channels);
GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len,
in_features_ptr,
rulebook_len,
in_channels);
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * out_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
out_grad.non_zero_elements().data<T>(),
rulebook_ptr + rulebook_len * 2,
out_grad_features_ptr,
......@@ -203,15 +206,19 @@ void Conv3dGradKernel(const Context& dev_ctx,
// x_grad->ResizeAndAllocate(x.non_zero_elements().dims());
DenseTensorMeta index_meta(DataType::INT32, {rulebook_len}, DataLayout::NCHW);
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_key = phi::Empty(
dev_ctx,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<IntT>::Type(),
{rulebook_len},
DataLayout::NCHW));
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
SortedAndUniqueIndex(dev_ctx,
rulebook_ptr + rulebook_len,
rulebook_len,
&out_index,
&unique_key,
&unique_value);
SortedAndUniqueIndex<GPUContext, IntT>(dev_ctx,
rulebook_ptr + rulebook_len,
rulebook_len,
&out_index,
&unique_key,
&unique_value);
config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, rulebook_len * in_channels, 1);
......@@ -229,6 +236,36 @@ void Conv3dGradKernel(const Context& dev_ctx,
subm);
}
template <typename T, typename Context>
void Conv3dGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const DenseTensor& rulebook,
const SparseCooTensor& out_grad,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* x_grad,
DenseTensor* kernel_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dGradGPUKernel", ([&] {
Conv3dGradGPUKernel<T, data_t>(dev_ctx,
x,
kernel,
rulebook,
out_grad,
paddings,
dilations,
strides,
groups,
subm,
x_grad,
kernel_grad);
}));
}
} // namespace sparse
} // namespace phi
......
......@@ -19,29 +19,25 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
template <typename T, typename IntT>
void Conv3dGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
// update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight)
const auto& x_dims = x.dims();
const auto& kernel_dims = kernel.dims();
int kernel_size = kernel_dims[0] * kernel_dims[1] * kernel_dims[2];
......@@ -67,7 +63,6 @@ void Conv3dKernel(const Context& dev_ctx,
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(offsets_meta));
DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
std::vector<int> subm_paddings(paddings), subm_strides(strides);
......@@ -75,28 +70,26 @@ void Conv3dKernel(const Context& dev_ctx,
phi::funcs::sparse::ResetSubmKernelSizeAndStrides(
kernel.dims(), &subm_paddings, &subm_strides);
}
int n = ProductRuleBook<T, Context>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_key,
&unique_value,
out,
&h_counter,
&offsets);
int n = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
x,
kernel_sizes,
subm_paddings,
dilations,
subm_strides,
out_dims,
subm,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_value,
out,
&h_counter,
&offsets);
const int* counter_ptr = counter_per_kernel.data<int>();
const int* offsets_ptr = counter_per_kernel.data<int>();
const int* rulebook_ptr = rulebook->data<int>();
const IntT* rulebook_ptr = rulebook->data<IntT>();
// 2. gather
DenseTensorMeta in_features_meta(
......@@ -109,22 +102,22 @@ void Conv3dKernel(const Context& dev_ctx,
phi::Empty(dev_ctx, std::move(out_features_meta));
T* in_features_ptr = in_features.data<T>();
T* out_features_ptr = out_features.data<T>();
phi::funcs::SetConstant<Context, T> set_zero;
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &out_features, static_cast<T>(0.0f));
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, n * in_channels, 1);
GatherKernel<T, int><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + n,
in_features_ptr,
n,
in_channels);
GatherKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x.non_zero_elements().data<T>(),
rulebook_ptr + n,
in_features_ptr,
n,
in_channels);
// 3. call gemm for every weight
auto blas = phi::funcs::GetBlas<Context, T>(dev_ctx);
auto blas = phi::funcs::GetBlas<GPUContext, T>(dev_ctx);
auto* out_values = out->mutable_non_zero_elements();
T* out_values_ptr = out_values->data<T>();
......@@ -168,6 +161,36 @@ void Conv3dKernel(const Context& dev_ctx,
out_channels,
out_values_ptr);
}
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& kernel,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const int groups,
const bool subm,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "Conv3dGPUKernel", ([&] {
Conv3dGPUKernel<T, data_t>(dev_ctx,
x,
kernel,
paddings,
dilations,
strides,
groups,
subm,
out,
rulebook);
}));
}
} // namespace sparse
} // namespace phi
......
......@@ -12,24 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
template <typename T>
template <typename T, typename IntT = int>
__global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
const T* out_features_ptr,
const T* out_grad_ptr,
const int* rulebook_ptr,
const IntT* rulebook_ptr,
const int n,
const int rulebook_len,
const int channels,
......@@ -38,8 +42,8 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) {
int real_i = i / channels;
int c = i - real_i * channels;
int in_i = rulebook_ptr[real_i];
int out_i = rulebook_ptr[real_i + rulebook_len];
IntT in_i = rulebook_ptr[real_i];
IntT out_i = rulebook_ptr[real_i + rulebook_len];
grad_functor.compute(in_features_ptr[in_i * channels + c],
out_features_ptr[out_i * channels + c],
out_grad_ptr[out_i * channels + c],
......@@ -48,23 +52,23 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
}
}
template <typename T, typename Context>
void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const DenseTensor& out_grad,
const std::vector<int>& kernel_sizes,
DenseTensor* x_grad) {
template <typename T, typename IntT = int>
void MaxPoolGradGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
SparseCooTensor* x_grad) {
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const int in_channels = x.dims()[4];
int rulebook_len = rulebook.dims()[1];
const int* rulebook_ptr = rulebook.data<int>();
std::vector<int> offsets(kernel_size + 1), counter(kernel_size, 0),
const IntT* rulebook_ptr = rulebook.data<IntT>();
std::vector<IntT> offsets(kernel_size + 1), counter(kernel_size, 0),
h_counter(kernel_size);
phi::backends::gpu::GpuMemcpyAsync(&h_counter[0],
rulebook_ptr,
rulebook_len * sizeof(int),
rulebook_len * sizeof(IntT),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost,
#else
......@@ -80,10 +84,20 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const T* in_features_ptr = x.non_zero_elements().data<T>();
const T* out_features_ptr = out.non_zero_elements().data<T>();
const T* out_grad_ptr = out_grad.data<T>();
T* x_grad_ptr = x_grad->data<T>();
phi::funcs::SetConstant<Context, T> set_zero;
set_zero(dev_ctx, x_grad, static_cast<T>(0.0f));
const T* out_grad_ptr = out_grad.non_zero_elements().data<T>();
// TODO(zhangkaihuo): call phi::sparse::EmptyLike
DenseTensor x_grad_indices =
phi::EmptyLike<IntT>(dev_ctx, x.non_zero_indices());
DenseTensor x_grad_values = phi::EmptyLike<T>(dev_ctx, x.non_zero_elements());
x_grad->SetMember(x_grad_indices, x_grad_values, x.dims(), true);
T* x_grad_ptr = x_grad_values.data<T>();
phi::funcs::SetConstant<GPUContext, T> set_zero;
set_zero(dev_ctx, &x_grad_values, static_cast<T>(0.0f));
phi::Copy<GPUContext>(dev_ctx,
x.non_zero_indices(),
dev_ctx.GetPlace(),
false,
&x_grad_indices);
for (int i = 0; i < kernel_size; i++) {
if (counter[i] <= 0) {
......@@ -92,10 +106,10 @@ void MaxPoolGradKernel(const Context& dev_ctx,
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1);
MaxPoolGradCudaKernel<T><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
MaxPoolGradCudaKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
in_features_ptr,
out_features_ptr,
out_grad_ptr,
......@@ -107,6 +121,21 @@ void MaxPoolGradKernel(const Context& dev_ctx,
}
}
template <typename T, typename Context>
void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
SparseCooTensor* x_grad) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolGradGPUKernel", ([&] {
MaxPoolGradGPUKernel<T, data_t>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, x_grad);
}));
}
} // namespace sparse
} // namespace phi
......
......@@ -12,19 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace phi {
namespace sparse {
template <typename T>
template <typename T, typename IntT = int>
__global__ void MaxPoolCudaKernel(const T* in_features_ptr,
const int* rulebook_ptr,
const IntT* rulebook_ptr,
const int n,
const int rulebook_len,
const int channels,
......@@ -33,8 +36,8 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
CUDA_KERNEL_LOOP_TYPE(i, n * channels, int64_t) {
int real_i = i / channels;
int channel_i = i - real_i * channels;
int in_i = rulebook_ptr[real_i];
int out_i = rulebook_ptr[real_i + rulebook_len];
IntT in_i = rulebook_ptr[real_i];
IntT out_i = rulebook_ptr[real_i + rulebook_len];
max_pool_functor.compute(in_features_ptr[in_i * channels + channel_i],
&out_features_ptr[out_i * channels + channel_i]);
}
......@@ -45,15 +48,15 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
template <typename T, typename IntT = int>
void MaxPoolGPUKernel(const GPUContext& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
const auto& x_dims = x.dims();
int kernel_size = kernel_sizes[0] * kernel_sizes[1] * kernel_sizes[2];
const std::vector<int>& real_kernel_sizes =
......@@ -70,29 +73,27 @@ void MaxPoolKernel(const Context& dev_ctx,
DenseTensor offsets_per_kernel = phi::Empty(dev_ctx, std::move(counter_meta));
DenseTensorMeta index_meta(DataType::INT32, {1}, DataLayout::NCHW);
DenseTensor out_index = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_key = phi::Empty(dev_ctx, std::move(index_meta));
DenseTensor unique_value = phi::Empty(dev_ctx, std::move(index_meta));
// 1. product rulebook
int rulebook_len = ProductRuleBook<T, Context>(dev_ctx,
x,
real_kernel_sizes,
paddings,
dilations,
strides,
out_dims,
false,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_key,
&unique_value,
out,
&counter,
&offsets);
const int* rulebook_ptr = rulebook->data<int>();
int rulebook_len = ProductRuleBook<T, GPUContext, IntT>(dev_ctx,
x,
real_kernel_sizes,
paddings,
dilations,
strides,
out_dims,
false,
rulebook,
&counter_per_kernel,
&offsets_per_kernel,
&out_index,
&unique_value,
out,
&counter,
&offsets);
const IntT* rulebook_ptr = rulebook->data<IntT>();
T* out_features_ptr = out->mutable_non_zero_elements()->data<T>();
const T* in_features_ptr = x.non_zero_elements().data<T>();
......@@ -113,10 +114,10 @@ void MaxPoolKernel(const Context& dev_ctx,
auto config = phi::backends::gpu::GetGpuLaunchConfig1D(
dev_ctx, counter[i] * in_channels, 1);
MaxPoolCudaKernel<T><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
MaxPoolCudaKernel<T, IntT><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
in_features_ptr,
rulebook_ptr + offsets[i] + rulebook_len,
counter[i],
......@@ -126,6 +127,28 @@ void MaxPoolKernel(const Context& dev_ctx,
}
}
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& kernel_sizes,
const std::vector<int>& paddings,
const std::vector<int>& dilations,
const std::vector<int>& strides,
SparseCooTensor* out,
DenseTensor* rulebook) {
PD_DISPATCH_INTEGRAL_TYPES(
x.non_zero_indices().dtype(), "MaxPoolGPUKernel", ([&] {
MaxPoolGPUKernel<T, data_t>(dev_ctx,
x,
kernel_sizes,
paddings,
dilations,
strides,
out,
rulebook);
}));
}
} // namespace sparse
} // namespace phi
......
......@@ -26,20 +26,18 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const DenseTensor& out_grad,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes,
DenseTensor* x_grad);
SparseCooTensor* x_grad);
template <typename T, typename Context>
DenseTensor MaxPoolGrad(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const DenseTensor& out_grad,
const std::vector<int>& kernel_sizes) {
DenseTensor x_grad = phi::Empty<Context>(
dev_ctx,
DenseTensorMeta(x.dtype(), x.non_zero_elements().dims(), x.layout()));
SparseCooTensor MaxPoolGrad(const Context& dev_ctx,
const SparseCooTensor& x,
const DenseTensor& rulebook,
const SparseCooTensor& out,
const SparseCooTensor& out_grad,
const std::vector<int>& kernel_sizes) {
SparseCooTensor x_grad;
MaxPoolGradKernel<T, Context>(
dev_ctx, x, rulebook, out, out_grad, kernel_sizes, &x_grad);
return x_grad;
......
......@@ -39,11 +39,7 @@ SparseCooTensor MaxPool(const Context& dev_ctx,
const std::vector<int>& dilations,
const std::vector<int>& strides,
DenseTensor* rulebook) {
DenseTensor indices = phi::Empty<Context>(
dev_ctx, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
DenseTensor values =
phi::Empty<Context>(dev_ctx, DenseTensorMeta(x.dtype(), {1}, x.layout()));
SparseCooTensor coo(indices, values, x.dims());
SparseCooTensor coo;
MaxPoolKernel<T, Context>(
dev_ctx, x, kernel_sizes, paddings, dilations, strides, &coo, rulebook);
return coo;
......
......@@ -48,13 +48,13 @@ std::vector<T2> cast(const std::vector<T1>& in) {
return out;
}
template <typename T>
void TestConv3dBase(const std::vector<int>& indices,
template <typename T, typename IntT = int>
void TestConv3dBase(const std::vector<IntT>& indices,
const std::vector<T>& features,
const DDim& x_dims,
const std::vector<T>& kernel,
const DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<IntT>& correct_out_indices,
const std::vector<T>& correct_out_features,
const DDim& correct_out_dims,
const int non_zero_num,
......@@ -80,11 +80,13 @@ void TestConv3dBase(const std::vector<int>& indices,
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
auto indices_dtype = paddle::experimental::CppTypeToDataType<IntT>::Type();
DenseTensor indices_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
memcpy(
indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
memcpy(indices_tensor.data<IntT>(),
indices.data(),
indices.size() * sizeof(IntT));
DenseTensor features_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
......@@ -111,7 +113,7 @@ void TestConv3dBase(const std::vector<int>& indices,
if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook = phi::Empty(
dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
dev_ctx_cpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
SparseCooTensor out = sparse::Conv3d<T>(dev_ctx_cpu,
x_tensor,
kernel_tensor,
......@@ -129,8 +131,8 @@ void TestConv3dBase(const std::vector<int>& indices,
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz());
int cmp_indices = memcmp(correct_out_indices.data(),
out.non_zero_indices().data<int>(),
correct_out_indices.size() * sizeof(int));
out.non_zero_indices().data<IntT>(),
correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices, 0);
f_verify(out.non_zero_elements().data<T>(), correct_out_features);
......@@ -172,7 +174,7 @@ void TestConv3dBase(const std::vector<int>& indices,
DenseTensor d_indices_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
phi::Copy(
dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
......@@ -195,7 +197,7 @@ void TestConv3dBase(const std::vector<int>& indices,
dev_ctx_gpu, kernel_tensor, phi::GPUPlace(), true, &d_kernel_tensor);
DenseTensor d_rulebook = phi::Empty(
dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
dev_ctx_gpu, DenseTensorMeta(indices_dtype, {1}, DataLayout::NCHW));
SparseCooTensor d_out = sparse::Conv3d<T>(dev_ctx_gpu,
d_x_tensor,
d_kernel_tensor,
......@@ -214,7 +216,7 @@ void TestConv3dBase(const std::vector<int>& indices,
DenseTensor h_indices_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW));
phi::Copy(dev_ctx_gpu,
d_out.non_zero_indices(),
phi::CPUPlace(),
......@@ -222,8 +224,8 @@ void TestConv3dBase(const std::vector<int>& indices,
&h_indices_tensor);
int cmp_indices2 = memcmp(correct_out_indices.data(),
h_indices_tensor.data<int>(),
correct_out_indices.size() * sizeof(int));
h_indices_tensor.data<IntT>(),
correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices2, 0);
DenseTensor h_features_tensor =
......@@ -264,12 +266,13 @@ void TestConv3dBase(const std::vector<int>& indices,
#endif
}
void TestConv3d(const std::vector<int>& indices,
template <typename IntT = int>
void TestConv3d(const std::vector<IntT>& indices,
const std::vector<float>& features,
const DDim& x_dims,
const std::vector<float>& kernel,
const DDim& kernel_dims,
const std::vector<int>& correct_out_indices,
const std::vector<IntT>& correct_out_indices,
const std::vector<float>& correct_out_features,
const DDim& correct_out_dims,
const int non_zero_num,
......@@ -282,41 +285,41 @@ void TestConv3d(const std::vector<int>& indices,
const std::vector<float> kernel_grad = {},
const bool subm = false) {
// test float
TestConv3dBase<float>(indices,
features,
x_dims,
kernel,
kernel_dims,
correct_out_indices,
correct_out_features,
correct_out_dims,
non_zero_num,
paddings,
strides,
dilations,
diff,
backward,
features_grad,
kernel_grad,
subm);
TestConv3dBase<float, IntT>(indices,
features,
x_dims,
kernel,
kernel_dims,
correct_out_indices,
correct_out_features,
correct_out_dims,
non_zero_num,
paddings,
strides,
dilations,
diff,
backward,
features_grad,
kernel_grad,
subm);
// test double
TestConv3dBase<double>(indices,
cast<float, double>(features),
x_dims,
cast<float, double>(kernel),
kernel_dims,
correct_out_indices,
cast<float, double>(correct_out_features),
correct_out_dims,
non_zero_num,
paddings,
strides,
dilations,
diff,
backward,
cast<float, double>(features_grad),
cast<float, double>(kernel_grad),
subm);
TestConv3dBase<double, IntT>(indices,
cast<float, double>(features),
x_dims,
cast<float, double>(kernel),
kernel_dims,
correct_out_indices,
cast<float, double>(correct_out_features),
correct_out_dims,
non_zero_num,
paddings,
strides,
dilations,
diff,
backward,
cast<float, double>(features_grad),
cast<float, double>(kernel_grad),
subm);
}
TEST(DEV_API, sparse_conv3d) {
......@@ -616,6 +619,51 @@ TEST(DEV_API, sparse_conv2d) {
dilations);
}
TEST(DEV_API, sparse_conv2d_int64) {
const int in_channels = 1;
const int out_channels = 1;
DDim x_dims = {1, 1, 5, 5, in_channels};
DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
DDim out_dims = {1, 1, 3, 3, out_channels};
std::vector<int> paddings = {0, 0, 0};
std::vector<int> strides = {1, 1, 1};
std::vector<int> dilations = {1, 1, 1};
const int non_zero_num = 3;
std::vector<int64_t> indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4};
std::vector<float> features = {-0.79394531, -0.3125, -0.55029297};
// 3*3*3=27
std::vector<float> kernel = {0.65820312,
0.75048828,
0.21411133,
0.17370605,
0.85546875,
0.53076172,
0.28833008,
0.71044922,
0.00659943};
std::vector<int64_t> out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 2, 2, 2, 1, 2, 0, 1, 2};
std::vector<float> out_features = {
-0.17004, -0.71338, -0.00206, -0.22205, -0.09009};
TestConv3d<int64_t>(indices_flatten,
features,
x_dims,
kernel,
kernel_dims,
out_indices_flatten,
out_features,
out_dims,
non_zero_num,
paddings,
strides,
dilations);
}
TEST(DEV_API, sparse_conv3d_backward) {
const int in_channels = 1;
const int out_channels = 1;
......
......@@ -36,11 +36,11 @@ std::vector<T2> cast(const std::vector<T1>& in) {
}
return out;
}
template <typename T>
void TestMaxPoolBase(const std::vector<int>& indices,
template <typename T, typename IntT = int>
void TestMaxPoolBase(const std::vector<IntT>& indices,
const std::vector<T>& features,
const DDim& x_dims,
const std::vector<int>& correct_out_indices,
const std::vector<IntT>& correct_out_indices,
const std::vector<T>& correct_out_features,
const DDim& correct_out_dims,
const int non_zero_num,
......@@ -65,11 +65,13 @@ void TestMaxPoolBase(const std::vector<int>& indices,
const int in_channels = x_dims[4];
const int out_channels = in_channels;
auto indices_dtype = paddle::experimental::CppTypeToDataType<IntT>::Type();
DenseTensor indices_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
memcpy(
indices_tensor.data<int>(), indices.data(), indices.size() * sizeof(int));
DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
memcpy(indices_tensor.data<IntT>(),
indices.data(),
indices.size() * sizeof(IntT));
DenseTensor features_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(paddle::experimental::CppTypeToDataType<T>::Type(),
......@@ -88,8 +90,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
};
if (!std::is_same<T, phi::dtype::float16>::value) {
DenseTensor rulebook = phi::Empty(
dev_ctx_cpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
DenseTensor rulebook;
SparseCooTensor out = sparse::MaxPool<T>(dev_ctx_cpu,
x_tensor,
kernel_sizes,
......@@ -105,20 +106,16 @@ void TestMaxPoolBase(const std::vector<int>& indices,
ASSERT_EQ((int64_t)correct_out_features.size() / out_channels, out.nnz());
int cmp_indices = memcmp(correct_out_indices.data(),
out.non_zero_indices().data<int>(),
correct_out_indices.size() * sizeof(int));
out.non_zero_indices().data<IntT>(),
correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices, 0);
f_verify(out.non_zero_elements().data<T>(), correct_out_features);
if (backward) {
DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_cpu,
x_tensor,
rulebook,
out,
out.non_zero_elements(),
kernel_sizes);
f_verify(x_grad.data<T>(), features_grad);
SparseCooTensor x_grad = sparse::MaxPoolGrad<T>(
dev_ctx_cpu, x_tensor, rulebook, out, out, kernel_sizes);
f_verify(x_grad.non_zero_elements().data<T>(), features_grad);
}
}
......@@ -142,7 +139,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
DenseTensor d_indices_tensor = phi::Empty(
dev_ctx_gpu,
DenseTensorMeta(DataType::INT32, {4, non_zero_num}, DataLayout::NCHW));
DenseTensorMeta(indices_dtype, {4, non_zero_num}, DataLayout::NCHW));
phi::Copy(
dev_ctx_gpu, indices_tensor, phi::GPUPlace(), true, &d_indices_tensor);
......@@ -153,8 +150,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
SparseCooTensor d_x_tensor(d_indices_tensor, d_features_tensor, x_dims);
DenseTensor d_rulebook = phi::Empty(
dev_ctx_gpu, DenseTensorMeta(DataType::INT32, {1}, DataLayout::NCHW));
DenseTensor d_rulebook;
SparseCooTensor d_out = sparse::MaxPool<T>(dev_ctx_gpu,
d_x_tensor,
kernel_sizes,
......@@ -171,7 +167,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
DenseTensor h_indices_tensor = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(DataType::INT32, {4, d_out.nnz()}, DataLayout::NCHW));
DenseTensorMeta(indices_dtype, {4, d_out.nnz()}, DataLayout::NCHW));
phi::Copy(dev_ctx_gpu,
d_out.non_zero_indices(),
phi::CPUPlace(),
......@@ -179,8 +175,8 @@ void TestMaxPoolBase(const std::vector<int>& indices,
&h_indices_tensor);
int cmp_indices2 = memcmp(correct_out_indices.data(),
h_indices_tensor.data<int>(),
correct_out_indices.size() * sizeof(int));
h_indices_tensor.data<IntT>(),
correct_out_indices.size() * sizeof(IntT));
ASSERT_EQ(cmp_indices2, 0);
DenseTensor h_features_tensor =
......@@ -194,23 +190,25 @@ void TestMaxPoolBase(const std::vector<int>& indices,
f_verify(h_features_tensor.data<T>(), correct_out_features);
if (backward) {
DenseTensor x_grad = sparse::MaxPoolGrad<T>(dev_ctx_gpu,
d_x_tensor,
d_rulebook,
d_out,
d_out.non_zero_elements(),
kernel_sizes);
DenseTensor h_features_grad = phi::EmptyLike<T>(dev_ctx_cpu, x_grad);
phi::Copy(dev_ctx_gpu, x_grad, phi::CPUPlace(), true, &h_features_grad);
SparseCooTensor x_grad = sparse::MaxPoolGrad<T>(
dev_ctx_gpu, d_x_tensor, d_rulebook, d_out, d_out, kernel_sizes);
DenseTensor h_features_grad =
phi::EmptyLike<T>(dev_ctx_cpu, x_grad.non_zero_elements());
phi::Copy(dev_ctx_gpu,
x_grad.non_zero_elements(),
phi::CPUPlace(),
true,
&h_features_grad);
f_verify(h_features_grad.data<T>(), features_grad);
}
#endif
}
void TestMaxPool(const std::vector<int>& indices,
template <typename IntT = int>
void TestMaxPool(const std::vector<IntT>& indices,
const std::vector<float>& features,
const DDim& x_dims,
const std::vector<int>& correct_out_indices,
const std::vector<IntT>& correct_out_indices,
const std::vector<float>& correct_out_features,
const DDim& correct_out_dims,
const int non_zero_num,
......@@ -222,35 +220,35 @@ void TestMaxPool(const std::vector<int>& indices,
const bool backward = false,
const std::vector<float> features_grad = {}) {
// test float
TestMaxPoolBase<float>(indices,
features,
x_dims,
correct_out_indices,
correct_out_features,
correct_out_dims,
non_zero_num,
kernel_sizes,
paddings,
strides,
dilations,
diff,
backward,
features_grad);
TestMaxPoolBase<float, IntT>(indices,
features,
x_dims,
correct_out_indices,
correct_out_features,
correct_out_dims,
non_zero_num,
kernel_sizes,
paddings,
strides,
dilations,
diff,
backward,
features_grad);
// test double
TestMaxPoolBase<double>(indices,
cast<float, double>(features),
x_dims,
correct_out_indices,
cast<float, double>(correct_out_features),
correct_out_dims,
non_zero_num,
kernel_sizes,
paddings,
strides,
dilations,
diff,
backward,
cast<float, double>(features_grad));
TestMaxPoolBase<double, IntT>(indices,
cast<float, double>(features),
x_dims,
correct_out_indices,
cast<float, double>(correct_out_features),
correct_out_dims,
non_zero_num,
kernel_sizes,
paddings,
strides,
dilations,
diff,
backward,
cast<float, double>(features_grad));
}
TEST(DEV_API, sparse_maxpool) {
......