Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
5d3fd4fe
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
5d3fd4fe
编写于
4月 02, 2022
作者:
Z
zhangkaihuo
提交者:
GitHub
4月 02, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
Sparse conv and pool support indices as template (#41137)
上级
66d1b1f6
变更
18
展开全部
隐藏空白更改
内联
并排
Showing
18 changed file
with
862 addition
and
589 deletion
+862
-589
paddle/phi/kernels/empty_kernel.cc
paddle/phi/kernels/empty_kernel.cc
+4
-0
paddle/phi/kernels/funcs/sparse/convolution.h
paddle/phi/kernels/funcs/sparse/convolution.h
+20
-17
paddle/phi/kernels/sparse/convolution_grad_kernel.h
paddle/phi/kernels/sparse/convolution_grad_kernel.h
+2
-2
paddle/phi/kernels/sparse/convolution_kernel.h
paddle/phi/kernels/sparse/convolution_kernel.h
+1
-5
paddle/phi/kernels/sparse/cpu/convolution.h
paddle/phi/kernels/sparse/cpu/convolution.h
+39
-36
paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
+81
-50
paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
+62
-34
paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
+42
-13
paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
+48
-24
paddle/phi/kernels/sparse/gpu/convolution.cu.h
paddle/phi/kernels/sparse/gpu/convolution.cu.h
+122
-119
paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
+90
-53
paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
+70
-47
paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
+53
-24
paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
+61
-38
paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h
paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h
+9
-11
paddle/phi/kernels/sparse/sparse_pool_kernel.h
paddle/phi/kernels/sparse/sparse_pool_kernel.h
+1
-5
paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
+98
-50
paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
+59
-61
未找到文件。
paddle/phi/kernels/empty_kernel.cc
浏览文件 @
5d3fd4fe
...
...
@@ -45,6 +45,7 @@ PD_REGISTER_KERNEL(empty,
phi
::
EmptyKernel
,
float
,
double
,
int8_t
,
uint8_t
,
int16_t
,
int
,
...
...
@@ -61,6 +62,7 @@ PD_REGISTER_KERNEL(empty_like,
phi
::
EmptyLikeKernel
,
float
,
double
,
int8_t
,
uint8_t
,
int16_t
,
int
,
...
...
@@ -80,6 +82,7 @@ PD_REGISTER_KERNEL(empty,
phi
::
EmptyKernel
,
float
,
double
,
int8_t
,
uint8_t
,
int16_t
,
int
,
...
...
@@ -95,6 +98,7 @@ PD_REGISTER_KERNEL(empty_like,
phi
::
EmptyLikeKernel
,
float
,
double
,
int8_t
,
uint8_t
,
int16_t
,
int
,
...
...
paddle/phi/kernels/funcs/sparse/convolution.h
浏览文件 @
5d3fd4fe
...
...
@@ -33,28 +33,30 @@ struct Dims4D {
};
// Judge whether the current position x is in (lower, upper)
inline
HOSTDEVICE
bool
Check
(
const
int
&
x
,
template
<
typename
IntT
=
int
>
inline
HOSTDEVICE
bool
Check
(
const
IntT
&
x
,
const
int
&
kx
,
const
int
&
pad
,
const
int
&
stride
,
const
int
dilation
,
const
int
kdim
,
const
int
xdim
)
{
const
int
lower
=
x
-
dilation
*
kx
+
pad
;
const
int
uper
=
x
+
(
kdim
-
kx
-
1
)
*
dilation
-
pad
;
const
IntT
lower
=
x
-
dilation
*
kx
+
pad
;
const
IntT
uper
=
x
+
(
kdim
-
kx
-
1
)
*
dilation
-
pad
;
return
(
lower
>=
0
&&
lower
%
stride
==
0
&&
uper
<
xdim
);
}
// Check whether the current position(x, y, z) is legal:
// Judge the minimum and maximum values in each dimension
template
<
typename
IntT
=
int
>
inline
HOSTDEVICE
bool
Check
(
const
Dims4D
&
dims
,
const
Dims4D
&
kernel_dims
,
const
Dims4D
&
paddings
,
const
Dims4D
&
dilations
,
const
Dims4D
&
strides
,
const
int
x
,
const
int
y
,
const
int
z
,
const
IntT
x
,
const
IntT
y
,
const
IntT
z
,
const
int
kx
,
const
int
ky
,
const
int
kz
)
{
...
...
@@ -67,22 +69,22 @@ inline HOSTDEVICE bool Check(const Dims4D& dims,
return
(
x_valid
&&
y_valid
&&
z_valid
);
}
template
<
typename
Dim
>
inline
HOSTDEVICE
int
PointToIndex
(
const
int
&
batch
,
const
int
&
x
,
const
int
&
y
,
const
int
&
z
,
const
Dim
&
dims
)
{
template
<
typename
Dim
,
typename
IntT
=
int
>
inline
HOSTDEVICE
IntT
PointToIndex
(
const
IntT
&
batch
,
const
IntT
&
x
,
const
IntT
&
y
,
const
IntT
&
z
,
const
Dim
&
dims
)
{
return
batch
*
dims
[
1
]
*
dims
[
2
]
*
dims
[
3
]
+
z
*
dims
[
2
]
*
dims
[
3
]
+
y
*
dims
[
3
]
+
x
;
}
// TODO(zhangkaihuo): use division and multiply to optimize
// modulo operation
template
<
typename
Dim
>
template
<
typename
Dim
,
typename
IntT
=
int
>
inline
HOSTDEVICE
void
IndexToPoint
(
const
int
index
,
const
Dim
&
dims
,
int
*
batch
,
int
*
x
,
int
*
y
,
int
*
z
)
{
int
n
=
index
;
const
IntT
index
,
const
Dim
&
dims
,
IntT
*
batch
,
IntT
*
x
,
IntT
*
y
,
IntT
*
z
)
{
IntT
n
=
index
;
*
x
=
n
%
dims
[
3
];
n
/=
dims
[
3
];
*
y
=
n
%
dims
[
2
];
...
...
@@ -176,8 +178,9 @@ inline const std::vector<int> PoolResetKernel(
return
res
;
}
inline
void
PrefixSum
(
const
int
*
counter
,
int
*
offsets
,
const
int
n
)
{
int
offset
=
0
;
template
<
typename
T
>
inline
void
PrefixSum
(
const
T
*
counter
,
T
*
offsets
,
const
int
n
)
{
T
offset
=
0
;
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
offsets
[
i
]
=
offset
;
offset
+=
counter
[
i
];
...
...
paddle/phi/kernels/sparse/convolution_grad_kernel.h
浏览文件 @
5d3fd4fe
...
...
@@ -49,8 +49,8 @@ std::tuple<SparseCooTensor, DenseTensor> Conv3dGrad(
const
int
groups
,
const
bool
subm
)
{
SparseCooTensor
x_grad
;
DenseTensor
kernel_grad
=
phi
::
Empty
<
Context
>
(
dev_ctx
,
DenseTensorMeta
(
kernel
.
dtype
(),
{
1
},
kernel
.
layout
()));
DenseTensor
kernel_grad
;
// TODO(zhangkaihuo): call InferMeta func here
Conv3dGradKernel
<
T
,
Context
>
(
dev_ctx
,
x
,
...
...
paddle/phi/kernels/sparse/convolution_kernel.h
浏览文件 @
5d3fd4fe
...
...
@@ -45,11 +45,7 @@ SparseCooTensor Conv3d(const Context& dev_ctx,
const
int
groups
,
const
bool
subm
,
DenseTensor
*
rulebook
)
{
DenseTensor
indices
=
phi
::
Empty
<
Context
>
(
dev_ctx
,
DenseTensorMeta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
));
DenseTensor
values
=
phi
::
Empty
<
Context
>
(
dev_ctx
,
DenseTensorMeta
(
x
.
dtype
(),
{
1
},
x
.
layout
()));
SparseCooTensor
coo
(
indices
,
values
,
x
.
dims
());
SparseCooTensor
coo
;
Conv3dKernel
<
T
,
Context
>
(
dev_ctx
,
x
,
kernel
,
...
...
paddle/phi/kernels/sparse/cpu/convolution.h
浏览文件 @
5d3fd4fe
...
...
@@ -31,7 +31,7 @@ using Dims4D = phi::funcs::sparse::Dims4D;
// such as: kernel(3, 3, 3), kernel_size = 27
// counter_per_weight: (kernel_size)
// TODO(zhangkaihuo): optimize performance with multithreading
template
<
typename
T
,
typename
Context
>
template
<
typename
T
,
typename
Context
,
typename
IntT
=
int
>
void
ProductRuleBook
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
std
::
vector
<
int
>&
kernel_sizes
,
...
...
@@ -44,7 +44,7 @@ void ProductRuleBook(const Context& dev_ctx,
DenseTensor
*
counter_per_kernel
)
{
const
int64_t
non_zero_num
=
x
.
nnz
();
const
auto
&
non_zero_indices
=
x
.
non_zero_indices
();
const
int
*
indices_ptr
=
non_zero_indices
.
data
<
int
>
();
const
IntT
*
indices_ptr
=
non_zero_indices
.
data
<
IntT
>
();
int
*
counter_ptr
=
counter_per_kernel
->
data
<
int
>
();
int
kernel_size
=
kernel_sizes
[
0
]
*
kernel_sizes
[
1
]
*
kernel_sizes
[
2
];
memset
(
counter_ptr
,
0
,
kernel_size
*
sizeof
(
int
));
...
...
@@ -60,33 +60,33 @@ void ProductRuleBook(const Context& dev_ctx,
const
Dims4D
c_strides
(
1
,
strides
[
2
],
strides
[
1
],
strides
[
0
]);
const
Dims4D
c_dilations
(
1
,
dilations
[
2
],
dilations
[
1
],
dilations
[
0
]);
std
::
set
<
int
>
hash_in
;
std
::
set
<
IntT
>
hash_in
;
if
(
subm
)
{
for
(
int
i
=
0
;
i
<
non_zero_num
;
i
++
)
{
int
batch
=
indices_ptr
[
i
];
int
in_z
=
indices_ptr
[
i
+
non_zero_num
];
int
in_y
=
indices_ptr
[
i
+
2
*
non_zero_num
];
int
in_x
=
indices_ptr
[
i
+
3
*
non_zero_num
];
int
index
=
phi
::
funcs
::
sparse
::
PointToIndex
<
DDim
>
(
IntT
batch
=
indices_ptr
[
i
];
IntT
in_z
=
indices_ptr
[
i
+
non_zero_num
];
IntT
in_y
=
indices_ptr
[
i
+
2
*
non_zero_num
];
IntT
in_x
=
indices_ptr
[
i
+
3
*
non_zero_num
];
IntT
index
=
phi
::
funcs
::
sparse
::
PointToIndex
<
DDim
>
(
batch
,
in_x
,
in_y
,
in_z
,
x_dims
);
hash_in
.
insert
(
index
);
}
}
auto
f_calc_rulebook
=
[
&
](
int
*
rulebook_ptr
)
{
auto
f_calc_rulebook
=
[
&
](
IntT
*
rulebook_ptr
)
{
int
kernel_index
=
0
,
rulebook_index
=
0
;
for
(
int
kz
=
0
;
kz
<
kernel_sizes
[
0
];
kz
++
)
{
for
(
int
ky
=
0
;
ky
<
kernel_sizes
[
1
];
ky
++
)
{
for
(
int
kx
=
0
;
kx
<
kernel_sizes
[
2
];
kx
++
)
{
++
kernel_index
;
for
(
int64_t
i
=
0
;
i
<
non_zero_num
;
i
++
)
{
int
batch
=
indices_ptr
[
i
];
int
in_z
=
indices_ptr
[
i
+
non_zero_num
];
int
in_y
=
indices_ptr
[
i
+
2
*
non_zero_num
];
int
in_x
=
indices_ptr
[
i
+
3
*
non_zero_num
];
int
out_z
=
(
in_z
+
paddings
[
0
]
-
kz
*
dilations
[
0
])
/
strides
[
0
];
int
out_y
=
(
in_y
+
paddings
[
1
]
-
ky
*
dilations
[
1
])
/
strides
[
1
];
int
out_x
=
(
in_x
+
paddings
[
2
]
-
kx
*
dilations
[
2
])
/
strides
[
2
];
IntT
batch
=
indices_ptr
[
i
];
IntT
in_z
=
indices_ptr
[
i
+
non_zero_num
];
IntT
in_y
=
indices_ptr
[
i
+
2
*
non_zero_num
];
IntT
in_x
=
indices_ptr
[
i
+
3
*
non_zero_num
];
IntT
out_z
=
(
in_z
+
paddings
[
0
]
-
kz
*
dilations
[
0
])
/
strides
[
0
];
IntT
out_y
=
(
in_y
+
paddings
[
1
]
-
ky
*
dilations
[
1
])
/
strides
[
1
];
IntT
out_x
=
(
in_x
+
paddings
[
2
]
-
kx
*
dilations
[
2
])
/
strides
[
2
];
if
(
phi
::
funcs
::
sparse
::
Check
(
c_x_dims
,
c_kernel_dims
,
c_paddings
,
...
...
@@ -99,7 +99,7 @@ void ProductRuleBook(const Context& dev_ctx,
ky
,
kz
))
{
if
(
subm
)
{
int
out_index
=
phi
::
funcs
::
sparse
::
PointToIndex
<
DDim
>
(
IntT
out_index
=
phi
::
funcs
::
sparse
::
PointToIndex
<
DDim
>
(
batch
,
out_x
,
out_y
,
out_z
,
out_dims
);
if
(
hash_in
.
find
(
out_index
)
==
hash_in
.
end
())
{
continue
;
...
...
@@ -126,15 +126,16 @@ void ProductRuleBook(const Context& dev_ctx,
f_calc_rulebook
(
nullptr
);
// alloc the rulebook
DenseTensorMeta
rulebook_meta
(
DataType
::
INT32
,
{
3
,
rulebook_len
},
DataLayout
::
NCHW
);
rulebook
->
set_meta
(
rulebook_meta
);
dev_ctx
.
Alloc
(
rulebook
,
rulebook
->
dtype
(),
rulebook
->
numel
()
*
sizeof
(
int
));
int
*
rulebook_ptr
=
rulebook
->
data
<
int
>
();
*
rulebook
=
phi
::
Empty
(
dev_ctx
,
DenseTensorMeta
(
paddle
::
experimental
::
CppTypeToDataType
<
IntT
>::
Type
(),
{
3
,
rulebook_len
},
DataLayout
::
NCHW
));
IntT
*
rulebook_ptr
=
rulebook
->
data
<
IntT
>
();
f_calc_rulebook
(
rulebook_ptr
);
}
template
<
typename
T
,
typename
Context
>
template
<
typename
T
,
typename
Context
,
typename
IntT
=
int
>
void
UpdateRulebookAndOutIndex
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
int
kernel_size
,
...
...
@@ -142,9 +143,9 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
const
DDim
&
out_dims
,
DenseTensor
*
rulebook
,
SparseCooTensor
*
out
)
{
std
::
set
<
int
>
out_indexs
;
std
::
set
<
IntT
>
out_indexs
;
int
n
=
rulebook
->
dims
()[
1
];
int
*
rulebook_ptr
=
rulebook
->
data
<
int
>
();
IntT
*
rulebook_ptr
=
rulebook
->
data
<
IntT
>
();
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
out_indexs
.
insert
(
rulebook_ptr
[
i
+
n
*
2
]);
}
...
...
@@ -152,17 +153,19 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
int
out_non_zero_num
=
out_indexs
.
size
();
const
int64_t
sparse_dim
=
4
;
DenseTensorMeta
indices_meta
(
DataType
::
INT32
,
{
sparse_dim
,
out_non_zero_num
},
DataLayout
::
NCHW
);
paddle
::
experimental
::
CppTypeToDataType
<
IntT
>::
Type
(),
{
sparse_dim
,
out_non_zero_num
},
DataLayout
::
NCHW
);
DenseTensorMeta
values_meta
(
x
.
dtype
(),
{
out_non_zero_num
,
out_channels
},
x
.
non_zero_elements
().
layout
());
phi
::
DenseTensor
out_indices
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
indices_meta
));
phi
::
DenseTensor
out_values
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
values_meta
));
int
*
out_indices_ptr
=
out_indices
.
data
<
int
>
();
IntT
*
out_indices_ptr
=
out_indices
.
data
<
IntT
>
();
int
i
=
0
;
for
(
auto
it
=
out_indexs
.
begin
();
it
!=
out_indexs
.
end
();
it
++
,
i
++
)
{
const
int
index
=
*
it
;
int
batch
,
x
,
y
,
z
;
const
IntT
index
=
*
it
;
IntT
batch
,
x
,
y
,
z
;
phi
::
funcs
::
sparse
::
IndexToPoint
<
DDim
>
(
index
,
out_dims
,
&
batch
,
&
x
,
&
y
,
&
z
);
out_indices_ptr
[
i
]
=
batch
;
out_indices_ptr
[
i
+
out_non_zero_num
]
=
z
;
...
...
@@ -170,7 +173,7 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
out_indices_ptr
[
i
+
out_non_zero_num
*
3
]
=
x
;
}
for
(
i
=
0
;
i
<
n
;
i
++
)
{
int
out_index
=
rulebook_ptr
[
i
+
n
*
2
];
IntT
out_index
=
rulebook_ptr
[
i
+
n
*
2
];
rulebook_ptr
[
i
+
n
*
2
]
=
std
::
distance
(
out_indexs
.
begin
(),
out_indexs
.
find
(
out_index
));
}
...
...
@@ -178,20 +181,20 @@ void UpdateRulebookAndOutIndex(const Context& dev_ctx,
out
->
SetMember
(
out_indices
,
out_values
,
out_dims
,
true
);
}
template
<
typename
T
>
template
<
typename
T
,
typename
IntT
=
int
>
void
Gather
(
const
T
*
x
,
const
int
*
indexs
,
const
int
n
,
const
int
channels
,
T
*
out
)
{
const
T
*
x
,
const
IntT
*
indexs
,
const
int
n
,
const
int
channels
,
T
*
out
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
int
real_i
=
indexs
[
i
];
IntT
real_i
=
indexs
[
i
];
memcpy
(
out
+
i
*
channels
,
x
+
real_i
*
channels
,
channels
*
sizeof
(
T
));
}
}
template
<
typename
T
>
template
<
typename
T
,
typename
IntT
=
int
>
void
Scatter
(
const
T
*
x
,
const
int
*
indexs
,
const
int
n
,
const
int
channels
,
T
*
out
)
{
const
T
*
x
,
const
IntT
*
indexs
,
const
int
n
,
const
int
channels
,
T
*
out
)
{
for
(
int
i
=
0
;
i
<
n
;
i
++
)
{
int
real_i
=
indexs
[
i
];
IntT
real_i
=
indexs
[
i
];
for
(
int
j
=
0
;
j
<
channels
;
j
++
)
{
out
[
real_i
*
channels
+
j
]
+=
x
[
i
*
channels
+
j
];
}
...
...
paddle/phi/kernels/sparse/cpu/convolution_grad_kernel.cc
浏览文件 @
5d3fd4fe
...
...
@@ -18,6 +18,8 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace
phi
{
namespace
sparse
{
...
...
@@ -29,24 +31,24 @@ namespace sparse {
//]
// x_grad = out_grad * transpose(kernel)
// kernel_grad = transpose(x) * out_grad
template
<
typename
T
,
typename
Contex
t
>
void
Conv3dGrad
Kernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out_grad
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
x_grad
,
DenseTensor
*
kernel_grad
)
{
template
<
typename
T
,
typename
IntT
=
in
t
>
void
Conv3dGrad
CPUKernel
(
const
CPU
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out_grad
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
x_grad
,
DenseTensor
*
kernel_grad
)
{
const
auto
&
kernel_dims
=
kernel
.
dims
();
const
int
kernel_size
=
kernel_dims
[
0
]
*
kernel_dims
[
1
]
*
kernel_dims
[
2
];
const
int
in_channels
=
kernel_dims
[
3
];
const
int
out_channels
=
kernel_dims
[
4
];
const
int
*
rulebook_ptr
=
rulebook
.
data
<
int
>
();
const
IntT
*
rulebook_ptr
=
rulebook
.
data
<
IntT
>
();
const
int
rulebook_len
=
rulebook
.
dims
()[
1
];
...
...
@@ -66,32 +68,30 @@ void Conv3dGradKernel(const Context& dev_ctx,
T
*
in_features_ptr
=
in_features
.
data
<
T
>
();
T
*
d_x_features_ptr
=
d_x_features
.
data
<
T
>
();
T
*
out_grad_features_ptr
=
out_grad_features
.
data
<
T
>
();
kernel_grad
->
Resize
(
kernel_dims
);
dev_ctx
.
Alloc
(
kernel_grad
,
kernel_grad
->
dtype
(),
kernel_grad
->
numel
()
*
sizeof
(
T
));
*
kernel_grad
=
phi
::
EmptyLike
<
T
>
(
dev_ctx
,
kernel
);
T
*
d_kernel_ptr
=
kernel_grad
->
data
<
T
>
();
memset
(
d_kernel_ptr
,
0
,
sizeof
(
T
)
*
kernel_grad
->
numel
());
int
half_kernel_size
=
kernel_size
/
2
;
auto
blas
=
phi
::
funcs
::
GetBlas
<
Context
,
T
>
(
dev_ctx
);
auto
blas
=
phi
::
funcs
::
GetBlas
<
C
PUC
ontext
,
T
>
(
dev_ctx
);
DenseTensor
x_grad_indices
=
phi
::
EmptyLike
<
int
>
(
dev_ctx
,
x
.
non_zero_indices
());
phi
::
EmptyLike
<
IntT
>
(
dev_ctx
,
x
.
non_zero_indices
());
DenseTensor
x_grad_values
=
phi
::
EmptyLike
<
T
>
(
dev_ctx
,
x
.
non_zero_elements
());
T
*
x_grad_values_ptr
=
x_grad_values
.
data
<
T
>
();
memset
(
x_grad_values_ptr
,
0
,
sizeof
(
T
)
*
x_grad_values
.
numel
());
memset
(
d_x_features_ptr
,
0
,
sizeof
(
T
)
*
d_x_features
.
numel
());
phi
::
Copy
<
Context
>
(
dev_ctx
,
x
.
non_zero_indices
(),
dev_ctx
.
GetPlace
(),
false
,
&
x_grad_indices
);
phi
::
Copy
<
C
PUC
ontext
>
(
dev_ctx
,
x
.
non_zero_indices
(),
dev_ctx
.
GetPlace
(),
false
,
&
x_grad_indices
);
x_grad
->
SetMember
(
x_grad_indices
,
x_grad_values
,
x
.
dims
(),
true
);
std
::
vector
<
int
>
offsets
(
kernel_size
+
1
),
counter
(
kernel_size
,
0
);
std
::
vector
<
IntT
>
offsets
(
kernel_size
+
1
),
counter
(
kernel_size
,
0
);
for
(
int
i
=
0
;
i
<
rulebook_len
;
i
++
)
{
counter
[
rulebook_ptr
[
i
]]
+=
1
;
}
int
offset
=
0
,
max_count
=
0
;
IntT
offset
=
0
,
max_count
=
0
;
for
(
int
i
=
0
;
i
<
kernel_size
;
i
++
)
{
offsets
[
i
]
=
offset
;
offset
+=
counter
[
i
];
...
...
@@ -102,30 +102,31 @@ void Conv3dGradKernel(const Context& dev_ctx,
offsets
[
kernel_size
]
=
offset
;
if
(
subm
)
{
phi
::
funcs
::
sparse
::
SubmPreProcess
<
T
,
Context
>
(
dev_ctx
,
x
,
kernel
,
out_grad
.
non_zero_elements
(),
in_channels
,
out_channels
,
half_kernel_size
,
kernel_grad
,
&
x_grad_values
);
phi
::
funcs
::
sparse
::
SubmPreProcess
<
T
,
CPUContext
>
(
dev_ctx
,
x
,
kernel
,
out_grad
.
non_zero_elements
(),
in_channels
,
out_channels
,
half_kernel_size
,
kernel_grad
,
&
x_grad_values
);
if
(
max_count
==
0
)
{
return
;
}
}
Gather
<
T
>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
rulebook_len
,
rulebook_len
,
in_channels
,
in_features_ptr
);
Gather
<
T
>
(
out_grad
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
rulebook_len
*
2
,
rulebook_len
,
out_channels
,
out_grad_features_ptr
);
Gather
<
T
,
IntT
>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
rulebook_len
,
rulebook_len
,
in_channels
,
in_features_ptr
);
Gather
<
T
,
IntT
>
(
out_grad
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
rulebook_len
*
2
,
rulebook_len
,
out_channels
,
out_grad_features_ptr
);
const
T
*
kernel_ptr
=
kernel
.
data
<
T
>
();
for
(
int
i
=
0
;
i
<
kernel_size
;
i
++
)
{
...
...
@@ -170,11 +171,41 @@ void Conv3dGradKernel(const Context& dev_ctx,
}
// 4. scatter
Scatter
<
T
>
(
d_x_features_ptr
,
rulebook
.
data
<
int
>
()
+
rulebook_len
,
rulebook_len
,
in_channels
,
x_grad_values_ptr
);
Scatter
<
T
,
IntT
>
(
d_x_features_ptr
,
rulebook
.
data
<
IntT
>
()
+
rulebook_len
,
rulebook_len
,
in_channels
,
x_grad_values_ptr
);
}
template
<
typename
T
,
typename
Context
>
void
Conv3dGradKernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out_grad
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
x_grad
,
DenseTensor
*
kernel_grad
)
{
PD_DISPATCH_INTEGRAL_TYPES
(
x
.
non_zero_indices
().
dtype
(),
"Conv3dGradCPUKernel"
,
([
&
]
{
Conv3dGradCPUKernel
<
T
,
data_t
>
(
dev_ctx
,
x
,
kernel
,
rulebook
,
out_grad
,
paddings
,
dilations
,
strides
,
groups
,
subm
,
x_grad
,
kernel_grad
);
}));
}
}
// namespace sparse
...
...
paddle/phi/kernels/sparse/cpu/convolution_kernel.cc
浏览文件 @
5d3fd4fe
...
...
@@ -17,6 +17,8 @@ limitations under the License. */
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace
phi
{
namespace
sparse
{
...
...
@@ -25,17 +27,17 @@ namespace sparse {
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template
<
typename
T
,
typename
Contex
t
>
void
Conv3d
Kernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
template
<
typename
T
,
typename
IntT
=
in
t
>
void
Conv3d
CPUKernel
(
const
CPU
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
// update padding and dilation
// Currently, only x.layout == NDHWC and groups == 1 are supported
// if x.layout != NDHWC then transpose(x), transpose(weight)
...
...
@@ -66,18 +68,18 @@ void Conv3dKernel(const Context& dev_ctx,
DataType
::
INT32
,
{
kernel_size
},
DataLayout
::
NCHW
);
DenseTensor
counter_per_kernel
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
counter_meta
));
ProductRuleBook
<
T
,
C
ontext
>
(
dev_ctx
,
x
,
kernel_sizes
,
subm_paddings
,
dilations
,
subm_strides
,
out_dims
,
subm
,
rulebook
,
&
counter_per_kernel
);
UpdateRulebookAndOutIndex
<
T
>
(
ProductRuleBook
<
T
,
C
PUContext
,
IntT
>
(
dev_ctx
,
x
,
kernel_sizes
,
subm_paddings
,
dilations
,
subm_strides
,
out_dims
,
subm
,
rulebook
,
&
counter_per_kernel
);
UpdateRulebookAndOutIndex
<
T
,
CPUContext
,
IntT
>
(
dev_ctx
,
x
,
kernel_size
,
out_channels
,
out_dims
,
rulebook
,
out
);
int
n
=
rulebook
->
dims
()[
1
];
...
...
@@ -95,14 +97,14 @@ void Conv3dKernel(const Context& dev_ctx,
T
*
in_features_ptr
=
in_features
.
data
<
T
>
();
T
*
out_features_ptr
=
out_features
.
data
<
T
>
();
Gather
<
T
>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook
->
data
<
int
>
()
+
n
,
n
,
in_channels
,
in_features_ptr
);
Gather
<
T
,
IntT
>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook
->
data
<
IntT
>
()
+
n
,
n
,
in_channels
,
in_features_ptr
);
// 3. call gemm for every weight
auto
blas
=
phi
::
funcs
::
GetBlas
<
Context
,
T
>
(
dev_ctx
);
auto
blas
=
phi
::
funcs
::
GetBlas
<
C
PUC
ontext
,
T
>
(
dev_ctx
);
std
::
vector
<
int
>
offsets
(
kernel_size
+
1
);
int
offset
=
0
;
for
(
int
i
=
0
;
i
<
kernel_size
;
i
++
)
{
...
...
@@ -139,11 +141,37 @@ void Conv3dKernel(const Context& dev_ctx,
// 4. scatter
T
*
out_values_ptr
=
out
->
mutable_non_zero_elements
()
->
data
<
T
>
();
memset
(
out_values_ptr
,
0
,
sizeof
(
T
)
*
out
->
nnz
()
*
out_channels
);
Scatter
<
T
>
(
out_features_ptr
,
rulebook
->
data
<
int
>
()
+
n
*
2
,
n
,
out_channels
,
out_values_ptr
);
Scatter
<
T
,
IntT
>
(
out_features_ptr
,
rulebook
->
data
<
IntT
>
()
+
n
*
2
,
n
,
out_channels
,
out_values_ptr
);
}
template
<
typename
T
,
typename
Context
>
void
Conv3dKernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
PD_DISPATCH_INTEGRAL_TYPES
(
x
.
non_zero_indices
().
dtype
(),
"Conv3dCPUKernel"
,
([
&
]
{
Conv3dCPUKernel
<
T
,
data_t
>
(
dev_ctx
,
x
,
kernel
,
paddings
,
dilations
,
strides
,
groups
,
subm
,
out
,
rulebook
);
}));
}
}
// namespace sparse
...
...
paddle/phi/kernels/sparse/cpu/sparse_pool_grad_kernel.cc
浏览文件 @
5d3fd4fe
...
...
@@ -14,24 +14,28 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace
phi
{
namespace
sparse
{
template
<
typename
T
,
typename
Contex
t
>
void
MaxPoolGrad
Kernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
Dense
Tensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
,
Dense
Tensor
*
x_grad
)
{
template
<
typename
T
,
typename
IntT
=
in
t
>
void
MaxPoolGrad
CPUKernel
(
const
CPU
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
SparseCoo
Tensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
,
SparseCoo
Tensor
*
x_grad
)
{
int
kernel_size
=
kernel_sizes
[
0
]
*
kernel_sizes
[
1
]
*
kernel_sizes
[
2
];
const
int
channels
=
x
.
dims
()[
4
];
int
rulebook_len
=
rulebook
.
dims
()[
1
];
const
int
*
rulebook_ptr
=
rulebook
.
data
<
int
>
();
const
IntT
*
rulebook_ptr
=
rulebook
.
data
<
IntT
>
();
std
::
vector
<
int
>
offsets
(
kernel_size
+
1
),
counter
(
kernel_size
,
0
);
for
(
int
i
=
0
;
i
<
rulebook_len
;
i
++
)
{
counter
[
rulebook_ptr
[
i
]]
+=
1
;
...
...
@@ -40,15 +44,25 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const
T
*
in_features_ptr
=
x
.
non_zero_elements
().
data
<
T
>
();
const
T
*
out_features_ptr
=
out
.
non_zero_elements
().
data
<
T
>
();
const
T
*
out_grad_ptr
=
out_grad
.
data
<
T
>
();
T
*
x_grad_ptr
=
x_grad
->
data
<
T
>
();
const
T
*
out_grad_ptr
=
out_grad
.
non_zero_elements
().
data
<
T
>
();
// TODO(zhangkaihuo): call phi::sparse::EmptyLike
DenseTensor
x_grad_indices
=
phi
::
EmptyLike
<
IntT
>
(
dev_ctx
,
x
.
non_zero_indices
());
DenseTensor
x_grad_values
=
phi
::
EmptyLike
<
T
>
(
dev_ctx
,
x
.
non_zero_elements
());
x_grad
->
SetMember
(
x_grad_indices
,
x_grad_values
,
x
.
dims
(),
true
);
T
*
x_grad_ptr
=
x_grad_values
.
data
<
T
>
();
memset
(
x_grad_ptr
,
0
,
sizeof
(
T
)
*
x_grad
->
numel
());
phi
::
Copy
<
CPUContext
>
(
dev_ctx
,
x
.
non_zero_indices
(),
dev_ctx
.
GetPlace
(),
false
,
&
x_grad_indices
);
phi
::
funcs
::
MaxPoolGrad
<
T
>
grad_functor
;
for
(
int
i
=
0
;
i
<
kernel_size
;
i
++
)
{
for
(
int
j
=
0
;
j
<
counter
[
i
];
j
++
)
{
int
in_i
=
rulebook_ptr
[
rulebook_len
+
offsets
[
i
]
+
j
];
int
out_i
=
rulebook_ptr
[
rulebook_len
*
2
+
offsets
[
i
]
+
j
];
IntT
in_i
=
rulebook_ptr
[
rulebook_len
+
offsets
[
i
]
+
j
];
IntT
out_i
=
rulebook_ptr
[
rulebook_len
*
2
+
offsets
[
i
]
+
j
];
for
(
int
c
=
0
;
c
<
channels
;
c
++
)
{
grad_functor
.
compute
(
in_features_ptr
[
in_i
*
channels
+
c
],
out_features_ptr
[
out_i
*
channels
+
c
],
...
...
@@ -60,6 +74,21 @@ void MaxPoolGradKernel(const Context& dev_ctx,
}
}
template
<
typename
T
,
typename
Context
>
void
MaxPoolGradKernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
SparseCooTensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
,
SparseCooTensor
*
x_grad
)
{
PD_DISPATCH_INTEGRAL_TYPES
(
x
.
non_zero_indices
().
dtype
(),
"MaxPoolGradCPUKernel"
,
([
&
]
{
MaxPoolGradCPUKernel
<
T
,
data_t
>
(
dev_ctx
,
x
,
rulebook
,
out
,
out_grad
,
kernel_sizes
,
x_grad
);
}));
}
}
// namespace sparse
}
// namespace phi
...
...
paddle/phi/kernels/sparse/cpu/sparse_pool_kernel.cc
浏览文件 @
5d3fd4fe
...
...
@@ -19,6 +19,8 @@ limitations under the License. */
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/cpu/convolution.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace
phi
{
namespace
sparse
{
...
...
@@ -27,15 +29,15 @@ namespace sparse {
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template
<
typename
T
,
typename
Contex
t
>
void
MaxPool
Kernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
std
::
vector
<
int
>&
kernel_sizes
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
template
<
typename
T
,
typename
IntT
=
in
t
>
void
MaxPool
CPUKernel
(
const
CPU
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
std
::
vector
<
int
>&
kernel_sizes
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
const
auto
&
x_dims
=
x
.
dims
();
int
kernel_size
=
kernel_sizes
[
0
]
*
kernel_sizes
[
1
]
*
kernel_sizes
[
2
];
const
std
::
vector
<
int
>&
real_kernel_sizes
=
...
...
@@ -51,22 +53,22 @@ void MaxPoolKernel(const Context& dev_ctx,
const
T
*
in_features_ptr
=
x
.
non_zero_elements
().
data
<
T
>
();
// 1. product rule book
ProductRuleBook
<
T
,
C
ontext
>
(
dev_ctx
,
x
,
real_kernel_sizes
,
paddings
,
dilations
,
strides
,
out_dims
,
false
,
rulebook
,
&
counter_per_kernel
);
UpdateRulebookAndOutIndex
<
T
>
(
ProductRuleBook
<
T
,
C
PUContext
,
IntT
>
(
dev_ctx
,
x
,
real_kernel_sizes
,
paddings
,
dilations
,
strides
,
out_dims
,
false
,
rulebook
,
&
counter_per_kernel
);
UpdateRulebookAndOutIndex
<
T
,
CPUContext
,
IntT
>
(
dev_ctx
,
x
,
kernel_size
,
in_channels
,
out_dims
,
rulebook
,
out
);
int
rulebook_len
=
rulebook
->
dims
()[
1
];
const
int
*
rulebook_ptr
=
rulebook
->
data
<
int
>
();
const
IntT
*
rulebook_ptr
=
rulebook
->
data
<
IntT
>
();
const
int
*
counter_ptr
=
counter_per_kernel
.
data
<
int
>
();
std
::
vector
<
int
>
offsets
(
kernel_size
+
1
);
...
...
@@ -78,8 +80,8 @@ void MaxPoolKernel(const Context& dev_ctx,
phi
::
funcs
::
MaxPool
<
T
>
max_pool_functor
;
for
(
int
i
=
0
;
i
<
kernel_size
;
i
++
)
{
for
(
int
j
=
0
;
j
<
counter_ptr
[
i
];
j
++
)
{
int
in_i
=
rulebook_ptr
[
rulebook_len
+
offsets
[
i
]
+
j
];
int
out_i
=
rulebook_ptr
[
rulebook_len
*
2
+
offsets
[
i
]
+
j
];
IntT
in_i
=
rulebook_ptr
[
rulebook_len
+
offsets
[
i
]
+
j
];
IntT
out_i
=
rulebook_ptr
[
rulebook_len
*
2
+
offsets
[
i
]
+
j
];
if
(
!
out_flags
[
out_i
])
{
out_flags
[
out_i
]
=
true
;
memcpy
(
&
out_features_ptr
[
out_i
*
in_channels
],
...
...
@@ -95,6 +97,28 @@ void MaxPoolKernel(const Context& dev_ctx,
}
}
template
<
typename
T
,
typename
Context
>
void
MaxPoolKernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
std
::
vector
<
int
>&
kernel_sizes
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
PD_DISPATCH_INTEGRAL_TYPES
(
x
.
non_zero_indices
().
dtype
(),
"MaxPoolCPUKernel"
,
([
&
]
{
MaxPoolCPUKernel
<
T
,
data_t
>
(
dev_ctx
,
x
,
kernel_sizes
,
paddings
,
dilations
,
strides
,
out
,
rulebook
);
}));
}
}
// namespace sparse
}
// namespace phi
...
...
paddle/phi/kernels/sparse/gpu/convolution.cu.h
浏览文件 @
5d3fd4fe
此差异已折叠。
点击以展开。
paddle/phi/kernels/sparse/gpu/convolution_grad_kernel.cu
浏览文件 @
5d3fd4fe
...
...
@@ -24,6 +24,8 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_grad_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace
phi
{
namespace
sparse
{
...
...
@@ -35,24 +37,24 @@ namespace sparse {
//]
// x_grad = out_grad * transpose(kernel)
// kernel_grad = transpose(x) * out_grad
template
<
typename
T
,
typename
Context
>
void
Conv3dGrad
Kernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out_grad
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
x_grad
,
DenseTensor
*
kernel_grad
)
{
template
<
typename
T
,
typename
IntT
>
void
Conv3dGrad
GPUKernel
(
const
GPU
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out_grad
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
x_grad
,
DenseTensor
*
kernel_grad
)
{
const
auto
&
kernel_dims
=
kernel
.
dims
();
const
int
kernel_size
=
kernel_dims
[
0
]
*
kernel_dims
[
1
]
*
kernel_dims
[
2
];
const
int
in_channels
=
kernel_dims
[
3
];
const
int
out_channels
=
kernel_dims
[
4
];
const
int
*
rulebook_ptr
=
rulebook
.
data
<
int
>
();
const
IntT
*
rulebook_ptr
=
rulebook
.
data
<
IntT
>
();
const
int
rulebook_len
=
rulebook
.
dims
()[
1
];
...
...
@@ -74,29 +76,29 @@ void Conv3dGradKernel(const Context& dev_ctx,
T
*
out_grad_features_ptr
=
out_grad_features
.
data
<
T
>
();
*
kernel_grad
=
phi
::
EmptyLike
<
T
>
(
dev_ctx
,
kernel
);
T
*
d_kernel_ptr
=
kernel_grad
->
data
<
T
>
();
phi
::
funcs
::
SetConstant
<
Context
,
T
>
set_zero
;
phi
::
funcs
::
SetConstant
<
GPU
Context
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
kernel_grad
,
static_cast
<
T
>
(
0.0
f
));
int
half_kernel_size
=
kernel_size
/
2
;
auto
blas
=
phi
::
funcs
::
GetBlas
<
Context
,
T
>
(
dev_ctx
);
auto
blas
=
phi
::
funcs
::
GetBlas
<
GPU
Context
,
T
>
(
dev_ctx
);
DenseTensor
x_grad_indices
=
phi
::
EmptyLike
<
int
>
(
dev_ctx
,
x
.
non_zero_indices
());
phi
::
EmptyLike
<
IntT
>
(
dev_ctx
,
x
.
non_zero_indices
());
DenseTensor
x_grad_values
=
phi
::
EmptyLike
<
T
>
(
dev_ctx
,
x
.
non_zero_elements
());
T
*
x_grad_values_ptr
=
x_grad_values
.
data
<
T
>
();
set_zero
(
dev_ctx
,
&
x_grad_values
,
static_cast
<
T
>
(
0.0
f
));
set_zero
(
dev_ctx
,
&
d_x_features
,
static_cast
<
T
>
(
0.0
f
));
phi
::
Copy
<
Context
>
(
dev_ctx
,
x
.
non_zero_indices
(),
dev_ctx
.
GetPlace
(),
false
,
&
x_grad_indices
);
phi
::
Copy
<
GPU
Context
>
(
dev_ctx
,
x
.
non_zero_indices
(),
dev_ctx
.
GetPlace
(),
false
,
&
x_grad_indices
);
x_grad
->
SetMember
(
x_grad_indices
,
x_grad_values
,
x
.
dims
(),
true
);
std
::
vector
<
int
>
offsets
(
kernel_size
+
1
),
counter
(
kernel_size
,
0
),
std
::
vector
<
IntT
>
offsets
(
kernel_size
+
1
),
counter
(
kernel_size
,
0
),
h_counter
(
rulebook_len
,
0
);
phi
::
backends
::
gpu
::
GpuMemcpyAsync
(
&
h_counter
[
0
],
rulebook_ptr
,
rulebook_len
*
sizeof
(
int
),
rulebook_len
*
sizeof
(
IntT
),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost
,
#else
...
...
@@ -109,7 +111,7 @@ void Conv3dGradKernel(const Context& dev_ctx,
for
(
int
i
=
0
;
i
<
rulebook_len
;
i
++
)
{
counter
[
h_counter
[
i
]]
+=
1
;
}
int
offset
=
0
,
max_count
=
0
;
IntT
offset
=
0
,
max_count
=
0
;
for
(
int
i
=
0
;
i
<
kernel_size
;
i
++
)
{
offsets
[
i
]
=
offset
;
offset
+=
counter
[
i
];
...
...
@@ -120,15 +122,16 @@ void Conv3dGradKernel(const Context& dev_ctx,
offsets
[
kernel_size
]
=
offset
;
if
(
subm
)
{
phi
::
funcs
::
sparse
::
SubmPreProcess
<
T
,
Context
>
(
dev_ctx
,
x
,
kernel
,
out_grad
.
non_zero_elements
(),
in_channels
,
out_channels
,
half_kernel_size
,
kernel_grad
,
&
x_grad_values
);
phi
::
funcs
::
sparse
::
SubmPreProcess
<
T
,
GPUContext
>
(
dev_ctx
,
x
,
kernel
,
out_grad
.
non_zero_elements
(),
in_channels
,
out_channels
,
half_kernel_size
,
kernel_grad
,
&
x_grad_values
);
if
(
max_count
==
0
)
{
return
;
}
...
...
@@ -136,21 +139,21 @@ void Conv3dGradKernel(const Context& dev_ctx,
auto
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
rulebook_len
*
in_channels
,
1
);
GatherKernel
<
T
,
int
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
rulebook_len
,
in_features_ptr
,
rulebook_len
,
in_channels
);
GatherKernel
<
T
,
IntT
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
rulebook_len
,
in_features_ptr
,
rulebook_len
,
in_channels
);
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
rulebook_len
*
out_channels
,
1
);
GatherKernel
<
T
,
int
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
GatherKernel
<
T
,
IntT
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
out_grad
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
rulebook_len
*
2
,
out_grad_features_ptr
,
...
...
@@ -203,15 +206,19 @@ void Conv3dGradKernel(const Context& dev_ctx,
// x_grad->ResizeAndAllocate(x.non_zero_elements().dims());
DenseTensorMeta
index_meta
(
DataType
::
INT32
,
{
rulebook_len
},
DataLayout
::
NCHW
);
DenseTensor
out_index
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
DenseTensor
unique_key
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
DenseTensor
unique_key
=
phi
::
Empty
(
dev_ctx
,
DenseTensorMeta
(
paddle
::
experimental
::
CppTypeToDataType
<
IntT
>::
Type
(),
{
rulebook_len
},
DataLayout
::
NCHW
));
DenseTensor
unique_value
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
SortedAndUniqueIndex
(
dev_ctx
,
rulebook_ptr
+
rulebook_len
,
rulebook_len
,
&
out_index
,
&
unique_key
,
&
unique_value
);
SortedAndUniqueIndex
<
GPUContext
,
IntT
>
(
dev_ctx
,
rulebook_ptr
+
rulebook_len
,
rulebook_len
,
&
out_index
,
&
unique_key
,
&
unique_value
);
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
rulebook_len
*
in_channels
,
1
);
...
...
@@ -229,6 +236,36 @@ void Conv3dGradKernel(const Context& dev_ctx,
subm
);
}
/**
 * Sparse conv3d backward entry point (GPU translation unit).
 *
 * Picks the index type at runtime from the dtype of x.non_zero_indices() and
 * forwards every argument, unchanged and in the same order, to
 * Conv3dGradGPUKernel<T, data_t>. That implementation takes a
 * `const GPUContext&`, so `dev_ctx` must be usable as one.
 *
 * NOTE(review): `data_t` is not declared in this function; it is presumably
 * the type alias that PD_DISPATCH_INTEGRAL_TYPES introduces for the matched
 * integral dtype -- confirm against paddle/phi/api/ext/dispatch.h.
 */
template <typename T, typename Context>
void Conv3dGradKernel(const Context& dev_ctx,
                      const SparseCooTensor& x,
                      const DenseTensor& kernel,
                      const DenseTensor& rulebook,
                      const SparseCooTensor& out_grad,
                      const std::vector<int>& paddings,
                      const std::vector<int>& dilations,
                      const std::vector<int>& strides,
                      const int groups,
                      const bool subm,
                      SparseCooTensor* x_grad,
                      DenseTensor* kernel_grad) {
  // The indices tensor of the forward input decides the instantiation.
  const auto indices_dtype = x.non_zero_indices().dtype();

  // The callable is parenthesised so the comma inside the template argument
  // list is not parsed as a macro-argument separator.
  PD_DISPATCH_INTEGRAL_TYPES(indices_dtype, "Conv3dGradGPUKernel", ([&] {
                               Conv3dGradGPUKernel<T, data_t>(dev_ctx,
                                                              x,
                                                              kernel,
                                                              rulebook,
                                                              out_grad,
                                                              paddings,
                                                              dilations,
                                                              strides,
                                                              groups,
                                                              subm,
                                                              x_grad,
                                                              kernel_grad);
                             }));
}
}
// namespace sparse
}
// namespace phi
...
...
paddle/phi/kernels/sparse/gpu/convolution_kernel.cu
浏览文件 @
5d3fd4fe
...
...
@@ -19,29 +19,25 @@ limitations under the License. */
#include "paddle/phi/kernels/sparse/convolution_kernel.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace
phi
{
namespace
sparse
{
/**
* x: (N, D, H, W, C)
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template
<
typename
T
,
typename
Context
>
void
Conv3dKernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
template
<
typename
T
,
typename
IntT
>
void
Conv3dGPUKernel
(
const
GPUContext
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
kernel
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
const
int
groups
,
const
bool
subm
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
// update padding and dilation
// Currently, only support x.layout is NDHWC, groups = 1
// if x.layout != NDHWC then transpose(x), transpose(weight)
const
auto
&
x_dims
=
x
.
dims
();
const
auto
&
kernel_dims
=
kernel
.
dims
();
int
kernel_size
=
kernel_dims
[
0
]
*
kernel_dims
[
1
]
*
kernel_dims
[
2
];
...
...
@@ -67,7 +63,6 @@ void Conv3dKernel(const Context& dev_ctx,
DenseTensor
offsets_per_kernel
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
offsets_meta
));
DenseTensorMeta
index_meta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
);
DenseTensor
out_index
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
DenseTensor
unique_key
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
DenseTensor
unique_value
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
std
::
vector
<
int
>
subm_paddings
(
paddings
),
subm_strides
(
strides
);
...
...
@@ -75,28 +70,26 @@ void Conv3dKernel(const Context& dev_ctx,
phi
::
funcs
::
sparse
::
ResetSubmKernelSizeAndStrides
(
kernel
.
dims
(),
&
subm_paddings
,
&
subm_strides
);
}
int
n
=
ProductRuleBook
<
T
,
Context
>
(
dev_ctx
,
x
,
kernel_sizes
,
subm_paddings
,
dilations
,
subm_strides
,
out_dims
,
subm
,
rulebook
,
&
counter_per_kernel
,
&
offsets_per_kernel
,
&
out_index
,
&
unique_key
,
&
unique_value
,
out
,
&
h_counter
,
&
offsets
);
int
n
=
ProductRuleBook
<
T
,
GPUContext
,
IntT
>
(
dev_ctx
,
x
,
kernel_sizes
,
subm_paddings
,
dilations
,
subm_strides
,
out_dims
,
subm
,
rulebook
,
&
counter_per_kernel
,
&
offsets_per_kernel
,
&
out_index
,
&
unique_value
,
out
,
&
h_counter
,
&
offsets
);
const
int
*
counter_ptr
=
counter_per_kernel
.
data
<
int
>
();
const
int
*
offsets_ptr
=
counter_per_kernel
.
data
<
int
>
();
const
int
*
rulebook_ptr
=
rulebook
->
data
<
int
>
();
const
IntT
*
rulebook_ptr
=
rulebook
->
data
<
IntT
>
();
// 2. gather
DenseTensorMeta
in_features_meta
(
...
...
@@ -109,22 +102,22 @@ void Conv3dKernel(const Context& dev_ctx,
phi
::
Empty
(
dev_ctx
,
std
::
move
(
out_features_meta
));
T
*
in_features_ptr
=
in_features
.
data
<
T
>
();
T
*
out_features_ptr
=
out_features
.
data
<
T
>
();
phi
::
funcs
::
SetConstant
<
Context
,
T
>
set_zero
;
phi
::
funcs
::
SetConstant
<
GPU
Context
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
&
out_features
,
static_cast
<
T
>
(
0.0
f
));
auto
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
n
*
in_channels
,
1
);
GatherKernel
<
T
,
int
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
n
,
in_features_ptr
,
n
,
in_channels
);
GatherKernel
<
T
,
IntT
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
x
.
non_zero_elements
().
data
<
T
>
(),
rulebook_ptr
+
n
,
in_features_ptr
,
n
,
in_channels
);
// 3. call gemm for every weight
auto
blas
=
phi
::
funcs
::
GetBlas
<
Context
,
T
>
(
dev_ctx
);
auto
blas
=
phi
::
funcs
::
GetBlas
<
GPU
Context
,
T
>
(
dev_ctx
);
auto
*
out_values
=
out
->
mutable_non_zero_elements
();
T
*
out_values_ptr
=
out_values
->
data
<
T
>
();
...
...
@@ -168,6 +161,36 @@ void Conv3dKernel(const Context& dev_ctx,
out_channels
,
out_values_ptr
);
}
/**
 * Sparse conv3d forward entry point (GPU translation unit).
 *
 * Tensor layouts, as stated by the original author:
 *   x:      (N, D, H, W, C)
 *   kernel: (D, H, W, C, OC)
 *   out:    (N, D, H, W, OC)
 *
 * Picks the index type at runtime from the dtype of x.non_zero_indices() and
 * forwards every argument, unchanged and in the same order, to
 * Conv3dGPUKernel<T, data_t>. That implementation takes a
 * `const GPUContext&`, so `dev_ctx` must be usable as one.
 *
 * NOTE(review): `data_t` is not declared in this function; it is presumably
 * the type alias that PD_DISPATCH_INTEGRAL_TYPES introduces for the matched
 * integral dtype -- confirm against paddle/phi/api/ext/dispatch.h.
 **/
template <typename T, typename Context>
void Conv3dKernel(const Context& dev_ctx,
                  const SparseCooTensor& x,
                  const DenseTensor& kernel,
                  const std::vector<int>& paddings,
                  const std::vector<int>& dilations,
                  const std::vector<int>& strides,
                  const int groups,
                  const bool subm,
                  SparseCooTensor* out,
                  DenseTensor* rulebook) {
  // The indices tensor, not the values tensor, decides the instantiation.
  const auto indices_dtype = x.non_zero_indices().dtype();

  // The callable is parenthesised so the comma inside the template argument
  // list is not parsed as a macro-argument separator.
  PD_DISPATCH_INTEGRAL_TYPES(indices_dtype, "Conv3dGPUKernel", ([&] {
                               Conv3dGPUKernel<T, data_t>(dev_ctx,
                                                          x,
                                                          kernel,
                                                          paddings,
                                                          dilations,
                                                          strides,
                                                          groups,
                                                          subm,
                                                          out,
                                                          rulebook);
                             }));
}
}
// namespace sparse
}
// namespace phi
...
...
paddle/phi/kernels/sparse/gpu/sparse_pool_grad_kernel.cu
浏览文件 @
5d3fd4fe
...
...
@@ -12,24 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_info.h"
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/
kernels/sparse/sparse_pool_grad_kernel
.h"
#include "paddle/phi/
api/ext/dispatch
.h"
namespace
phi
{
namespace
sparse
{
template
<
typename
T
>
template
<
typename
T
,
typename
IntT
=
int
>
__global__
void
MaxPoolGradCudaKernel
(
const
T
*
in_features_ptr
,
const
T
*
out_features_ptr
,
const
T
*
out_grad_ptr
,
const
int
*
rulebook_ptr
,
const
IntT
*
rulebook_ptr
,
const
int
n
,
const
int
rulebook_len
,
const
int
channels
,
...
...
@@ -38,8 +42,8 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
CUDA_KERNEL_LOOP_TYPE
(
i
,
n
*
channels
,
int64_t
)
{
int
real_i
=
i
/
channels
;
int
c
=
i
-
real_i
*
channels
;
int
in_i
=
rulebook_ptr
[
real_i
];
int
out_i
=
rulebook_ptr
[
real_i
+
rulebook_len
];
IntT
in_i
=
rulebook_ptr
[
real_i
];
IntT
out_i
=
rulebook_ptr
[
real_i
+
rulebook_len
];
grad_functor
.
compute
(
in_features_ptr
[
in_i
*
channels
+
c
],
out_features_ptr
[
out_i
*
channels
+
c
],
out_grad_ptr
[
out_i
*
channels
+
c
],
...
...
@@ -48,23 +52,23 @@ __global__ void MaxPoolGradCudaKernel(const T* in_features_ptr,
}
}
template
<
typename
T
,
typename
Contex
t
>
void
MaxPoolGrad
Kernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
Dense
Tensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
,
Dense
Tensor
*
x_grad
)
{
template
<
typename
T
,
typename
IntT
=
in
t
>
void
MaxPoolGrad
GPUKernel
(
const
GPU
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
SparseCoo
Tensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
,
SparseCoo
Tensor
*
x_grad
)
{
int
kernel_size
=
kernel_sizes
[
0
]
*
kernel_sizes
[
1
]
*
kernel_sizes
[
2
];
const
int
in_channels
=
x
.
dims
()[
4
];
int
rulebook_len
=
rulebook
.
dims
()[
1
];
const
int
*
rulebook_ptr
=
rulebook
.
data
<
int
>
();
std
::
vector
<
int
>
offsets
(
kernel_size
+
1
),
counter
(
kernel_size
,
0
),
const
IntT
*
rulebook_ptr
=
rulebook
.
data
<
IntT
>
();
std
::
vector
<
IntT
>
offsets
(
kernel_size
+
1
),
counter
(
kernel_size
,
0
),
h_counter
(
kernel_size
);
phi
::
backends
::
gpu
::
GpuMemcpyAsync
(
&
h_counter
[
0
],
rulebook_ptr
,
rulebook_len
*
sizeof
(
int
),
rulebook_len
*
sizeof
(
IntT
),
#ifdef PADDLE_WITH_HIP
hipMemcpyDeviceToHost
,
#else
...
...
@@ -80,10 +84,20 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const
T
*
in_features_ptr
=
x
.
non_zero_elements
().
data
<
T
>
();
const
T
*
out_features_ptr
=
out
.
non_zero_elements
().
data
<
T
>
();
const
T
*
out_grad_ptr
=
out_grad
.
data
<
T
>
();
T
*
x_grad_ptr
=
x_grad
->
data
<
T
>
();
phi
::
funcs
::
SetConstant
<
Context
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
x_grad
,
static_cast
<
T
>
(
0.0
f
));
const
T
*
out_grad_ptr
=
out_grad
.
non_zero_elements
().
data
<
T
>
();
// TODO(zhangkaihuo): call phi::sparse::EmptyLike
DenseTensor
x_grad_indices
=
phi
::
EmptyLike
<
IntT
>
(
dev_ctx
,
x
.
non_zero_indices
());
DenseTensor
x_grad_values
=
phi
::
EmptyLike
<
T
>
(
dev_ctx
,
x
.
non_zero_elements
());
x_grad
->
SetMember
(
x_grad_indices
,
x_grad_values
,
x
.
dims
(),
true
);
T
*
x_grad_ptr
=
x_grad_values
.
data
<
T
>
();
phi
::
funcs
::
SetConstant
<
GPUContext
,
T
>
set_zero
;
set_zero
(
dev_ctx
,
&
x_grad_values
,
static_cast
<
T
>
(
0.0
f
));
phi
::
Copy
<
GPUContext
>
(
dev_ctx
,
x
.
non_zero_indices
(),
dev_ctx
.
GetPlace
(),
false
,
&
x_grad_indices
);
for
(
int
i
=
0
;
i
<
kernel_size
;
i
++
)
{
if
(
counter
[
i
]
<=
0
)
{
...
...
@@ -92,10 +106,10 @@ void MaxPoolGradKernel(const Context& dev_ctx,
auto
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
counter
[
i
]
*
in_channels
,
1
);
MaxPoolGradCudaKernel
<
T
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
MaxPoolGradCudaKernel
<
T
,
IntT
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
in_features_ptr
,
out_features_ptr
,
out_grad_ptr
,
...
...
@@ -107,6 +121,21 @@ void MaxPoolGradKernel(const Context& dev_ctx,
}
}
/**
 * Sparse max-pool backward entry point (GPU translation unit).
 *
 * Picks the index type at runtime from the dtype of x.non_zero_indices() and
 * forwards every argument, unchanged and in the same order, to
 * MaxPoolGradGPUKernel<T, data_t>. That implementation takes a
 * `const GPUContext&`, so `dev_ctx` must be usable as one.
 *
 * NOTE(review): `data_t` is not declared in this function; it is presumably
 * the type alias that PD_DISPATCH_INTEGRAL_TYPES introduces for the matched
 * integral dtype -- confirm against paddle/phi/api/ext/dispatch.h.
 */
template <typename T, typename Context>
void MaxPoolGradKernel(const Context& dev_ctx,
                       const SparseCooTensor& x,
                       const DenseTensor& rulebook,
                       const SparseCooTensor& out,
                       const SparseCooTensor& out_grad,
                       const std::vector<int>& kernel_sizes,
                       SparseCooTensor* x_grad) {
  // The indices tensor of the forward input decides the instantiation.
  const auto indices_dtype = x.non_zero_indices().dtype();

  // The callable is parenthesised so the comma inside the template argument
  // list is not parsed as a macro-argument separator.
  PD_DISPATCH_INTEGRAL_TYPES(indices_dtype, "MaxPoolGradGPUKernel", ([&] {
                               MaxPoolGradGPUKernel<T, data_t>(dev_ctx,
                                                               x,
                                                               rulebook,
                                                               out,
                                                               out_grad,
                                                               kernel_sizes,
                                                               x_grad);
                             }));
}
}
// namespace sparse
}
// namespace phi
...
...
paddle/phi/kernels/sparse/gpu/sparse_pool_kernel.cu
浏览文件 @
5d3fd4fe
...
...
@@ -12,19 +12,22 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_meta.h"
#include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/sparse/convolution.h"
#include "paddle/phi/kernels/sparse/gpu/convolution.cu.h"
#include "paddle/phi/kernels/sparse/sparse_pool_kernel.h"
#include "paddle/phi/api/ext/dispatch.h"
namespace
phi
{
namespace
sparse
{
template
<
typename
T
>
template
<
typename
T
,
typename
IntT
=
int
>
__global__
void
MaxPoolCudaKernel
(
const
T
*
in_features_ptr
,
const
int
*
rulebook_ptr
,
const
IntT
*
rulebook_ptr
,
const
int
n
,
const
int
rulebook_len
,
const
int
channels
,
...
...
@@ -33,8 +36,8 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
CUDA_KERNEL_LOOP_TYPE
(
i
,
n
*
channels
,
int64_t
)
{
int
real_i
=
i
/
channels
;
int
channel_i
=
i
-
real_i
*
channels
;
int
in_i
=
rulebook_ptr
[
real_i
];
int
out_i
=
rulebook_ptr
[
real_i
+
rulebook_len
];
IntT
in_i
=
rulebook_ptr
[
real_i
];
IntT
out_i
=
rulebook_ptr
[
real_i
+
rulebook_len
];
max_pool_functor
.
compute
(
in_features_ptr
[
in_i
*
channels
+
channel_i
],
&
out_features_ptr
[
out_i
*
channels
+
channel_i
]);
}
...
...
@@ -45,15 +48,15 @@ __global__ void MaxPoolCudaKernel(const T* in_features_ptr,
* kernel: (D, H, W, C, OC)
* out: (N, D, H, W, OC)
**/
template
<
typename
T
,
typename
Contex
t
>
void
MaxPool
Kernel
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
std
::
vector
<
int
>&
kernel_sizes
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
template
<
typename
T
,
typename
IntT
=
in
t
>
void
MaxPool
GPUKernel
(
const
GPU
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
std
::
vector
<
int
>&
kernel_sizes
,
const
std
::
vector
<
int
>&
paddings
,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
SparseCooTensor
*
out
,
DenseTensor
*
rulebook
)
{
const
auto
&
x_dims
=
x
.
dims
();
int
kernel_size
=
kernel_sizes
[
0
]
*
kernel_sizes
[
1
]
*
kernel_sizes
[
2
];
const
std
::
vector
<
int
>&
real_kernel_sizes
=
...
...
@@ -70,29 +73,27 @@ void MaxPoolKernel(const Context& dev_ctx,
DenseTensor
offsets_per_kernel
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
counter_meta
));
DenseTensorMeta
index_meta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
);
DenseTensor
out_index
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
DenseTensor
unique_key
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
DenseTensor
unique_value
=
phi
::
Empty
(
dev_ctx
,
std
::
move
(
index_meta
));
// 1. product rulebook
int
rulebook_len
=
ProductRuleBook
<
T
,
Context
>
(
dev_ctx
,
x
,
real_kernel_sizes
,
paddings
,
dilations
,
strides
,
out_dims
,
false
,
rulebook
,
&
counter_per_kernel
,
&
offsets_per_kernel
,
&
out_index
,
&
unique_key
,
&
unique_value
,
out
,
&
counter
,
&
offsets
);
const
int
*
rulebook_ptr
=
rulebook
->
data
<
int
>
();
int
rulebook_len
=
ProductRuleBook
<
T
,
GPUContext
,
IntT
>
(
dev_ctx
,
x
,
real_kernel_sizes
,
paddings
,
dilations
,
strides
,
out_dims
,
false
,
rulebook
,
&
counter_per_kernel
,
&
offsets_per_kernel
,
&
out_index
,
&
unique_value
,
out
,
&
counter
,
&
offsets
);
const
IntT
*
rulebook_ptr
=
rulebook
->
data
<
IntT
>
();
T
*
out_features_ptr
=
out
->
mutable_non_zero_elements
()
->
data
<
T
>
();
const
T
*
in_features_ptr
=
x
.
non_zero_elements
().
data
<
T
>
();
...
...
@@ -113,10 +114,10 @@ void MaxPoolKernel(const Context& dev_ctx,
auto
config
=
phi
::
backends
::
gpu
::
GetGpuLaunchConfig1D
(
dev_ctx
,
counter
[
i
]
*
in_channels
,
1
);
MaxPoolCudaKernel
<
T
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
MaxPoolCudaKernel
<
T
,
IntT
><<<
config
.
block_per_grid
.
x
,
config
.
thread_per_block
.
x
,
0
,
dev_ctx
.
stream
()
>>>
(
in_features_ptr
,
rulebook_ptr
+
offsets
[
i
]
+
rulebook_len
,
counter
[
i
],
...
...
@@ -126,6 +127,28 @@ void MaxPoolKernel(const Context& dev_ctx,
}
}
/**
 * Sparse max-pool forward entry point (GPU translation unit).
 *
 * Picks the index type at runtime from the dtype of x.non_zero_indices() and
 * forwards every argument, unchanged and in the same order, to
 * MaxPoolGPUKernel<T, data_t>. That implementation takes a
 * `const GPUContext&`, so `dev_ctx` must be usable as one.
 *
 * NOTE(review): `data_t` is not declared in this function; it is presumably
 * the type alias that PD_DISPATCH_INTEGRAL_TYPES introduces for the matched
 * integral dtype -- confirm against paddle/phi/api/ext/dispatch.h.
 */
template <typename T, typename Context>
void MaxPoolKernel(const Context& dev_ctx,
                   const SparseCooTensor& x,
                   const std::vector<int>& kernel_sizes,
                   const std::vector<int>& paddings,
                   const std::vector<int>& dilations,
                   const std::vector<int>& strides,
                   SparseCooTensor* out,
                   DenseTensor* rulebook) {
  // The indices tensor, not the values tensor, decides the instantiation.
  const auto indices_dtype = x.non_zero_indices().dtype();

  // The callable is parenthesised so the comma inside the template argument
  // list is not parsed as a macro-argument separator.
  PD_DISPATCH_INTEGRAL_TYPES(indices_dtype, "MaxPoolGPUKernel", ([&] {
                               MaxPoolGPUKernel<T, data_t>(dev_ctx,
                                                           x,
                                                           kernel_sizes,
                                                           paddings,
                                                           dilations,
                                                           strides,
                                                           out,
                                                           rulebook);
                             }));
}
}
// namespace sparse
}
// namespace phi
...
...
paddle/phi/kernels/sparse/sparse_pool_grad_kernel.h
浏览文件 @
5d3fd4fe
...
...
@@ -26,20 +26,18 @@ void MaxPoolGradKernel(const Context& dev_ctx,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
Dense
Tensor
&
out_grad
,
const
SparseCoo
Tensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
,
Dense
Tensor
*
x_grad
);
SparseCoo
Tensor
*
x_grad
);
template
<
typename
T
,
typename
Context
>
DenseTensor
MaxPoolGrad
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
DenseTensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
)
{
DenseTensor
x_grad
=
phi
::
Empty
<
Context
>
(
dev_ctx
,
DenseTensorMeta
(
x
.
dtype
(),
x
.
non_zero_elements
().
dims
(),
x
.
layout
()));
SparseCooTensor
MaxPoolGrad
(
const
Context
&
dev_ctx
,
const
SparseCooTensor
&
x
,
const
DenseTensor
&
rulebook
,
const
SparseCooTensor
&
out
,
const
SparseCooTensor
&
out_grad
,
const
std
::
vector
<
int
>&
kernel_sizes
)
{
SparseCooTensor
x_grad
;
MaxPoolGradKernel
<
T
,
Context
>
(
dev_ctx
,
x
,
rulebook
,
out
,
out_grad
,
kernel_sizes
,
&
x_grad
);
return
x_grad
;
...
...
paddle/phi/kernels/sparse/sparse_pool_kernel.h
浏览文件 @
5d3fd4fe
...
...
@@ -39,11 +39,7 @@ SparseCooTensor MaxPool(const Context& dev_ctx,
const
std
::
vector
<
int
>&
dilations
,
const
std
::
vector
<
int
>&
strides
,
DenseTensor
*
rulebook
)
{
DenseTensor
indices
=
phi
::
Empty
<
Context
>
(
dev_ctx
,
DenseTensorMeta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
));
DenseTensor
values
=
phi
::
Empty
<
Context
>
(
dev_ctx
,
DenseTensorMeta
(
x
.
dtype
(),
{
1
},
x
.
layout
()));
SparseCooTensor
coo
(
indices
,
values
,
x
.
dims
());
SparseCooTensor
coo
;
MaxPoolKernel
<
T
,
Context
>
(
dev_ctx
,
x
,
kernel_sizes
,
paddings
,
dilations
,
strides
,
&
coo
,
rulebook
);
return
coo
;
...
...
paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc
浏览文件 @
5d3fd4fe
...
...
@@ -48,13 +48,13 @@ std::vector<T2> cast(const std::vector<T1>& in) {
return
out
;
}
template
<
typename
T
>
void
TestConv3dBase
(
const
std
::
vector
<
int
>&
indices
,
template
<
typename
T
,
typename
IntT
=
int
>
void
TestConv3dBase
(
const
std
::
vector
<
IntT
>&
indices
,
const
std
::
vector
<
T
>&
features
,
const
DDim
&
x_dims
,
const
std
::
vector
<
T
>&
kernel
,
const
DDim
&
kernel_dims
,
const
std
::
vector
<
int
>&
correct_out_indices
,
const
std
::
vector
<
IntT
>&
correct_out_indices
,
const
std
::
vector
<
T
>&
correct_out_features
,
const
DDim
&
correct_out_dims
,
const
int
non_zero_num
,
...
...
@@ -80,11 +80,13 @@ void TestConv3dBase(const std::vector<int>& indices,
const
int
in_channels
=
kernel_dims
[
3
];
const
int
out_channels
=
kernel_dims
[
4
];
auto
indices_dtype
=
paddle
::
experimental
::
CppTypeToDataType
<
IntT
>::
Type
();
DenseTensor
indices_tensor
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
memcpy
(
indices_tensor
.
data
<
int
>
(),
indices
.
data
(),
indices
.
size
()
*
sizeof
(
int
));
DenseTensorMeta
(
indices_dtype
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
memcpy
(
indices_tensor
.
data
<
IntT
>
(),
indices
.
data
(),
indices
.
size
()
*
sizeof
(
IntT
));
DenseTensor
features_tensor
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
paddle
::
experimental
::
CppTypeToDataType
<
T
>::
Type
(),
...
...
@@ -111,7 +113,7 @@ void TestConv3dBase(const std::vector<int>& indices,
if
(
!
std
::
is_same
<
T
,
phi
::
dtype
::
float16
>::
value
)
{
DenseTensor
rulebook
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
));
dev_ctx_cpu
,
DenseTensorMeta
(
indices_dtype
,
{
1
},
DataLayout
::
NCHW
));
SparseCooTensor
out
=
sparse
::
Conv3d
<
T
>
(
dev_ctx_cpu
,
x_tensor
,
kernel_tensor
,
...
...
@@ -129,8 +131,8 @@ void TestConv3dBase(const std::vector<int>& indices,
ASSERT_EQ
((
int64_t
)
correct_out_features
.
size
()
/
out_channels
,
out
.
nnz
());
int
cmp_indices
=
memcmp
(
correct_out_indices
.
data
(),
out
.
non_zero_indices
().
data
<
int
>
(),
correct_out_indices
.
size
()
*
sizeof
(
int
));
out
.
non_zero_indices
().
data
<
IntT
>
(),
correct_out_indices
.
size
()
*
sizeof
(
IntT
));
ASSERT_EQ
(
cmp_indices
,
0
);
f_verify
(
out
.
non_zero_elements
().
data
<
T
>
(),
correct_out_features
);
...
...
@@ -172,7 +174,7 @@ void TestConv3dBase(const std::vector<int>& indices,
DenseTensor
d_indices_tensor
=
phi
::
Empty
(
dev_ctx_gpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
DenseTensorMeta
(
indices_dtype
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
phi
::
Copy
(
dev_ctx_gpu
,
indices_tensor
,
phi
::
GPUPlace
(),
true
,
&
d_indices_tensor
);
...
...
@@ -195,7 +197,7 @@ void TestConv3dBase(const std::vector<int>& indices,
dev_ctx_gpu
,
kernel_tensor
,
phi
::
GPUPlace
(),
true
,
&
d_kernel_tensor
);
DenseTensor
d_rulebook
=
phi
::
Empty
(
dev_ctx_gpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
));
dev_ctx_gpu
,
DenseTensorMeta
(
indices_dtype
,
{
1
},
DataLayout
::
NCHW
));
SparseCooTensor
d_out
=
sparse
::
Conv3d
<
T
>
(
dev_ctx_gpu
,
d_x_tensor
,
d_kernel_tensor
,
...
...
@@ -214,7 +216,7 @@ void TestConv3dBase(const std::vector<int>& indices,
DenseTensor
h_indices_tensor
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
4
,
d_out
.
nnz
()},
DataLayout
::
NCHW
));
DenseTensorMeta
(
indices_dtype
,
{
4
,
d_out
.
nnz
()},
DataLayout
::
NCHW
));
phi
::
Copy
(
dev_ctx_gpu
,
d_out
.
non_zero_indices
(),
phi
::
CPUPlace
(),
...
...
@@ -222,8 +224,8 @@ void TestConv3dBase(const std::vector<int>& indices,
&
h_indices_tensor
);
int
cmp_indices2
=
memcmp
(
correct_out_indices
.
data
(),
h_indices_tensor
.
data
<
int
>
(),
correct_out_indices
.
size
()
*
sizeof
(
int
));
h_indices_tensor
.
data
<
IntT
>
(),
correct_out_indices
.
size
()
*
sizeof
(
IntT
));
ASSERT_EQ
(
cmp_indices2
,
0
);
DenseTensor
h_features_tensor
=
...
...
@@ -264,12 +266,13 @@ void TestConv3dBase(const std::vector<int>& indices,
#endif
}
void
TestConv3d
(
const
std
::
vector
<
int
>&
indices
,
template
<
typename
IntT
=
int
>
void
TestConv3d
(
const
std
::
vector
<
IntT
>&
indices
,
const
std
::
vector
<
float
>&
features
,
const
DDim
&
x_dims
,
const
std
::
vector
<
float
>&
kernel
,
const
DDim
&
kernel_dims
,
const
std
::
vector
<
int
>&
correct_out_indices
,
const
std
::
vector
<
IntT
>&
correct_out_indices
,
const
std
::
vector
<
float
>&
correct_out_features
,
const
DDim
&
correct_out_dims
,
const
int
non_zero_num
,
...
...
@@ -282,41 +285,41 @@ void TestConv3d(const std::vector<int>& indices,
const
std
::
vector
<
float
>
kernel_grad
=
{},
const
bool
subm
=
false
)
{
// test float
TestConv3dBase
<
float
>
(
indices
,
features
,
x_dims
,
kernel
,
kernel_dims
,
correct_out_indices
,
correct_out_features
,
correct_out_dims
,
non_zero_num
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
features_grad
,
kernel_grad
,
subm
);
TestConv3dBase
<
float
,
IntT
>
(
indices
,
features
,
x_dims
,
kernel
,
kernel_dims
,
correct_out_indices
,
correct_out_features
,
correct_out_dims
,
non_zero_num
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
features_grad
,
kernel_grad
,
subm
);
// test double
TestConv3dBase
<
double
>
(
indices
,
cast
<
float
,
double
>
(
features
),
x_dims
,
cast
<
float
,
double
>
(
kernel
),
kernel_dims
,
correct_out_indices
,
cast
<
float
,
double
>
(
correct_out_features
),
correct_out_dims
,
non_zero_num
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
cast
<
float
,
double
>
(
features_grad
),
cast
<
float
,
double
>
(
kernel_grad
),
subm
);
TestConv3dBase
<
double
,
IntT
>
(
indices
,
cast
<
float
,
double
>
(
features
),
x_dims
,
cast
<
float
,
double
>
(
kernel
),
kernel_dims
,
correct_out_indices
,
cast
<
float
,
double
>
(
correct_out_features
),
correct_out_dims
,
non_zero_num
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
cast
<
float
,
double
>
(
features_grad
),
cast
<
float
,
double
>
(
kernel_grad
),
subm
);
}
TEST
(
DEV_API
,
sparse_conv3d
)
{
...
...
@@ -616,6 +619,51 @@ TEST(DEV_API, sparse_conv2d) {
dilations
);
}
// Runs the shared conv3d test driver with int64_t indices instead of the
// default int. The "2d" case is expressed as a 3-D problem whose depth
// extent is 1 in the input, the kernel and the output.
TEST(DEV_API, sparse_conv2d_int64) {
  const int in_channels = 1;
  const int out_channels = 1;

  // Shapes: input, kernel and expected output.
  DDim x_dims = {1, 1, 5, 5, in_channels};
  DDim kernel_dims = {1, 3, 3, in_channels, out_channels};
  DDim out_dims = {1, 1, 3, 3, out_channels};

  // Convolution hyper-parameters, one entry per spatial axis.
  std::vector<int> paddings = {0, 0, 0};
  std::vector<int> strides = {1, 1, 1};
  std::vector<int> dilations = {1, 1, 1};

  // Input: 3 non-zero entries, so 4 * 3 = 12 flattened 64-bit index values.
  const int non_zero_num = 3;
  std::vector<int64_t> indices_flatten = {0, 0, 0, 0, 0, 0, 0, 4, 0, 3, 2, 4};
  std::vector<float> features = {-0.79394531, -0.3125, -0.55029297};

  // Kernel weights: 1*3*3 spatial taps with 1 input and 1 output channel,
  // i.e. 9 values.
  std::vector<float> kernel = {0.65820312,
                               0.75048828,
                               0.21411133,
                               0.17370605,
                               0.85546875,
                               0.53076172,
                               0.28833008,
                               0.71044922,
                               0.00659943};

  // Expected output: 5 non-zero entries, so 20 flattened index values.
  std::vector<int64_t> out_indices_flatten = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                              0, 0, 2, 2, 2, 1, 2, 0, 1, 2};
  std::vector<float> out_features = {
      -0.17004, -0.71338, -0.00206, -0.22205, -0.09009};

  TestConv3d<int64_t>(indices_flatten,
                      features,
                      x_dims,
                      kernel,
                      kernel_dims,
                      out_indices_flatten,
                      out_features,
                      out_dims,
                      non_zero_num,
                      paddings,
                      strides,
                      dilations);
}
TEST
(
DEV_API
,
sparse_conv3d_backward
)
{
const
int
in_channels
=
1
;
const
int
out_channels
=
1
;
...
...
paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc
浏览文件 @
5d3fd4fe
...
...
@@ -36,11 +36,11 @@ std::vector<T2> cast(const std::vector<T1>& in) {
}
return
out
;
}
template
<
typename
T
>
void
TestMaxPoolBase
(
const
std
::
vector
<
int
>&
indices
,
template
<
typename
T
,
typename
IntT
=
int
>
void
TestMaxPoolBase
(
const
std
::
vector
<
IntT
>&
indices
,
const
std
::
vector
<
T
>&
features
,
const
DDim
&
x_dims
,
const
std
::
vector
<
int
>&
correct_out_indices
,
const
std
::
vector
<
IntT
>&
correct_out_indices
,
const
std
::
vector
<
T
>&
correct_out_features
,
const
DDim
&
correct_out_dims
,
const
int
non_zero_num
,
...
...
@@ -65,11 +65,13 @@ void TestMaxPoolBase(const std::vector<int>& indices,
const
int
in_channels
=
x_dims
[
4
];
const
int
out_channels
=
in_channels
;
auto
indices_dtype
=
paddle
::
experimental
::
CppTypeToDataType
<
IntT
>::
Type
();
DenseTensor
indices_tensor
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
memcpy
(
indices_tensor
.
data
<
int
>
(),
indices
.
data
(),
indices
.
size
()
*
sizeof
(
int
));
DenseTensorMeta
(
indices_dtype
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
memcpy
(
indices_tensor
.
data
<
IntT
>
(),
indices
.
data
(),
indices
.
size
()
*
sizeof
(
IntT
));
DenseTensor
features_tensor
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
paddle
::
experimental
::
CppTypeToDataType
<
T
>::
Type
(),
...
...
@@ -88,8 +90,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
};
if
(
!
std
::
is_same
<
T
,
phi
::
dtype
::
float16
>::
value
)
{
DenseTensor
rulebook
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
));
DenseTensor
rulebook
;
SparseCooTensor
out
=
sparse
::
MaxPool
<
T
>
(
dev_ctx_cpu
,
x_tensor
,
kernel_sizes
,
...
...
@@ -105,20 +106,16 @@ void TestMaxPoolBase(const std::vector<int>& indices,
ASSERT_EQ
((
int64_t
)
correct_out_features
.
size
()
/
out_channels
,
out
.
nnz
());
int
cmp_indices
=
memcmp
(
correct_out_indices
.
data
(),
out
.
non_zero_indices
().
data
<
int
>
(),
correct_out_indices
.
size
()
*
sizeof
(
int
));
out
.
non_zero_indices
().
data
<
IntT
>
(),
correct_out_indices
.
size
()
*
sizeof
(
IntT
));
ASSERT_EQ
(
cmp_indices
,
0
);
f_verify
(
out
.
non_zero_elements
().
data
<
T
>
(),
correct_out_features
);
if
(
backward
)
{
DenseTensor
x_grad
=
sparse
::
MaxPoolGrad
<
T
>
(
dev_ctx_cpu
,
x_tensor
,
rulebook
,
out
,
out
.
non_zero_elements
(),
kernel_sizes
);
f_verify
(
x_grad
.
data
<
T
>
(),
features_grad
);
SparseCooTensor
x_grad
=
sparse
::
MaxPoolGrad
<
T
>
(
dev_ctx_cpu
,
x_tensor
,
rulebook
,
out
,
out
,
kernel_sizes
);
f_verify
(
x_grad
.
non_zero_elements
().
data
<
T
>
(),
features_grad
);
}
}
...
...
@@ -142,7 +139,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
DenseTensor
d_indices_tensor
=
phi
::
Empty
(
dev_ctx_gpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
DenseTensorMeta
(
indices_dtype
,
{
4
,
non_zero_num
},
DataLayout
::
NCHW
));
phi
::
Copy
(
dev_ctx_gpu
,
indices_tensor
,
phi
::
GPUPlace
(),
true
,
&
d_indices_tensor
);
...
...
@@ -153,8 +150,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
SparseCooTensor
d_x_tensor
(
d_indices_tensor
,
d_features_tensor
,
x_dims
);
DenseTensor
d_rulebook
=
phi
::
Empty
(
dev_ctx_gpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
1
},
DataLayout
::
NCHW
));
DenseTensor
d_rulebook
;
SparseCooTensor
d_out
=
sparse
::
MaxPool
<
T
>
(
dev_ctx_gpu
,
d_x_tensor
,
kernel_sizes
,
...
...
@@ -171,7 +167,7 @@ void TestMaxPoolBase(const std::vector<int>& indices,
DenseTensor
h_indices_tensor
=
phi
::
Empty
(
dev_ctx_cpu
,
DenseTensorMeta
(
DataType
::
INT32
,
{
4
,
d_out
.
nnz
()},
DataLayout
::
NCHW
));
DenseTensorMeta
(
indices_dtype
,
{
4
,
d_out
.
nnz
()},
DataLayout
::
NCHW
));
phi
::
Copy
(
dev_ctx_gpu
,
d_out
.
non_zero_indices
(),
phi
::
CPUPlace
(),
...
...
@@ -179,8 +175,8 @@ void TestMaxPoolBase(const std::vector<int>& indices,
&
h_indices_tensor
);
int
cmp_indices2
=
memcmp
(
correct_out_indices
.
data
(),
h_indices_tensor
.
data
<
int
>
(),
correct_out_indices
.
size
()
*
sizeof
(
int
));
h_indices_tensor
.
data
<
IntT
>
(),
correct_out_indices
.
size
()
*
sizeof
(
IntT
));
ASSERT_EQ
(
cmp_indices2
,
0
);
DenseTensor
h_features_tensor
=
...
...
@@ -194,23 +190,25 @@ void TestMaxPoolBase(const std::vector<int>& indices,
f_verify
(
h_features_tensor
.
data
<
T
>
(),
correct_out_features
);
if
(
backward
)
{
DenseTensor
x_grad
=
sparse
::
MaxPoolGrad
<
T
>
(
dev_ctx_gpu
,
d_x_tensor
,
d_rulebook
,
d_out
,
d_out
.
non_zero_elements
(),
kernel_sizes
);
DenseTensor
h_features_grad
=
phi
::
EmptyLike
<
T
>
(
dev_ctx_cpu
,
x_grad
);
phi
::
Copy
(
dev_ctx_gpu
,
x_grad
,
phi
::
CPUPlace
(),
true
,
&
h_features_grad
);
SparseCooTensor
x_grad
=
sparse
::
MaxPoolGrad
<
T
>
(
dev_ctx_gpu
,
d_x_tensor
,
d_rulebook
,
d_out
,
d_out
,
kernel_sizes
);
DenseTensor
h_features_grad
=
phi
::
EmptyLike
<
T
>
(
dev_ctx_cpu
,
x_grad
.
non_zero_elements
());
phi
::
Copy
(
dev_ctx_gpu
,
x_grad
.
non_zero_elements
(),
phi
::
CPUPlace
(),
true
,
&
h_features_grad
);
f_verify
(
h_features_grad
.
data
<
T
>
(),
features_grad
);
}
#endif
}
void
TestMaxPool
(
const
std
::
vector
<
int
>&
indices
,
template
<
typename
IntT
=
int
>
void
TestMaxPool
(
const
std
::
vector
<
IntT
>&
indices
,
const
std
::
vector
<
float
>&
features
,
const
DDim
&
x_dims
,
const
std
::
vector
<
int
>&
correct_out_indices
,
const
std
::
vector
<
IntT
>&
correct_out_indices
,
const
std
::
vector
<
float
>&
correct_out_features
,
const
DDim
&
correct_out_dims
,
const
int
non_zero_num
,
...
...
@@ -222,35 +220,35 @@ void TestMaxPool(const std::vector<int>& indices,
const
bool
backward
=
false
,
const
std
::
vector
<
float
>
features_grad
=
{})
{
// test float
TestMaxPoolBase
<
float
>
(
indices
,
features
,
x_dims
,
correct_out_indices
,
correct_out_features
,
correct_out_dims
,
non_zero_num
,
kernel_sizes
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
features_grad
);
TestMaxPoolBase
<
float
,
IntT
>
(
indices
,
features
,
x_dims
,
correct_out_indices
,
correct_out_features
,
correct_out_dims
,
non_zero_num
,
kernel_sizes
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
features_grad
);
// test double
TestMaxPoolBase
<
double
>
(
indices
,
cast
<
float
,
double
>
(
features
),
x_dims
,
correct_out_indices
,
cast
<
float
,
double
>
(
correct_out_features
),
correct_out_dims
,
non_zero_num
,
kernel_sizes
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
cast
<
float
,
double
>
(
features_grad
));
TestMaxPoolBase
<
double
,
IntT
>
(
indices
,
cast
<
float
,
double
>
(
features
),
x_dims
,
correct_out_indices
,
cast
<
float
,
double
>
(
correct_out_features
),
correct_out_dims
,
non_zero_num
,
kernel_sizes
,
paddings
,
strides
,
dilations
,
diff
,
backward
,
cast
<
float
,
double
>
(
features_grad
));
}
TEST
(
DEV_API
,
sparse_maxpool
)
{
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录