Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
9b48199a
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
9b48199a
编写于
7月 02, 2021
作者:
N
niuliling123
提交者:
GitHub
7月 02, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
modified reduce_all_op reduce_any_op for higher performance (#33267)
上级
4c352033
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
112 addition
and
122 deletion
+112
-122
paddle/fluid/operators/reduce_ops/reduce_all_op.cu
paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+4
-2
paddle/fluid/operators/reduce_ops/reduce_any_op.cu
paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+5
-2
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
+51
-108
paddle/fluid/operators/reduce_ops/reduce_op.h
paddle/fluid/operators/reduce_ops/reduce_op.h
+52
-0
paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
+0
-10
未找到文件。
paddle/fluid/operators/reduce_ops/reduce_all_op.cu
浏览文件 @
9b48199a
...
...
@@ -13,7 +13,9 @@
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
// reduce_all
REGISTER_OP_CUDA_KERNEL
(
reduce_all
,
ops
::
BoolReduceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
bool
,
ops
::
AllFunctor
>
);
reduce_all
,
ops
::
ReduceCudaKernel
<
bool
,
paddle
::
operators
::
CustomLogicalAnd
>
);
paddle/fluid/operators/reduce_ops/reduce_any_op.cu
浏览文件 @
9b48199a
...
...
@@ -13,7 +13,10 @@
// limitations under the License.
#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
// reduce_any
REGISTER_OP_CUDA_KERNEL
(
reduce_any
,
ops
::
BoolReduceKernel
<
paddle
::
platform
::
CUDADeviceContext
,
bool
,
ops
::
AnyFuncto
r
>
);
reduce_any
,
ops
::
ReduceCudaKernel
<
bool
,
paddle
::
operators
::
CustomLogicalO
r
>
);
paddle/fluid/operators/reduce_ops/reduce_op.cu.h
浏览文件 @
9b48199a
...
...
@@ -62,27 +62,6 @@ struct DivideFunctor {
T
n_inv
;
};
// Builds the list of axes a reduce op should collapse.
//
// dims:       axes requested by the caller; a negative entry is shifted by
//             dim_size so that it counts from the last axis.
// dim_size:   number of dimensions of the input.
// reduce_all: when true, `dims` is ignored and every axis 0..dim_size-1 is
//             returned.
//
// Each requested axis is checked with PADDLE_ENFORCE_LT against dim_size and
// reported as an InvalidArgument error when the check fails.
// NOTE(review): only the upper bound is enforced; an entry below -dim_size
// still produces a negative axis in the result. Confirm callers never pass one.
static inline std::vector<int> GetReduceDim(const std::vector<int>& dims,
                                            int dim_size, bool reduce_all) {
  std::vector<int> reduce_dims;
  if (reduce_all) {
    reduce_dims.resize(dim_size);
    // Use a signed bound so the loop does not compare `int` with the unsigned
    // result of size().
    const int reduce_size = static_cast<int>(reduce_dims.size());
    for (int i = 0; i < reduce_size; ++i) {
      reduce_dims[i] = i;
    }
  } else {
    reduce_dims.reserve(dims.size());
    for (auto e : dims) {
      PADDLE_ENFORCE_LT(e, dim_size,
                        paddle::platform::errors::InvalidArgument(
                            "ReduceOp: invalid axis, when x_dims is %d, "
                            "axis[i] should less than x_dims, but got %d.",
                            dim_size, e));
      // Map a negative axis onto its non-negative equivalent.
      reduce_dims.push_back(e >= 0 ? e : e + dim_size);
    }
  }
  return reduce_dims;
}
static
inline
int
GetLastPow2
(
int
n
)
{
n
|=
(
n
>>
1
);
n
|=
(
n
>>
2
);
...
...
@@ -167,8 +146,9 @@ enum ReduceType {
// reduce config
template
<
typename
Ty
>
struct
ReduceConfig
{
ReduceConfig
(
std
::
vector
<
int
>
origin_reduce_dims
,
std
::
vector
<
int
>
x_dim
)
:
reduce_dims_origin
(
origin_reduce_dims
),
x_dim
(
x_dim
)
{}
ReduceConfig
(
const
std
::
vector
<
int
>&
origin_reduce_dims
,
const
std
::
vector
<
int
>&
origin_x_dim
)
:
reduce_dims_origin
(
origin_reduce_dims
),
x_dim
(
origin_x_dim
)
{}
// get the parameters of reduceKernel
void
Run
()
{
...
...
@@ -530,22 +510,22 @@ __device__ __forceinline__ void ReduceAny(
// module function designed for global function
template
<
typename
Tx
,
typename
Ty
,
typename
ReduceOp
,
typename
TransformOp
,
int
BlockDim
,
int
Rank
,
int
ReduceRank
,
int
ReduceType
>
int
BlockDim
,
int
Rank
,
int
ReduceRank
>
__device__
__forceinline__
void
ReduceModule
(
const
Tx
*
x
,
Ty
*
y
,
ReduceOp
reducer
,
TransformOp
transformer
,
Ty
init
,
int
reduce_num
,
int
left_num
,
int
blocking_size
,
int
reduce_num
,
int
left_num
,
int
blocking_size
,
int
reduce_type
,
paddle
::
framework
::
Array
<
int
,
Rank
>
x_strides
,
paddle
::
framework
::
Array
<
int
,
ReduceRank
>
reduce_dim
,
paddle
::
framework
::
Array
<
int
,
ReduceRank
>
reduce_strides
,
paddle
::
framework
::
Array
<
int
,
Rank
-
ReduceRank
>
left_dim
,
paddle
::
framework
::
Array
<
int
,
Rank
-
ReduceRank
>
left_strides
)
{
// reduce_rank == 1 && reduce_dim[0] == x_dim.size() - 1
if
(
ReduceT
ype
==
ReduceType
::
kReduceLastDim
)
{
if
(
reduce_t
ype
==
ReduceType
::
kReduceLastDim
)
{
ReduceLastDim
<
Tx
,
Ty
,
ReduceOp
,
TransformOp
,
BlockDim
>
(
x
,
y
,
reducer
,
transformer
,
init
,
reduce_num
);
// reduce_rank == 1 && reduce_dim[0] != x_dim.size() - 1
}
else
if
(
ReduceT
ype
==
ReduceType
::
kReduceHigherDim
)
{
}
else
if
(
reduce_t
ype
==
ReduceType
::
kReduceHigherDim
)
{
ReduceHigherDim
<
Tx
,
Ty
,
ReduceOp
,
TransformOp
>
(
x
,
y
,
reducer
,
transformer
,
init
,
reduce_num
,
left_num
,
blocking_size
);
...
...
@@ -558,57 +538,47 @@ __device__ __forceinline__ void ReduceModule(
}
template
<
typename
Tx
,
typename
Ty
,
typename
ReduceOp
,
typename
TransformOp
,
int
BlockDim
,
int
Rank
,
int
ReduceRank
,
int
ReduceType
>
int
BlockDim
,
int
Rank
,
int
ReduceRank
>
__global__
void
ReduceKernelFunction
(
const
Tx
*
x
,
Ty
*
y
,
ReduceOp
reducer
,
TransformOp
transformer
,
Ty
init
,
int
reduce_num
,
int
left_num
,
int
block_size
,
int
reduce_num
,
int
left_num
,
int
block_size
,
int
reduce_type
,
paddle
::
framework
::
Array
<
int
,
Rank
>
x_strides
,
paddle
::
framework
::
Array
<
int
,
ReduceRank
>
reduce_dim
,
paddle
::
framework
::
Array
<
int
,
ReduceRank
>
reduce_strides
,
paddle
::
framework
::
Array
<
int
,
Rank
-
ReduceRank
>
left_dim
,
paddle
::
framework
::
Array
<
int
,
Rank
-
ReduceRank
>
left_strides
)
{
ReduceModule
<
Tx
,
Ty
,
ReduceOp
,
TransformOp
,
BlockDim
,
Rank
,
ReduceRank
,
ReduceType
>
(
x
,
y
,
reducer
,
transformer
,
init
,
reduce_num
,
left_num
,
block_size
,
x_strides
,
reduce
_dim
,
reduce_strides
,
left_dim
,
left_strides
);
ReduceModule
<
Tx
,
Ty
,
ReduceOp
,
TransformOp
,
BlockDim
,
Rank
,
ReduceRank
>
(
x
,
y
,
reducer
,
transformer
,
init
,
reduce_num
,
left_num
,
block_size
,
reduce_type
,
x_strides
,
reduce_dim
,
reduce_strides
,
left
_dim
,
left_strides
);
}
template
<
typename
Tx
,
typename
Ty
,
int
BlockDim
,
typename
ReduceOp
,
typename
TransformOp
,
int
kRank
,
int
kReduceRank
>
static
void
LaunchKernel
(
const
Tx
*
x_data
,
Ty
*
y_data
,
const
ReduceOp
&
reducer
,
const
TransformOp
&
transformer
,
Ty
init
,
gpuStream_t
stream
,
ReduceConfig
<
Ty
>
config
)
{
#define CUB_REDUCE_TYPE_CASE(type) \
case type: { \
constexpr auto kReduceType = type; \
ReduceKernelFunction< \
Tx, Ty, ReduceOp, TransformOp, BlockDim, kRank, kReduceRank, \
kReduceType><<<config.grid, config.block, 0, stream>>>( \
x_data, config.output_data, reducer, transformer, init, \
config.reduce_num, config.left_num, config.blocking_size, \
detail::VectorToArray<int, kRank>(config.x_strides), \
detail::VectorToArray<int, kReduceRank>(config.reduce_dim), \
detail::VectorToArray<int, kReduceRank>(config.reduce_strides), \
detail::VectorToArray<int, kRank - kReduceRank>(config.left_dim), \
detail::VectorToArray<int, kRank - kReduceRank>(config.left_strides)); \
} break
switch
(
config
.
reduce_type
)
{
CUB_REDUCE_TYPE_CASE
(
1
);
// reduceLastDim
CUB_REDUCE_TYPE_CASE
(
2
);
// ReduceHigherDim
CUB_REDUCE_TYPE_CASE
(
3
);
// reduceAny
}
template
<
typename
Tx
,
typename
Ty
,
int
BlockDim
,
typename
ReduceOp
,
int
kRank
,
int
kReduceRank
>
static
void
LaunchReduceKernel
(
const
Tx
*
x_data
,
Ty
*
y_data
,
const
ReduceOp
&
reducer
,
Ty
init
,
gpuStream_t
stream
,
ReduceConfig
<
Ty
>
config
)
{
using
TransformOp
=
typename
ReduceOp
::
Transformer
;
ReduceKernelFunction
<
Tx
,
Ty
,
ReduceOp
,
TransformOp
,
BlockDim
,
kRank
,
kReduceRank
><<<
config
.
grid
,
config
.
block
,
0
,
stream
>>>
(
x_data
,
config
.
output_data
,
reducer
,
TransformOp
(
config
.
reduce_num
),
init
,
config
.
reduce_num
,
config
.
left_num
,
config
.
blocking_size
,
config
.
reduce_type
,
detail
::
VectorToArray
<
int
,
kRank
>
(
config
.
x_strides
),
detail
::
VectorToArray
<
int
,
kReduceRank
>
(
config
.
reduce_dim
),
detail
::
VectorToArray
<
int
,
kReduceRank
>
(
config
.
reduce_strides
),
detail
::
VectorToArray
<
int
,
kRank
-
kReduceRank
>
(
config
.
left_dim
),
detail
::
VectorToArray
<
int
,
kRank
-
kReduceRank
>
(
config
.
left_strides
));
if
(
config
.
should_reduce_again
)
{
dim3
block
(
config
.
block
.
x
,
1
,
1
);
dim3
grid
(
config
.
grid
.
x
,
1
,
config
.
grid
.
z
);
ReduceKernelFunction
<
Ty
,
Ty
,
ReduceOp
,
detail
::
IdentityFunctor
<
Ty
>
,
128
,
kRank
,
kReduceRank
,
ReduceType
::
kReduceHigherDim
><<<
grid
,
block
,
0
,
stream
>>>
(
ReduceKernelFunction
<
Ty
,
Ty
,
ReduceOp
,
detail
::
IdentityFunctor
<
Ty
>
,
128
,
kRank
,
kReduceRank
><<<
grid
,
block
,
0
,
stream
>>>
(
config
.
output_data
,
y_data
,
reducer
,
detail
::
IdentityFunctor
<
Ty
>
(
config
.
grid
.
y
),
init
,
config
.
grid
.
y
,
config
.
left_num
,
config
.
grid
.
y
,
config
.
left_num
,
config
.
grid
.
y
,
ReduceType
::
kReduceHigherDim
,
detail
::
VectorToArray
<
int
,
kRank
>
(
config
.
x_strides
),
detail
::
VectorToArray
<
int
,
kReduceRank
>
(
config
.
reduce_dim
),
detail
::
VectorToArray
<
int
,
kReduceRank
>
(
config
.
reduce_strides
),
...
...
@@ -617,12 +587,10 @@ static void LaunchKernel(const Tx* x_data, Ty* y_data, const ReduceOp& reducer,
}
}
template
<
typename
Tx
,
typename
Ty
,
int
BlockDim
,
typename
ReduceOp
,
typename
TransformOp
>
static
void
LaunchReduceKernel
(
const
Tx
*
x_data
,
Ty
*
y_data
,
const
ReduceOp
&
reducer
,
const
TransformOp
&
transformer
,
Ty
init
,
gpuStream_t
stream
,
ReduceConfig
<
Ty
>
config
)
{
template
<
typename
Tx
,
typename
Ty
,
int
BlockDim
,
typename
ReduceOp
>
static
void
ReduceKernelImpl
(
const
Tx
*
x_data
,
Ty
*
y_data
,
const
ReduceOp
&
reducer
,
Ty
init
,
gpuStream_t
stream
,
ReduceConfig
<
Ty
>
config
)
{
int
reduce_rank
=
config
.
reduce_strides
.
size
();
int
rank
=
config
.
x_strides
.
size
();
...
...
@@ -632,11 +600,11 @@ static void LaunchReduceKernel(const Tx* x_data, Ty* y_data,
switch (reduce_rank) { __VA_ARGS__; } \
} break
#define CUB_REDUCE_RANK_CASE(i, ...)
\
case i: {
\
constexpr auto kReduceRank = i;
\
Launch
Kernel<Tx, Ty, BlockDim, ReduceOp, Transform
Op, kRank, kReduceRank>( \
x_data, y_data, reducer,
transformer, init, stream, config);
\
#define CUB_REDUCE_RANK_CASE(i, ...) \
case i: { \
constexpr auto kReduceRank = i; \
Launch
ReduceKernel<Tx, Ty, BlockDim, Reduce
Op, kRank, kReduceRank>( \
x_data, y_data, reducer,
init, stream, config);
\
} break
detail
::
CheckReduceRank
(
reduce_rank
,
rank
);
...
...
@@ -671,15 +639,13 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
auto
config
=
ReduceConfig
<
Ty
>
(
origin_reduce_dims
,
x_dim
);
config
.
Run
();
// get the parameters of LaunchReduceKernel
auto
x_data
=
x
.
data
<
Tx
>
();
auto
y_data
=
y
->
mutable_data
<
Ty
>
(
x
.
place
());
// after config.run()
// SetOutputData for ReduceHigherDim when should_reduce_again is true,
// temp_output should be stored temp_data in output_data space or stored in
// y_data;
framework
::
Tensor
tmp
;
config
.
SetOutputData
(
y_data
,
x
.
place
(),
&
tmp
);
auto
x_data
=
x
.
data
<
Tx
>
();
auto
y_data
=
y
->
mutable_data
<
Ty
>
(
x
.
place
());
if
(
config
.
reduce_num
==
1
)
{
auto
out_dims
=
y
->
dims
();
...
...
@@ -687,6 +653,9 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
y
->
Resize
(
out_dims
);
return
;
}
config
.
SetOutputData
(
y_data
,
x
.
place
(),
&
tmp
);
using
TransformOp
=
typename
ReduceOp
<
Tx
,
Ty
>::
Transformer
;
auto
reducer
=
ReduceOp
<
Tx
,
Ty
>
();
// launch CUB::Reduce
...
...
@@ -708,12 +677,11 @@ void TensorReduceFunctorImpl(const framework::Tensor& x, framework::Tensor* y,
return
;
}
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
LaunchReduceKernel<Tx, Ty, block_dim, ReduceOp<Tx, Ty>, TransformOp>( \
x_data, y_data, reducer, TransformOp(config.reduce_num), \
reducer.initial(), stream, config); \
#define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \
constexpr auto kBlockDim = block_dim; \
ReduceKernelImpl<Tx, Ty, block_dim, ReduceOp<Tx, Ty>>( \
x_data, y_data, reducer, reducer.initial(), stream, config); \
} break
switch
(
detail
::
GetBlockDim
(
config
.
reduce_num
))
{
...
...
@@ -745,30 +713,5 @@ struct TensorReduceFunc {
}
};
// Op kernel that runs a reduction described by ReduceOp over elements of
// type T, using the stream of the CUDA device context.
template <typename T, template <typename, typename> class ReduceOp>
class ReduceCudaKernel : public framework::OpKernel<T> {
 public:
  // Reads the "reduce_all", "out_dtype" and "dim" attributes plus the "X"
  // input and "Out" output from `context`, resolves the axes to reduce, and
  // forwards everything to the tensor reduce implementation.
  void Compute(const framework::ExecutionContext& context) const override {
    const bool reduce_every_axis = context.Attr<bool>("reduce_all");
    const Tensor* x = context.Input<Tensor>("X");
    Tensor* out = context.Output<Tensor>("Out");
    const int requested_dtype = context.Attr<int>("out_dtype");
    std::vector<int> axes_attr = context.Attr<std::vector<int>>("dim");

    std::vector<int> axes =
        detail::GetReduceDim(axes_attr, x->dims().size(), reduce_every_axis);
    gpuStream_t cuda_stream = context.cuda_device_context().stream();

    if (requested_dtype < 0) {
      // No output dtype requested: the result keeps the input type T.
      TensorReduceFunctorImpl<T, T, ReduceOp>(*x, out, axes, cuda_stream);
      return;
    }

    // An output dtype was requested: dispatch on it through the visitor.
    const auto visited_type =
        static_cast<framework::proto::VarType::Type>(requested_dtype);
    framework::VisitDataTypeSmall(
        visited_type,
        TensorReduceFunc<T, ReduceOp>(*x, out, axes, cuda_stream));
  }
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/reduce_ops/reduce_op.h
浏览文件 @
9b48199a
...
...
@@ -23,6 +23,9 @@ limitations under the License. */
#include "paddle/fluid/operators/cast_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
#if defined(__HIPCC__) || defined(__NVCC__)
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#endif
namespace
paddle
{
namespace
operators
{
...
...
@@ -60,6 +63,27 @@ inline void GetShuffledDim(const DDim& src_dims, DDim* dst_dims,
}
}
// Resolves which axes a reduce op collapses.
//
// With reduce_all set, `dims` is ignored and the result is 0, 1, ...,
// dim_size - 1. Otherwise each entry of `dims` is checked with
// PADDLE_ENFORCE_LT against dim_size (InvalidArgument on failure), and a
// negative entry is shifted by dim_size before being appended.
// NOTE(review): entries below -dim_size are not rejected and stay negative
// after the shift. Confirm against callers.
static inline std::vector<int> GetReduceDim(const std::vector<int>& dims,
                                            int dim_size, bool reduce_all) {
  std::vector<int> axes;

  if (!reduce_all) {
    for (int axis : dims) {
      PADDLE_ENFORCE_LT(axis, dim_size,
                        paddle::platform::errors::InvalidArgument(
                            "ReduceOp: invalid axis, when x_dims is %d, "
                            "axis[i] should less than x_dims, but got %d.",
                            dim_size, axis));
      int normalized = axis;
      if (normalized < 0) {
        normalized += dim_size;
      }
      axes.push_back(normalized);
    }
    return axes;
  }

  // Reduce over every axis: fill with consecutive indices starting at 0.
  axes.resize(dim_size);
  int next_axis = 0;
  for (int& slot : axes) {
    slot = next_axis;
    ++next_axis;
  }
  return axes;
}
template
<
typename
DeviceContext
,
typename
OutT
>
void
GetShuffledInput
(
const
framework
::
ExecutionContext
&
context
,
const
Tensor
*
input
,
Tensor
*
shuffled_input
,
...
...
@@ -308,6 +332,7 @@ class BoolReduceKernel : public framework::OpKernel<OutT> {
}
}
};
template
<
typename
DeviceContext
,
typename
T
,
typename
Functor
,
bool
kNoNeedBufferX
=
false
,
bool
kNoNeedBufferY
=
false
>
class
ReduceGradKernel
:
public
framework
::
OpKernel
<
T
>
{
...
...
@@ -636,6 +661,33 @@ If reduce_all is true, just reduce along all dimensions and output a scalar.
virtual
std
::
string
GetOpType
()
const
=
0
;
};
#if defined(__HIPCC__) || defined(__NVCC__)
// Op kernel that runs a reduction described by ReduceOp over elements of
// type T, using the stream of the CUDA device context.
template <typename T, template <typename, typename> class ReduceOp>
class ReduceCudaKernel : public framework::OpKernel<T> {
 public:
  // Gathers the op's attributes ("reduce_all", "out_dtype", "dim"), its "X"
  // input and "Out" output, computes the axes to reduce with GetReduceDim,
  // and hands the work to the tensor reduce implementation.
  void Compute(const framework::ExecutionContext& context) const override {
    const bool all_axes = context.Attr<bool>("reduce_all");
    const Tensor* src = context.Input<Tensor>("X");
    Tensor* dst = context.Output<Tensor>("Out");
    const int dtype_attr = context.Attr<int>("out_dtype");
    std::vector<int> dim_attr = context.Attr<std::vector<int>>("dim");

    const int rank = src->dims().size();
    std::vector<int> axes = GetReduceDim(dim_attr, rank, all_axes);
    gpuStream_t gpu_stream = context.cuda_device_context().stream();

    const bool has_out_dtype = dtype_attr >= 0;
    if (!has_out_dtype) {
      // Without a requested dtype the output uses the input type T.
      TensorReduceFunctorImpl<T, T, ReduceOp>(*src, dst, axes, gpu_stream);
      return;
    }

    // Dispatch on the requested output dtype through the visitor.
    framework::VisitDataTypeSmall(
        static_cast<framework::proto::VarType::Type>(dtype_attr),
        TensorReduceFunc<T, ReduceOp>(*src, dst, axes, gpu_stream));
  }
};
#endif
}
// namespace operators
}
// namespace paddle
...
...
paddle/fluid/operators/reduce_ops/reduce_prod_op.cu
浏览文件 @
9b48199a
...
...
@@ -16,18 +16,8 @@
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_prod_op.h"
// reduce_prod
#ifdef __HIPCC__
// Eigen3/unsupported/Eigen/CXX11/src/Tensor/TensorReductionGpu.h:922
// do not support double in HIPCC platform (Eigen3 to be fixed)
REGISTER_OP_CUDA_KERNEL
(
reduce_prod
,
ops
::
ReduceCudaKernel
<
float
,
paddle
::
operators
::
CustomMul
>
,
ops
::
ReduceCudaKernel
<
int
,
paddle
::
operators
::
CustomMul
>
,
ops
::
ReduceCudaKernel
<
int64_t
,
paddle
::
operators
::
CustomMul
>
);
#else
REGISTER_OP_CUDA_KERNEL
(
reduce_prod
,
ops
::
ReduceCudaKernel
<
float
,
paddle
::
operators
::
CustomMul
>
,
ops
::
ReduceCudaKernel
<
int
,
paddle
::
operators
::
CustomMul
>
,
ops
::
ReduceCudaKernel
<
double
,
paddle
::
operators
::
CustomMul
>
,
ops
::
ReduceCudaKernel
<
int64_t
,
paddle
::
operators
::
CustomMul
>
);
#endif
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录