Commit 3419de53 (unverified)
Authored May 10, 2021 by Zhang Zheng; committed by GitHub on May 10, 2021
Support different data type between input and output (#32823)
Parent: fbbc3394
Showing 4 changed files with 129 additions and 86 deletions (+129 −86)
paddle/fluid/operators/abs_op.cu (+66 −31)
paddle/fluid/operators/activation_op.cu (+8 −8)
paddle/fluid/operators/elementwise/elementwise_add_op.cu (+1 −1)
paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h (+54 −46)
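
In outline: the elementwise CUDA launcher and its helpers gain separate input and output element types (InT, OutT), and every call site updates accordingly. A hedged, host-only analogue of the new interface shape (illustrative names only; the real launcher is in the last file below and takes tensors, a device context, and a stream):

// Host-only sketch: the element type read (InT) and the element type written
// (OutT) are independent template parameters, as in the new launcher.
#include <cstdio>
#include <vector>

template <typename InT, typename OutT, typename Functor>
void LaunchElementwiseSketch(const std::vector<InT>& in,
                             std::vector<OutT>* out, Functor func) {
  out->resize(in.size());
  for (size_t i = 0; i < in.size(); ++i) (*out)[i] = func(in[i]);
}

int main() {
  std::vector<int> in = {1, -2, 3};
  std::vector<double> out;  // OutT (double) differs from InT (int)
  LaunchElementwiseSketch(in, &out, [](int v) { return v * 0.5; });
  for (double v : out) printf("%g\n", v);  // prints 0.5 -1 1.5
}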
paddle/fluid/operators/abs_op.cu
@@ -13,44 +13,79 @@
 // limitations under the License.
 
 #include "paddle/fluid/operators/abs_op.h"
+#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"
 
+namespace paddle {
+namespace operators {
+
+template <typename T, typename Enable = void>
+struct CudaAbsFunctor;
+
+template <typename T>
+struct CudaAbsFunctor<T, math::Complex<T, math::Real<T>>> {
+  __device__ __forceinline__ math::Real<T> operator()(const T* args) const {
+    return abs(args[0]);
+  }
+};
+
+template <typename T>
+struct CudaAbsFunctor<T, math::NoComplex<T, math::Real<T>>> {
+  __device__ __forceinline__ T operator()(const T* args) const {
+    return std::abs(args[0]);
+  }
+};
+
+template <typename T>
+class AbsKernel<platform::CUDADeviceContext, T>
+    : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* x = context.Input<Tensor>("X");
+    Tensor* out = context.Output<Tensor>("Out");
+    out->mutable_data<math::Real<T>>(context.GetPlace());
+
+    auto& dev_ctx =
+        context.template device_context<platform::CUDADeviceContext>();
+    std::vector<const framework::Tensor*> ins = {x};
+    std::vector<framework::Tensor*> outs = {out};
+    auto functor = CudaAbsFunctor<T>();
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, math::Real<T>>(
+        dev_ctx, ins, &outs, functor);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 
 REGISTER_OP_CUDA_KERNEL(
-    abs, ops::AbsKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::float16>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex64>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex128>);
+    abs, ops::AbsKernel<plat::CUDADeviceContext, float>,
+    ops::AbsKernel<plat::CUDADeviceContext, double>,
+    ops::AbsKernel<plat::CUDADeviceContext, int>,
+    ops::AbsKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::complex128>);
 REGISTER_OP_CUDA_KERNEL(
-    abs_grad, ops::AbsGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::float16>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex64>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex128>);
+    abs_grad, ops::AbsGradKernel<plat::CUDADeviceContext, float>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, double>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, int>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex128>);
 REGISTER_OP_CUDA_KERNEL(
-    abs_grad_grad,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::float16>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::complex64>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::complex128>);
+    abs_grad_grad, ops::AbsDoubleGradKernel<plat::CUDADeviceContext, float>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, double>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex128>);
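
Why abs is the motivating case: for a complex input the absolute value is real, so the functor above returns math::Real<T> and the kernel launches with OutT = math::Real<T> while InT = T. A standalone CUDA sketch of the same input/output split using CUDA's own cuComplex type (a hypothetical example, not part of the commit):

// Elementwise kernel whose input type (cuFloatComplex) differs from its
// output type (float), mirroring what CudaAbsFunctor<complex64> now enables.
#include <cstdio>
#include <cuComplex.h>
#include <cuda_runtime.h>

__global__ void AbsComplexKernel(const cuFloatComplex* in, float* out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = cuCabsf(in[i]);  // |a+bi| = sqrt(a^2+b^2), a real value
}

int main() {
  const int n = 4;
  cuFloatComplex h_in[n] = {
      make_cuFloatComplex(3.f, 4.f), make_cuFloatComplex(0.f, -2.f),
      make_cuFloatComplex(-1.f, 0.f), make_cuFloatComplex(1.f, 1.f)};
  float h_out[n];
  cuFloatComplex* d_in;
  float* d_out;
  cudaMalloc(&d_in, n * sizeof(cuFloatComplex));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
  AbsComplexKernel<<<1, 32>>>(d_in, d_out, n);
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%f\n", h_out[i]);  // 5, 2, 1, sqrt(2)
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}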
paddle/fluid/operators/activation_op.cu
@@ -1315,8 +1315,8 @@ class ActivationCudaKernel
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins, &outs,
-                                                            functor);
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(dev_ctx, ins,
+                                                               &outs, functor);
   }
 };
@@ -1345,17 +1345,17 @@ class ActivationGradCudaKernel
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
-                                                               &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+          dev_ctx, ins, &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
                static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
-                                                               &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+          dev_ctx, ins, &outs, functor);
     } else {
-      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins,
-                                                              &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+          dev_ctx, ins, &outs, functor);
     }
   }
 };
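
These activation call sites are mechanical updates: input and output share one element type, so T is passed for both InT and OutT. Both arguments must be spelled out because tensors are type-erased, leaving neither type deducible from the call. A compilable toy of that constraint (illustrative names, not Paddle code):

// Toy analogue: storage is type-erased (as in framework::Tensor), so neither
// InT nor OutT appears in a deducible position; callers name both explicitly,
// even when they coincide.
#include <cstdio>

struct Tensor {
  void* data;  // type-erased storage
};

template <typename InT, typename OutT, typename Functor>
void Launch(const Tensor& in, Tensor* out, int n, Functor func) {
  const InT* src = static_cast<const InT*>(in.data);
  OutT* dst = static_cast<OutT*>(out->data);
  for (int i = 0; i < n; ++i) dst[i] = func(src[i]);
}

int main() {
  float buf_in[3] = {1.f, 2.f, 3.f}, buf_out[3];
  Tensor in{buf_in}, out{buf_out};
  Launch<float, float>(in, &out, 3, [](float v) { return v * v; });  // T, T
  printf("%g %g %g\n", buf_out[0], buf_out[1], buf_out[2]);  // 1 4 9
}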
paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -45,7 +45,7 @@ struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
                   framework::Tensor* z) {
     std::vector<const framework::Tensor*> ins = {x, y};
     std::vector<framework::Tensor*> outs = {z};
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
         ctx.template device_context<platform::CUDADeviceContext>(), ins, &outs,
         CudaAddFunctor<T>());
   }
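
This one-line change keeps SameDimsElemwiseAdd on the same-type path. For reference, CudaAddFunctor follows the functor contract the launcher assumes, the same shape as the unary CudaAbsFunctor above: operands arrive packed in an array (two entries for kBinary) and the call operator returns one output element. A hedged sketch of that shape (the real CudaAddFunctor is defined elsewhere in Paddle; this only mirrors its assumed contract):

// Sketch of the launcher's binary functor contract: args points at the ET
// packed operands for one output element; the return type is the OutT.
template <typename T>
struct CudaAddFunctorSketch {
  __device__ __forceinline__ T operator()(const T* args) const {
    return args[0] + args[1];  // kBinary: two inputs, one output of type T
  }
};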
paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -49,69 +49,73 @@ int GetVectorizedSizeImpl(const T *pointer) {
   return 1;
 }
 
-template <typename T>
+template <typename InT, typename OutT>
 int GetVectorizedSize(const std::vector<const framework::Tensor *> &ins,
                       const std::vector<framework::Tensor *> &outs) {
   int vec_size = 4;
   for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
     vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<InT>()));
   }
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
     vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<OutT>()));
   }
   return vec_size;
 }
 
-template <ElementwiseType ET, int VecSize, typename T>
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT>
 struct ElementwiseDataWrapper {
-  T *out;
-  const T *in0;
-  const T *in1;
-  __device__ ElementwiseDataWrapper(T *out, const T *in0,
-                                    const T *in1 = nullptr)
+  OutT *out;
+  const InT *in0;
+  const InT *in1;
+  __device__ ElementwiseDataWrapper(OutT *out, const InT *in0,
+                                    const InT *in1 = nullptr)
       : out(out), in0(in0), in1(in1) {}
 
-  using VecType = CudaAlignedVector<T, VecSize>;
+  using InVecType = CudaAlignedVector<InT, VecSize>;
+  using OutVecType = CudaAlignedVector<OutT, VecSize>;
 
-  inline __device__ void load_vector(VecType args[], int idx) {
-    const VecType *x_vec = reinterpret_cast<const VecType *>(in0);
+  inline __device__ void load_vector(InVecType args[], int idx) {
+    const InVecType *x_vec = reinterpret_cast<const InVecType *>(in0);
     args[0] = x_vec[idx];
     if (ET == ElementwiseType::kBinary) {
-      const VecType *y_vec = reinterpret_cast<const VecType *>(in1);
+      const InVecType *y_vec = reinterpret_cast<const InVecType *>(in1);
       args[1] = y_vec[idx];
     }
   }
 
-  inline __device__ void load_scalar(T args[], int idx) {
+  inline __device__ void load_scalar(InT args[], int idx) {
     args[0] = in0[idx];
     if (ET == ElementwiseType::kBinary) {
      args[1] = in1[idx];
    }
  }
 
-  inline __device__ void store_vector(VecType res, int idx) {
-    VecType *out_vec = reinterpret_cast<VecType *>(out);
+  inline __device__ void store_vector(OutVecType res, int idx) {
+    OutVecType *out_vec = reinterpret_cast<OutVecType *>(out);
     out_vec[idx] = res;
   }
 
-  inline __device__ void store_scalar(T res, int idx) { out[idx] = res; }
+  inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; }
 };
 
-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
 __device__ void VectorizedKernelImpl(
-    ElementwiseDataWrapper<ET, VecSize, T> data, Functor func, int tid) {
-  using VecType = CudaAlignedVector<T, VecSize>;
-  VecType ins_vec[ET];
-  VecType out_vec;
-  T *ins_ptr[ET];
-  T *out_ptr;
+    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+    int tid) {
+  using InVecType = CudaAlignedVector<InT, VecSize>;
+  using OutVecType = CudaAlignedVector<OutT, VecSize>;
+  InVecType ins_vec[ET];
+  OutVecType out_vec;
+  InT *ins_ptr[ET];
+  OutT *out_ptr;
 #pragma unroll
   for (int i = 0; i < ET; ++i) {
-    ins_ptr[i] = reinterpret_cast<T *>(&(ins_vec[i]));
+    ins_ptr[i] = reinterpret_cast<InT *>(&(ins_vec[i]));
   }
-  out_ptr = reinterpret_cast<T *>(&out_vec);
+  out_ptr = reinterpret_cast<OutT *>(&out_vec);
   // load
   data.load_vector(ins_vec, tid);
@@ -119,7 +123,7 @@ __device__ void VectorizedKernelImpl(
   // compute
 #pragma unroll
   for (int i = 0; i < VecSize; ++i) {
-    T ins[ET];
+    InT ins[ET];
 #pragma unroll
     for (int j = 0; j < ET; ++j) {
       ins[j] = ins_ptr[j][i];
@@ -131,11 +135,13 @@ __device__ void VectorizedKernelImpl(
   data.store_vector(out_vec, tid);
 }
 
-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
-__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
-                                 Functor func, int start, int remain) {
-  T ins[ET];
-  T out;
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
+__device__ void ScalarKernelImpl(
+    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+    int start, int remain) {
+  InT ins[ET];
+  OutT out;
   for (int i = 0; i < remain; ++i) {
     int idx = start + i;
@@ -148,14 +154,15 @@ __device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
   }
 }
 
-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
-__global__ void VectorizedKernel(const T *__restrict__ in0,
-                                 const T *__restrict__ in1, T *out, int size,
-                                 Functor func) {
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
+__global__ void VectorizedKernel(const InT *__restrict__ in0,
+                                 const InT *__restrict__ in1, OutT *out,
+                                 int size, Functor func) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = size - VecSize * tid;
   remain = remain > 0 ? remain : 0;
-  auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
+  auto data = ElementwiseDataWrapper<ET, VecSize, InT, OutT>(out, in0, in1);
   if (remain >= VecSize) {
     VectorizedKernelImpl(data, func, tid);
   } else {
@@ -163,30 +170,31 @@ __global__ void VectorizedKernel(const T *__restrict__ in0,
   }
 }
 
-template <ElementwiseType ET, typename T, typename Functor>
-__global__ void ScalarKernel(const T *__restrict__ in0,
-                             const T *__restrict__ in1, T *out, int size,
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+__global__ void ScalarKernel(const InT *__restrict__ in0,
+                             const InT *__restrict__ in1, OutT *out, int size,
                              Functor func) {
-  auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
+  auto data = ElementwiseDataWrapper<ET, 1, InT, OutT>(out, in0, in1);
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = tid < size ? 1 : 0;
   ScalarKernelImpl(data, func, tid, remain);
 }
 
-template <ElementwiseType ET, typename T, typename Functor>
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
 void LaunchElementwiseCudaKernel(
     const platform::CUDADeviceContext &ctx,
     const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
   // calculate the max vec_size for all ins and outs
   auto size = ins[0]->numel();
-  int vec_size = GetVectorizedSize<T>(ins, *outs);
+  int vec_size = GetVectorizedSize<InT, OutT>(ins, *outs);
   int block_size = ELEMENTWISE_BLOCK_SIZE;
   int grid_size =
       ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
-  const T *in0 = ins[0]->data<T>();
-  const T *in1 =
-      (ET == ElementwiseType::kBinary) ? ins[1]->data<T>() : nullptr;
-  T *out = (*outs)[0]->data<T>();
+  const InT *in0 = ins[0]->data<InT>();
+  const InT *in1 =
+      (ET == ElementwiseType::kBinary) ? ins[1]->data<InT>() : nullptr;
+  OutT *out = (*outs)[0]->data<OutT>();
   // cuda kernel
   auto stream = ctx.stream();
   switch (vec_size) {
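
Summing up the header changes: loads go through InT, stores through OutT, and the vectorization width is the minimum alignment across all inputs and outputs, with grid_size = ceil(ceil(size / vec_size) / block_size). Below is a self-contained CUDA sketch of the same vectorized InT/OutT pattern with the tensor plumbing stripped out; every name in it is illustrative rather than Paddle's:

// Standalone sketch of the vectorized elementwise pattern with independent
// input/output element types (unary case only).
#include <cstdio>
#include <cuda_runtime.h>

// Illustrative stand-in for CudaAlignedVector: VecSize elements moved as one
// aligned unit.
template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) AlignedVector {
  T val[VecSize];
};

// Each thread handles VecSize elements; the ragged tail falls back to scalar.
template <int VecSize, typename InT, typename OutT, typename Functor>
__global__ void VectorizedUnary(const InT* __restrict__ in, OutT* out,
                                int size, Functor func) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  int base = tid * VecSize;
  if (base + VecSize <= size) {
    using InVec = AlignedVector<InT, VecSize>;
    using OutVec = AlignedVector<OutT, VecSize>;
    InVec x = reinterpret_cast<const InVec*>(in)[tid];  // one vector load
    OutVec y;
#pragma unroll
    for (int i = 0; i < VecSize; ++i) y.val[i] = func(x.val[i]);
    reinterpret_cast<OutVec*>(out)[tid] = y;  // one vector store
  } else {
    for (int i = base; i < size; ++i) out[i] = func(in[i]);  // remainder
  }
}

struct IntToFloatScale {  // InT != OutT: reads int, writes float
  __device__ float operator()(int v) const { return 0.5f * v; }
};

int main() {
  const int n = 10;
  int h_in[n];
  for (int i = 0; i < n; ++i) h_in[i] = i;
  int* d_in;
  float* d_out;
  float h_out[n];
  cudaMalloc(&d_in, n * sizeof(int));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(int), cudaMemcpyHostToDevice);
  const int kVecSize = 4, kBlock = 128;
  // Same grid-size formula as LaunchElementwiseCudaKernel in the diff.
  int grid = ((n + kVecSize - 1) / kVecSize + kBlock - 1) / kBlock;
  VectorizedUnary<kVecSize><<<grid, kBlock>>>(d_in, d_out, n,
                                              IntToFloatScale{});
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g ", h_out[i]);  // 0 0.5 1 ... 4.5
  printf("\n");
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}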