PaddlePaddle / Paddle · Commit 3419de53 (unverified)

Support different data type between input and output (#32823)

Authored by Zhang Zheng on May 10, 2021; committed via GitHub on May 10, 2021.
Parent: fbbc3394

Showing 4 changed files with 129 additions and 86 deletions.
paddle/fluid/operators/abs_op.cu                             +66  -31
paddle/fluid/operators/activation_op.cu                       +8   -8
paddle/fluid/operators/elementwise/elementwise_add_op.cu      +1   -1
paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h  +54  -46
paddle/fluid/operators/abs_op.cu
@@ -13,44 +13,79 @@
 // limitations under the License.

 #include "paddle/fluid/operators/abs_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
 #include "paddle/fluid/platform/complex128.h"
 #include "paddle/fluid/platform/complex64.h"
 #include "paddle/fluid/platform/float16.h"

 namespace paddle {
 namespace operators {

 template <typename T, typename Enable = void>
 struct CudaAbsFunctor;

 template <typename T>
 struct CudaAbsFunctor<T, math::Complex<T, math::Real<T>>> {
   __device__ __forceinline__ math::Real<T> operator()(const T* args) const {
     return abs(args[0]);
   }
 };

 template <typename T>
 struct CudaAbsFunctor<T, math::NoComplex<T, math::Real<T>>> {
   __device__ __forceinline__ T operator()(const T* args) const {
     return std::abs(args[0]);
   }
 };

 template <typename T>
 class AbsKernel<platform::CUDADeviceContext, T>
     : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     const Tensor* x = context.Input<Tensor>("X");
     Tensor* out = context.Output<Tensor>("Out");
     out->mutable_data<math::Real<T>>(context.GetPlace());

     auto& dev_ctx =
         context.template device_context<platform::CUDADeviceContext>();
     std::vector<const framework::Tensor*> ins = {x};
     std::vector<framework::Tensor*> outs = {out};
     auto functor = CudaAbsFunctor<T>();
     LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, math::Real<T>>(
         dev_ctx, ins, &outs, functor);
   }
 };

 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

 REGISTER_OP_CUDA_KERNEL(
-    abs, ops::AbsKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::float16>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex64>,
-    ops::AbsKernel<paddle::platform::CUDADeviceContext,
-                   paddle::platform::complex128>);
+    abs, ops::AbsKernel<plat::CUDADeviceContext, float>,
+    ops::AbsKernel<plat::CUDADeviceContext, double>,
+    ops::AbsKernel<plat::CUDADeviceContext, int>,
+    ops::AbsKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsKernel<plat::CUDADeviceContext, plat::complex128>);
 REGISTER_OP_CUDA_KERNEL(
-    abs_grad, ops::AbsGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::float16>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex64>,
-    ops::AbsGradKernel<paddle::platform::CUDADeviceContext,
-                       paddle::platform::complex128>);
+    abs_grad, ops::AbsGradKernel<plat::CUDADeviceContext, float>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, double>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, int>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsGradKernel<plat::CUDADeviceContext, plat::complex128>);
 REGISTER_OP_CUDA_KERNEL(
-    abs_grad_grad,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::float16>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::complex64>,
-    ops::AbsDoubleGradKernel<paddle::platform::CUDADeviceContext,
-                             paddle::platform::complex128>);
+    abs_grad_grad, ops::AbsDoubleGradKernel<plat::CUDADeviceContext, float>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, double>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, int64_t>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::float16>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex64>,
+    ops::AbsDoubleGradKernel<plat::CUDADeviceContext, plat::complex128>);
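The point of the launcher's new third template argument shows up in this file: for complex element types, abs maps a complex input to a real-valued output (math::Real<T>), so the launcher can no longer assume the output tensor shares the input's element type. Below is a minimal standalone sketch of the same InT-to-OutT idea in plain CUDA; it is not Paddle code, and UnaryKernel and AbsFunctor are hypothetical names standing in for the machinery above.

// Hedged sketch: |a+bi| computed from cuFloatComplex inputs into float outputs.
#include <cuComplex.h>
#include <cuda_runtime.h>
#include <cstdio>

struct AbsFunctor {  // analogous in shape to CudaAbsFunctor for complex T
  __device__ __forceinline__ float operator()(const cuFloatComplex *args) const {
    return cuCabsf(args[0]);  // sqrt(a^2 + b^2): output type != input type
  }
};

template <typename InT, typename OutT, typename Functor>
__global__ void UnaryKernel(const InT *in, OutT *out, int n, Functor func) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) out[i] = func(&in[i]);  // OutT is deduced separately from InT
}

int main() {
  const int n = 3;
  cuFloatComplex h_in[n] = {{3.f, 4.f}, {0.f, 1.f}, {-2.f, 0.f}};
  float h_out[n];
  cuFloatComplex *d_in;
  float *d_out;
  cudaMalloc(&d_in, n * sizeof(cuFloatComplex));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(cuFloatComplex), cudaMemcpyHostToDevice);
  UnaryKernel<<<1, 32>>>(d_in, d_out, n, AbsFunctor{});
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g\n", h_out[i]);  // 5, 1, 2
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}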
paddle/fluid/operators/activation_op.cu
@@ -1315,8 +1315,8 @@ class ActivationCudaKernel
     for (auto& attr : attrs) {
       *attr.second = ctx.Attr<float>(attr.first);
     }
-    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins,
-                                                            &outs, functor);
+    LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(dev_ctx, ins,
+                                                               &outs, functor);
   }
 };
@@ -1345,17 +1345,17 @@ class ActivationGradCudaKernel
     if (static_cast<int>(Functor::FwdDeps()) == static_cast<int>(kDepOut)) {
       // Only need forward output Out
       ins.push_back(out);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
-                                                               &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+          dev_ctx, ins, &outs, functor);
     } else if (static_cast<int>(Functor::FwdDeps()) ==
                static_cast<int>(kDepX)) {
       // Only need forward input X
       ins.push_back(x);
-      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(dev_ctx, ins,
-                                                               &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+          dev_ctx, ins, &outs, functor);
     } else {
-      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T>(dev_ctx, ins,
-                                                              &outs, functor);
+      LaunchElementwiseCudaKernel<ElementwiseType::kUnary, T, T>(
+          dev_ctx, ins, &outs, functor);
     }
   }
 };
paddle/fluid/operators/elementwise/elementwise_add_op.cu
@@ -45,7 +45,7 @@ struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
                                  framework::Tensor* z) {
     std::vector<const framework::Tensor*> ins = {x, y};
     std::vector<framework::Tensor*> outs = {z};
-    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T>(
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
         ctx.template device_context<platform::CUDADeviceContext>(), ins, &outs,
         CudaAddFunctor<T>());
   }
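The call-site change here (and in activation_op.cu above) is purely mechanical: add keeps the same element type on input and output, so T is passed for both of the launcher's type parameters. A hedged standalone sketch of that same-type binary case, with hypothetical names (BinaryKernel, AddFunctor) in place of Paddle's launcher:

#include <cuda_runtime.h>
#include <cstdio>

struct AddFunctor {  // same shape as CudaAddFunctor<T> in the diff, T = float
  __device__ __forceinline__ float operator()(const float *args) const {
    return args[0] + args[1];
  }
};

template <typename InT, typename OutT, typename Functor>
__global__ void BinaryKernel(const InT *a, const InT *b, OutT *out, int n,
                             Functor func) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) {
    InT args[2] = {a[i], b[i]};  // gather both operands, as load_scalar does
    out[i] = func(args);         // here InT == OutT == float
  }
}

int main() {
  const int n = 4;
  float ha[n] = {1, 2, 3, 4}, hb[n] = {10, 20, 30, 40}, hc[n];
  float *da, *db, *dc;
  cudaMalloc(&da, n * sizeof(float));
  cudaMalloc(&db, n * sizeof(float));
  cudaMalloc(&dc, n * sizeof(float));
  cudaMemcpy(da, ha, n * sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(db, hb, n * sizeof(float), cudaMemcpyHostToDevice);
  BinaryKernel<<<1, 32>>>(da, db, dc, n, AddFunctor{});
  cudaMemcpy(hc, dc, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g ", hc[i]);  // 11 22 33 44
  printf("\n");
  cudaFree(da);
  cudaFree(db);
  cudaFree(dc);
  return 0;
}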
paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h
@@ -49,69 +49,73 @@ int GetVectorizedSizeImpl(const T *pointer) {
   return 1;
 }

-template <typename T>
+template <typename InT, typename OutT>
 int GetVectorizedSize(const std::vector<const framework::Tensor *> &ins,
                       const std::vector<framework::Tensor *> &outs) {
   int vec_size = 4;
   for (auto iter = ins.begin(); iter != ins.end(); ++iter) {
     vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<InT>()));
   }
   for (auto iter = outs.begin(); iter != outs.end(); ++iter) {
     vec_size =
-        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<T>()));
+        std::min<int>(vec_size, GetVectorizedSizeImpl((*iter)->data<OutT>()));
   }
   return vec_size;
 }
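GetVectorizedSize now has to consider two element types: every input pointer is checked at InT granularity and every output pointer at OutT granularity, and the safe vector width is the minimum over all of them. A simplified, hedged sketch of that rule (the surrounding header defines the real GetVectorizedSizeImpl; VectorizedSizeOf below is a hypothetical stand-in that assumes plain sizeof-based alignment rather than CudaAlignedVector's):

#include <algorithm>
#include <cstdint>
#include <cstdio>

template <typename T>
int VectorizedSizeOf(const T *pointer) {
  uint64_t address = reinterpret_cast<uint64_t>(pointer);
  if (address % (4 * sizeof(T)) == 0) return 4;  // a 4-lane vector load is safe
  if (address % (2 * sizeof(T)) == 0) return 2;  // only a 2-lane load is safe
  return 1;                                      // scalar fallback
}

int main() {
  alignas(32) double in[8];  // InT = double: checked at 8-byte granularity
  alignas(32) float out[8];  // OutT = float: checked at 4-byte granularity
  int vec_size = 4;
  vec_size = std::min(vec_size, VectorizedSizeOf(in));       // input as InT
  vec_size = std::min(vec_size, VectorizedSizeOf(out + 2));  // output as OutT
  printf("vec_size = %d\n", vec_size);  // out + 2 is only 8-byte aligned -> 2
  return 0;
}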
-template <ElementwiseType ET, int VecSize, typename T>
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT>
 struct ElementwiseDataWrapper {
-  T *out;
-  const T *in0;
-  const T *in1;
-  __device__ ElementwiseDataWrapper(T *out, const T *in0,
-                                    const T *in1 = nullptr)
+  OutT *out;
+  const InT *in0;
+  const InT *in1;
+  __device__ ElementwiseDataWrapper(OutT *out, const InT *in0,
+                                    const InT *in1 = nullptr)
       : out(out), in0(in0), in1(in1) {}

-  using VecType = CudaAlignedVector<T, VecSize>;
+  using InVecType = CudaAlignedVector<InT, VecSize>;
+  using OutVecType = CudaAlignedVector<OutT, VecSize>;

-  inline __device__ void load_vector(VecType args[], int idx) {
-    const VecType *x_vec = reinterpret_cast<const VecType *>(in0);
+  inline __device__ void load_vector(InVecType args[], int idx) {
+    const InVecType *x_vec = reinterpret_cast<const InVecType *>(in0);
     args[0] = x_vec[idx];
     if (ET == ElementwiseType::kBinary) {
-      const VecType *y_vec = reinterpret_cast<const VecType *>(in1);
+      const InVecType *y_vec = reinterpret_cast<const InVecType *>(in1);
       args[1] = y_vec[idx];
     }
   }

-  inline __device__ void load_scalar(T args[], int idx) {
+  inline __device__ void load_scalar(InT args[], int idx) {
     args[0] = in0[idx];
     if (ET == ElementwiseType::kBinary) {
       args[1] = in1[idx];
     }
   }

-  inline __device__ void store_vector(VecType res, int idx) {
-    VecType *out_vec = reinterpret_cast<VecType *>(out);
+  inline __device__ void store_vector(OutVecType res, int idx) {
+    OutVecType *out_vec = reinterpret_cast<OutVecType *>(out);
     out_vec[idx] = res;
   }

-  inline __device__ void store_scalar(T res, int idx) { out[idx] = res; }
+  inline __device__ void store_scalar(OutT res, int idx) { out[idx] = res; }
 };

-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
 __device__ void VectorizedKernelImpl(
-    ElementwiseDataWrapper<ET, VecSize, T> data, Functor func, int tid) {
-  using VecType = CudaAlignedVector<T, VecSize>;
-  VecType ins_vec[ET];
-  VecType out_vec;
-  T *ins_ptr[ET];
-  T *out_ptr;
+    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+    int tid) {
+  using InVecType = CudaAlignedVector<InT, VecSize>;
+  using OutVecType = CudaAlignedVector<OutT, VecSize>;
+  InVecType ins_vec[ET];
+  OutVecType out_vec;
+  InT *ins_ptr[ET];
+  OutT *out_ptr;
 #pragma unroll
   for (int i = 0; i < ET; ++i) {
-    ins_ptr[i] = reinterpret_cast<T *>(&(ins_vec[i]));
+    ins_ptr[i] = reinterpret_cast<InT *>(&(ins_vec[i]));
   }
-  out_ptr = reinterpret_cast<T *>(&out_vec);
+  out_ptr = reinterpret_cast<OutT *>(&out_vec);
   // load
   data.load_vector(ins_vec, tid);
@@ -119,7 +123,7 @@ __device__ void VectorizedKernelImpl(
   // compute
 #pragma unroll
   for (int i = 0; i < VecSize; ++i) {
-    T ins[ET];
+    InT ins[ET];
 #pragma unroll
     for (int j = 0; j < ET; ++j) {
       ins[j] = ins_ptr[j][i];
@@ -131,11 +135,13 @@ __device__ void VectorizedKernelImpl(
   data.store_vector(out_vec, tid);
 }

-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
-__device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
-                                 Functor func, int start, int remain) {
-  T ins[ET];
-  T out;
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
+__device__ void ScalarKernelImpl(
+    ElementwiseDataWrapper<ET, VecSize, InT, OutT> data, Functor func,
+    int start, int remain) {
+  InT ins[ET];
+  OutT out;
   for (int i = 0; i < remain; ++i) {
     int idx = start + i;
@@ -148,14 +154,15 @@ __device__ void ScalarKernelImpl(ElementwiseDataWrapper<ET, VecSize, T> data,
   }
 }

-template <ElementwiseType ET, int VecSize, typename T, typename Functor>
-__global__ void VectorizedKernel(const T *__restrict__ in0,
-                                 const T *__restrict__ in1, T *out, int size,
-                                 Functor func) {
+template <ElementwiseType ET, int VecSize, typename InT, typename OutT,
+          typename Functor>
+__global__ void VectorizedKernel(const InT *__restrict__ in0,
+                                 const InT *__restrict__ in1, OutT *out,
+                                 int size, Functor func) {
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = size - VecSize * tid;
   remain = remain > 0 ? remain : 0;
-  auto data = ElementwiseDataWrapper<ET, VecSize, T>(out, in0, in1);
+  auto data = ElementwiseDataWrapper<ET, VecSize, InT, OutT>(out, in0, in1);
   if (remain >= VecSize) {
     VectorizedKernelImpl(data, func, tid);
   } else {
@@ -163,30 +170,31 @@ __global__ void VectorizedKernel(const T *__restrict__ in0,
   }
 }

-template <ElementwiseType ET, typename T, typename Functor>
-__global__ void ScalarKernel(const T *__restrict__ in0,
-                             const T *__restrict__ in1, T *out, int size,
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
+__global__ void ScalarKernel(const InT *__restrict__ in0,
+                             const InT *__restrict__ in1, OutT *out, int size,
                              Functor func) {
-  auto data = ElementwiseDataWrapper<ET, 1, T>(out, in0, in1);
+  auto data = ElementwiseDataWrapper<ET, 1, InT, OutT>(out, in0, in1);
   int tid = blockIdx.x * blockDim.x + threadIdx.x;
   int remain = tid < size ? 1 : 0;
   ScalarKernelImpl(data, func, tid, remain);
 }

-template <ElementwiseType ET, typename T, typename Functor>
+template <ElementwiseType ET, typename InT, typename OutT, typename Functor>
 void LaunchElementwiseCudaKernel(
     const platform::CUDADeviceContext &ctx,
     const std::vector<const framework::Tensor *> &ins,
     std::vector<framework::Tensor *> *outs, Functor func) {
   // calculate the max vec_size for all ins and outs
   auto size = ins[0]->numel();
-  int vec_size = GetVectorizedSize<T>(ins, *outs);
+  int vec_size = GetVectorizedSize<InT, OutT>(ins, *outs);
   int block_size = ELEMENTWISE_BLOCK_SIZE;
   int grid_size =
       ((size + vec_size - 1) / vec_size + block_size - 1) / block_size;
-  const T *in0 = ins[0]->data<T>();
-  const T *in1 =
-      (ET == ElementwiseType::kBinary) ? ins[1]->data<T>() : nullptr;
-  T *out = (*outs)[0]->data<T>();
+  const InT *in0 = ins[0]->data<InT>();
+  const InT *in1 =
+      (ET == ElementwiseType::kBinary) ? ins[1]->data<InT>() : nullptr;
+  OutT *out = (*outs)[0]->data<OutT>();
   // cuda kernel
   auto stream = ctx.stream();
   switch (vec_size) {
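Putting the pieces together, the header now threads InT and OutT through the data wrapper, both kernels, and the launcher. The standalone sketch below shows the whole pattern end to end: vector loads of InT, a per-lane functor, vector stores of OutT, and a scalar tail for the remainder, much like ScalarKernelImpl handles. It is plain CUDA under stated assumptions, not the header's real API; AlignedVector, CastFunctor, and VectorizedUnary are hypothetical names.

#include <cuda_runtime.h>
#include <cstdio>

template <typename T, int VecSize>
struct alignas(sizeof(T) * VecSize) AlignedVector {  // stand-in for CudaAlignedVector
  T val[VecSize];
};

struct CastFunctor {  // hypothetical: reads double (InT), produces float (OutT)
  __device__ __forceinline__ float operator()(const double *args) const {
    return static_cast<float>(args[0]);
  }
};

template <int VecSize, typename InT, typename OutT, typename Functor>
__global__ void VectorizedUnary(const InT *__restrict__ in, OutT *out,
                                int size, Functor func) {
  using InVecType = AlignedVector<InT, VecSize>;
  using OutVecType = AlignedVector<OutT, VecSize>;
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (VecSize * (tid + 1) <= size) {
    // full vector: one aligned load of InT lanes, one aligned store of OutT
    InVecType ins_vec = reinterpret_cast<const InVecType *>(in)[tid];
    OutVecType out_vec;
#pragma unroll
    for (int i = 0; i < VecSize; ++i) out_vec.val[i] = func(&ins_vec.val[i]);
    reinterpret_cast<OutVecType *>(out)[tid] = out_vec;
  } else {
    // scalar tail for the trailing elements, as in the header's scalar path
    for (int i = VecSize * tid; i < size; ++i) out[i] = func(&in[i]);
  }
}

int main() {
  const int n = 10;  // not a multiple of VecSize, so the tail path also runs
  double h_in[n];
  float h_out[n];
  for (int i = 0; i < n; ++i) h_in[i] = 1.5 * i;
  double *d_in;
  float *d_out;
  cudaMalloc(&d_in, n * sizeof(double));
  cudaMalloc(&d_out, n * sizeof(float));
  cudaMemcpy(d_in, h_in, n * sizeof(double), cudaMemcpyHostToDevice);
  constexpr int kVecSize = 4;
  int threads = 128;
  int blocks = ((n + kVecSize - 1) / kVecSize + threads - 1) / threads;
  VectorizedUnary<kVecSize><<<blocks, threads>>>(d_in, d_out, n, CastFunctor{});
  cudaMemcpy(h_out, d_out, n * sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g ", h_out[i]);  // 0 1.5 3 ... 13.5
  printf("\n");
  cudaFree(d_in);
  cudaFree(d_out);
  return 0;
}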