Commit b035c8b0
Authored on May 14, 2021 by limingshu
Committed via GitHub on May 14, 2021

Optimization the broadcast performance of elementwise_add (#32512)

Parent: 096b2f5a

Showing 6 changed files with 567 additions and 1 deletion (+567 -1)
Changed files:
  paddle/fluid/operators/elementwise/elementwise_add_op.cc                 +9    -0
  paddle/fluid/operators/elementwise/elementwise_add_op.cu                 +15   -0
  paddle/fluid/operators/elementwise/elementwise_add_op.h                  +11   -1
  paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h         +468  -0
  paddle/fluid/operators/elementwise/elementwise_op_broadcast_impl.cu.h    +63   -0
  paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h              +1    -0
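For orientation before the diffs: the new kernels implement NumPy-style broadcasting for elementwise addition, where any input dimension of size 1 is reused across the corresponding output dimension. The following is a minimal CPU-only sketch of that semantic, written for this summary; the helper name and shapes are illustrative, not Paddle code.

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative only: broadcast-add two 2-D arrays whose dimensions are either
// equal to the output dimension or 1.
std::vector<float> BroadcastAdd2D(const std::vector<float>& x, int64_t xh, int64_t xw,
                                  const std::vector<float>& y, int64_t yh, int64_t yw,
                                  int64_t oh, int64_t ow) {
  std::vector<float> out(oh * ow);
  for (int64_t i = 0; i < oh; ++i) {
    for (int64_t j = 0; j < ow; ++j) {
      // A size-1 dimension contributes stride 0, i.e. the same element is reused.
      float xv = x[(xh == 1 ? 0 : i) * xw + (xw == 1 ? 0 : j)];
      float yv = y[(yh == 1 ? 0 : i) * yw + (yw == 1 ? 0 : j)];
      out[i * ow + j] = xv + yv;
    }
  }
  return out;
}

int main() {
  // x: [2, 1], y: [1, 3]  ->  out: [2, 3]
  std::vector<float> x = {1.f, 2.f};
  std::vector<float> y = {10.f, 20.f, 30.f};
  auto out = BroadcastAdd2D(x, 2, 1, y, 1, 3, 2, 3);
  for (float v : out) std::cout << v << " ";  // 11 21 31 12 22 32
  std::cout << std::endl;
}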
paddle/fluid/operators/elementwise/elementwise_add_op.cc

@@ -69,6 +69,15 @@ struct SameDimsElemwiseAdd<
  }
};

template <typename T>
struct BroadcastElemwiseAdd<platform::CPUDeviceContext, T> {
  void operator()(const framework::ExecutionContext &ctx,
                  const framework::Tensor *x, const framework::Tensor *y,
                  framework::Tensor *z) {
    default_elementwise_add<platform::CPUDeviceContext, T>(ctx, x, y, z);
  }
};

class ElementwiseAddOpMaker : public ElementwiseOpMaker {
 protected:
  std::string GetName() const override { return "Add"; }
paddle/fluid/operators/elementwise/elementwise_add_op.cu

@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/complex128.h"
#include "paddle/fluid/platform/complex64.h"

@@ -51,6 +52,20 @@ struct SameDimsElemwiseAdd<platform::CUDADeviceContext, T> {
  }
};

template <typename T>
struct BroadcastElemwiseAdd<platform::CUDADeviceContext, T> {
  void operator()(const framework::ExecutionContext &ctx,
                  const framework::Tensor *x, const framework::Tensor *y,
                  framework::Tensor *out) {
    std::vector<const framework::Tensor *> ins = {x, y};
    int axis = ctx.Attr<int>("axis");
    axis = axis == -1 ? std::abs(x->dims().size() - y->dims().size()) : axis;

    LaunchBroadcastElementwiseCudaKernel<ElementwiseType::kBinary, T>(
        ctx.template device_context<platform::CUDADeviceContext>(), ins, out,
        CudaAddFunctor<T>(), axis);
  }
};

template <typename T>
static __global__ void SimpleElemwiseAddGradCUDAKernel(
    const T *__restrict__ dout, int size, int vec_size, T *dx, T *dy) {
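A note on the axis handling above: when the op attribute "axis" is -1, BroadcastElemwiseAdd falls back to the absolute difference of the input ranks, which aligns the lower-rank tensor with the trailing dimensions of the higher-rank one. A small host-side check of that rule (the shapes in the comments are illustrative):

#include <cassert>
#include <cstdlib>

// Same rule as in BroadcastElemwiseAdd<CUDADeviceContext, T>::operator():
// axis == -1 means "align the smaller tensor with the trailing dimensions".
static int ResolveAxis(int axis, int x_rank, int y_rank) {
  return axis == -1 ? std::abs(x_rank - y_rank) : axis;
}

int main() {
  assert(ResolveAxis(-1, 4, 2) == 2);  // x: [N, C, H, W], y: [H, W] -> axis 2
  assert(ResolveAxis(-1, 3, 3) == 0);  // equal ranks -> axis 0
  assert(ResolveAxis(1, 4, 2) == 1);   // an explicit axis is kept as-is
  return 0;
}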
paddle/fluid/operators/elementwise/elementwise_add_op.h

@@ -20,11 +20,13 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/math_function.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#ifdef __NVCC__
#include <cuda.h>
#include <cuda_fp16.h>
#include "cub/cub.cuh"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#endif
#ifdef __HIPCC__
#include <hip/hip_fp16.h>

@@ -60,6 +62,13 @@ struct SameDimsElemwiseAdd {
                  framework::Tensor *z);
};

template <typename DeviceContext, typename T, class Enable = void>
struct BroadcastElemwiseAdd {
  void operator()(const framework::ExecutionContext &ctx,
                  const framework::Tensor *x, const framework::Tensor *y,
                  framework::Tensor *z);
};

template <typename DeviceContext, typename T>
class ElementwiseAddKernel : public framework::OpKernel<T> {
 public:

@@ -73,7 +82,8 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
      SameDimsElemwiseAdd<DeviceContext, T> same_dims_add;
      same_dims_add(ctx, x, y, z);
    } else {
-     default_elementwise_add<DeviceContext, T>(ctx, x, y, z);
+     BroadcastElemwiseAdd<DeviceContext, T> broadcast_add;
+     broadcast_add(ctx, x, y, z);
    }
  }
};
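With this change, ElementwiseAddKernel routes every add through one of two functors: SameDimsElemwiseAdd when the shapes match exactly, and BroadcastElemwiseAdd otherwise (the old direct call to default_elementwise_add survives only as the CPU implementation of the broadcast functor). A shape-only sketch of that dispatch decision, written for this summary rather than taken from Paddle:

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative mirror of ElementwiseAddKernel's dispatch: identical shapes take
// the same-dims fast path, anything else takes the broadcast path.
static const char* ChooseAddPath(const std::vector<int64_t>& x_dims,
                                 const std::vector<int64_t>& y_dims) {
  return x_dims == y_dims ? "SameDimsElemwiseAdd" : "BroadcastElemwiseAdd";
}

int main() {
  std::cout << ChooseAddPath({32, 128}, {32, 128}) << "\n";  // SameDimsElemwiseAdd
  std::cout << ChooseAddPath({32, 128}, {1, 128}) << "\n";   // BroadcastElemwiseAdd
  std::cout << ChooseAddPath({8, 16, 32}, {32}) << "\n";     // BroadcastElemwiseAdd
}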
paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h
new file (mode 100644)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast_impl.cu.h"

namespace paddle {
namespace operators {
struct DimensionsTransform {
  using DimVector = std::vector<int64_t>;
  typedef void (*MergeFunctor)(bool &, std::vector<DimVector> &, DimVector &,
                               int, int);

  int64_t dim_size;
  DimVector out_dims;
  std::vector<DimVector> in_dims;

 private:
  // 1. To compensate for the missing dimensions of the input tensors.
  void InputDimensionsExtend(int N, int axis) {
    for (auto &in_dim : in_dims) {
      int64_t in_idx = 0;
      if (in_dim.size() < dim_size) {
        DimVector tmp_dim(dim_size, 1);
        do {
          if (in_dim[in_idx] == out_dims[axis] || in_dim[in_idx] == 1) {
            tmp_dim[axis] = in_dim[in_idx];
            in_idx++;
            axis++;
          } else {
            PADDLE_THROW(platform::errors::InvalidArgument(
                "The %dth dimension of input tensor is expected to be equal "
                "with the %dth dimension of output tensor %d or 1, but "
                "received %d.\n",
                in_idx + 1, axis + 1, out_dims[axis], in_dim[in_idx]));
          }
        } while (in_idx < in_dim.size());
        in_dim.resize(dim_size);
        std::copy(tmp_dim.begin(), tmp_dim.end(), in_dim.begin());
      } else {
        do {
          if (in_dim[in_idx] == out_dims[in_idx] || in_dim[in_idx] == 1) {
            in_idx++;
          } else {
            PADDLE_THROW(platform::errors::InvalidArgument(
                "The %dth dimension of input tensor is expected to be equal "
                "with the %dth dimension of output tensor %d or 1, but "
                "received %d.\n",
                in_idx + 1, in_idx + 1, out_dims[in_idx], in_dim[in_idx]));
          }
        } while (in_idx < dim_size);
      }
      std::reverse(in_dim.begin(), in_dim.end());
    }
    std::reverse(out_dims.begin(), out_dims.end());
  }
  template <typename MergeFunctor>
  __inline__ void DimensionsReorganise(MergeFunctor merge_func, int N) {
    auto VectorReorganise = [](DimVector *vec, int l_idx, int m_idx) {
      (*vec)[m_idx - 1] =
          std::accumulate(vec->begin() + l_idx, vec->begin() + m_idx, 1,
                          std::multiplies<int64_t>());
      vec->erase(vec->begin() + l_idx, vec->begin() + m_idx - 1);
    };

    int64_t i = 0;
    while (i < dim_size) {
      int cnt = 0;
      int low_idx = i;
      bool equal = true;
      do {
        merge_func(equal, in_dims, out_dims, i, N);
        if (equal) {
          i++;
          cnt++;
        } else {
          break;
        }
      } while (i < dim_size);

      if (cnt > 1) {
        for (auto &in_dim : in_dims) {
          VectorReorganise(&in_dim, low_idx, i);
        }
        VectorReorganise(&out_dims, low_idx, i);
        dim_size -= --cnt;
        i -= cnt;
      } else if (cnt < 1) {
        i++;
      }
    }
  }
 public:
  explicit DimensionsTransform(const std::vector<const framework::Tensor *> &ins,
                               const framework::DDim &dims, int axis) {
    const int N = ins.size();
    dim_size = dims.size();
    out_dims = framework::vectorize<int64_t>(dims);
    in_dims.resize(N);
    for (int j = 0; j < N; ++j) {
      in_dims[j] = framework::vectorize<int64_t>(ins[j]->dims());
    }
    InputDimensionsExtend(N, axis);

    auto merge_sequential_dims = [](bool &equal,
                                    std::vector<DimVector> &in_dims,
                                    DimVector &out, int i, int num) {
      for (int j = 1; j < num; ++j) {
        equal = (in_dims[0][i] == in_dims[j][i]) ? true : false;
      }
    };
    auto merge_sequential_one_dims = [](bool &equal,
                                        std::vector<DimVector> &in_dims,
                                        DimVector &out, int i, int num) {
      equal = in_dims[0][i] == 1;
      if (equal) {
        for (int j = 1; j < num; ++j) {
          equal = in_dims[j][i] == out[i];
        }
      }
    };
    // Merge the dimensions of the input tensors when consecutive
    // equal dimensions appear.
    MergeFunctor merge_ptr = merge_sequential_dims;
    DimensionsReorganise<MergeFunctor>(merge_ptr, N);

    int min_idx = 0;
    int min_val = std::accumulate(in_dims[0].begin(), in_dims[0].end(), 1,
                                  std::multiplies<int64_t>());
    for (int j = 1; j < N; ++j) {
      int temp = std::accumulate(in_dims[j].begin(), in_dims[j].end(), 1,
                                 std::multiplies<int64_t>());
      min_val = min_val > temp ? temp : min_val;
      min_idx = min_val == temp ? j : min_idx;
    }
    std::swap(in_dims[0], in_dims[min_idx]);

    // Merge the dimensions of the input tensors when consecutive
    // 1-value dimensions appear.
    merge_ptr = merge_sequential_one_dims;
    DimensionsReorganise<MergeFunctor>(merge_ptr, N);
    std::swap(in_dims[min_idx], in_dims[0]);
  }
};
struct CalculateInputStrides {
  std::vector<std::vector<uint32_t>> strides;
  std::vector<FastDivMod> divmoders;

 private:
  // To calculate the strides of each input_tensor.
  __inline__ void CalculateStrides(
      int N, int dim_size, const std::vector<std::vector<int64_t>> &in_dims) {
    for (int j = 0; j < N; ++j) {
      for (int i = 0; i < dim_size; ++i) {
        strides[j][i] = in_dims[j][i] == 1 ? 0 : strides[j][i];
        strides[j][i] =
            (i != 0 && strides[j][i] != 0)
                ? std::accumulate(in_dims[j].begin(), in_dims[j].begin() + i,
                                  1, std::multiplies<int64_t>())
                : strides[j][i];
      }
    }
  }

 public:
  explicit CalculateInputStrides(
      const int64_t &dim_size,
      const std::vector<std::vector<int64_t>> &in_dims,
      const std::vector<int64_t> &out_dims) {
    const auto N = in_dims.size();
    divmoders.resize(dim_size);
    strides.resize(N, std::vector<uint32_t>(dim_size, 1));

    for (int i = 0; i < dim_size; ++i) {
      divmoders[i] = FastDivMod(out_dims[i]);
    }
    CalculateStrides(N, dim_size, in_dims);
  }
};
template <typename T, ElementwiseType ET, int VecSize, int kDims>
struct BroadcastArgsWarpper {
  using DimsVec = CudaAlignedVector<T, VecSize>;

  T *out_data;
  const T *__restrict__ in_data[ET];
  uint32_t strides[ET][framework::DDim::kMaxRank];
  bool no_broadcast[ET];
  FastDivMod divmoders[kDims];
  uint32_t scalar_offset;

  HOSTDEVICE BroadcastArgsWarpper(
      const std::vector<const framework::Tensor *> &ins,
      const CalculateInputStrides &offset_calculator, framework::Tensor *out,
      int scalar_offset)
      : scalar_offset(scalar_offset) {
    for (int j = 0; j < ET; ++j) {
      in_data[j] = ins[j]->data<T>();
      no_broadcast[j] = ins[j]->dims() == out->dims() ? true : false;
      memcpy(strides[j], offset_calculator.strides[j].data(),
             kDims * sizeof(uint32_t));
    }
    out_data = out->data<T>();
    memcpy(divmoders, offset_calculator.divmoders.data(),
           kDims * sizeof(FastDivMod));
  }

  __device__ __forceinline__ uint32_t GetDivmodOffset(int idx, int in_idx) {
    uint32_t offset = 0;

#pragma unroll(kDims)
    for (int i = 0; i < kDims; ++i) {
      auto fast_divmoder = divmoders[i].Divmod(idx);
      idx = fast_divmoder.val[0];
      offset += fast_divmoder.val[1] * strides[in_idx][i];
    }
    return offset;
  }

  __device__ __forceinline__ void CommonVector(DimsVec args[], int tid,
                                               int idx) {
    const DimsVec *__restrict__ vec_data =
        reinterpret_cast<const DimsVec *__restrict__>(in_data[idx]);
    args[idx] = vec_data[tid];
  }

  __device__ __forceinline__ void DivmodVector(DimsVec args[], int tid,
                                               int idx) {
    int index = tid * VecSize;

    for (int i = 0; i < VecSize; ++i) {
      uint32_t offset = GetDivmodOffset(index + i, idx);
      args[idx].val[i] = in_data[idx][offset];
    }
  }

  __device__ __forceinline__ void CommonScalar(T args[], int tid, int idx) {
    args[idx] = in_data[idx][tid + scalar_offset];
  }

  __device__ __forceinline__ void DivmodScalar(T args[], int tid, int idx) {
    auto offset = GetDivmodOffset(tid + scalar_offset, idx);
    args[idx] = in_data[idx][offset];
  }

  __device__ __forceinline__ void LoadVector(DimsVec args[], int tid) {
#pragma unroll(ET)
    for (int j = 0; j < ET; ++j) {
      if (no_broadcast[j]) {
        CommonVector(args, tid, j);
      } else {
        DivmodVector(args, tid, j);
      }
    }
  }

  __device__ __forceinline__ void LoadScalar(T args[], int tid) {
#pragma unroll(ET)
    for (int j = 0; j < ET; ++j) {
      if (no_broadcast[j]) {
        CommonScalar(args, tid, j);
      } else {
        DivmodScalar(args, tid, j);
      }
    }
  }

  __device__ __forceinline__ void StoreVector(DimsVec args[], int tid) {
    DimsVec *vec_out = reinterpret_cast<DimsVec *>(out_data);
    vec_out[tid] = args[0];
  }

  __device__ __forceinline__ void StoreScalar(T args[], int tid) {
    out_data[scalar_offset + tid] = args[0];
  }
};
template <typename T, typename BroadcastArgsWarpper, ElementwiseType ET>
__device__ inline void ScalarizedBroadcastKernelImpl(
    BroadcastArgsWarpper data_transfer, int tid) {
  T args[ET];
  data_transfer.LoadScalar(args, tid);

#pragma unroll(ET)
  for (int j = 1; j < ET; ++j) {
    args[0] += args[j];
  }
  data_transfer.StoreScalar(args, tid);
}
template <typename T, typename BroadcastArgsWarpper, ElementwiseType ET,
          int VecSize>
__device__ inline void VectorizedBroadcastKernelImpl(
    BroadcastArgsWarpper data_transfer, int tid) {
  using VecT = CudaAlignedVector<T, VecSize>;
  VecT args[ET];
  data_transfer.LoadVector(args, tid);

#pragma unroll(ET)
  for (int j = 1; j < ET; ++j) {
#pragma unroll(VecSize)
    for (int i = 0; i < VecSize; ++i) {
      args[0].val[i] += args[j].val[i];
    }
  }
  data_transfer.StoreVector(args, tid);
}
template <typename T, typename BroadcastArgsWarpper, ElementwiseType ET,
          int VecSize>
__global__ void ElementwiseBroadcastKernel(BroadcastArgsWarpper data_transfer,
                                           int main_tid, int tail_tid) {
  int tid = threadIdx.x + blockIdx.x * blockDim.x;

  // Vectorized calculation for the main part of the data, whose length is the
  // largest multiple of VecSize.
  if (tid < main_tid) {
    VectorizedBroadcastKernelImpl<T, BroadcastArgsWarpper, ET, VecSize>(
        data_transfer, tid);
  }
  // Scalar calculation for the remaining data, whose length cannot fill a
  // whole VecSize.
  if (tid < tail_tid) {
    ScalarizedBroadcastKernelImpl<T, BroadcastArgsWarpper, ET>(data_transfer,
                                                               tid);
  }
}
template <typename T, ElementwiseType ET, int VecSize = 1>
void LaunchBroadcastKernelForDifferentDimSize(
    const platform::CUDADeviceContext &ctx,
    const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
    int axis) {
  int numel = out->numel();
  const int threads = 256;
  int blocks = ((numel + VecSize - 1) / VecSize + threads - 1) / threads;
  int main_tid = numel / VecSize;
  int tail_tid = numel % VecSize;
  int vec_len = main_tid * VecSize;
  auto stream = ctx.stream();

  const auto merge_dims = DimensionsTransform(ins, out->dims(), axis);
  const auto offset_calculator = CalculateInputStrides(
      merge_dims.dim_size, merge_dims.in_dims, merge_dims.out_dims);

  switch (merge_dims.dim_size) {
    case 1: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 1>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    case 2: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 2>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    case 3: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 3>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    case 4: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 4>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    case 5: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 5>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    case 6: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 6>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    case 7: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 7>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    case 8: {
      auto data_transfer = BroadcastArgsWarpper<T, ET, VecSize, 8>(
          ins, offset_calculator, out, vec_len);
      ElementwiseBroadcastKernel<T, decltype(data_transfer), ET,
                                 VecSize><<<blocks, threads, 0, stream>>>(
          data_transfer, main_tid, tail_tid);
      break;
    }
    default: {
      PADDLE_THROW(platform::errors::InvalidArgument(
          "The maximum dimension of input tensor is expected to be less than "
          "%d, but received %d.\n",
          merge_dims.dim_size, framework::DDim::kMaxRank));
    }
  }
}
template <ElementwiseType ET, typename T, typename Functor>
void LaunchBroadcastElementwiseCudaKernel(
    const platform::CUDADeviceContext &ctx,
    const std::vector<const framework::Tensor *> &ins, framework::Tensor *out,
    Functor func, int axis) {
  int in_vec_size = 4;
  for (auto *in : ins) {
    auto temp_size = GetVectorizedSizeImpl<T>(in->data<T>());
    in_vec_size = in->dims() == out->dims() ? std::min(temp_size, in_vec_size)
                                            : in_vec_size;
  }
  int out_vec_size = GetVectorizedSizeImpl<T>(out->data<T>());
  int vec_size = std::min(out_vec_size, in_vec_size);

  switch (vec_size) {
    case 4: {
      LaunchBroadcastKernelForDifferentDimSize<T, ET, 4>(ctx, ins, out, axis);
      break;
    }
    case 2: {
      LaunchBroadcastKernelForDifferentDimSize<T, ET, 2>(ctx, ins, out, axis);
      break;
    }
    default: {
      LaunchBroadcastKernelForDifferentDimSize<T, ET, 1>(ctx, ins, out, axis);
      break;
    }
  }
}

}  // namespace operators
}  // namespace paddle
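To make the mechanics of this new header concrete: DimensionsTransform first pads and merges the input shapes, CalculateInputStrides then records a stride table per input (0 for broadcast dimensions) plus a FastDivMod per output dimension, and BroadcastArgsWarpper::GetDivmodOffset decomposes each output index with those divmods to find the corresponding input element. The following is a simplified host-side sketch of that last step only, using plain / and % on unmerged row-major shapes; the shapes and the helper name are illustrative, not Paddle code.

#include <cstdint>
#include <iostream>
#include <vector>

// Simplified CPU analogue of the per-thread offset computation on the GPU:
// broadcast (size-1) dimensions get stride 0, and an output index is
// decomposed dimension by dimension (FastDivMod on the device) to locate the
// matching input element.
int64_t InputOffset(int64_t out_idx, const std::vector<int64_t>& out_dims,
                    const std::vector<int64_t>& in_dims) {
  const int rank = static_cast<int>(out_dims.size());
  // Row-major strides of the input, with 0 wherever the input dimension is 1.
  std::vector<int64_t> strides(rank, 1);
  for (int i = rank - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * in_dims[i + 1];
  }
  for (int i = 0; i < rank; ++i) {
    if (in_dims[i] == 1) strides[i] = 0;
  }
  int64_t offset = 0;
  for (int i = rank - 1; i >= 0; --i) {
    offset += (out_idx % out_dims[i]) * strides[i];  // the device code uses FastDivMod here
    out_idx /= out_dims[i];
  }
  return offset;
}

int main() {
  // out: [2, 3, 4]; second input broadcast over dims 0 and 2: [1, 3, 1].
  std::vector<int64_t> out_dims = {2, 3, 4};
  std::vector<int64_t> in_dims = {1, 3, 1};
  for (int64_t idx : {0, 5, 13, 23}) {
    std::cout << "out index " << idx << " -> input offset "
              << InputOffset(idx, out_dims, in_dims) << "\n";
  }
  // Expected: 0 -> 0, 5 -> 1, 13 -> 0, 23 -> 2
}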
paddle/fluid/operators/elementwise/elementwise_op_broadcast_impl.cu.h
new file (mode 100644)

/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"

#define INT_BITS 32

namespace paddle {
namespace operators {
struct FastDivMod {
  // 1st value represents the result of the input number divided by the
  // recorded divisor.
  // 2nd value represents the result of the input number modulo the recorded
  // divisor.
  using DivModT = CudaAlignedVector<uint32_t, 2>;

  FastDivMod() {}
  HOSTDEVICE FastDivMod(uint32_t d) : divisor(d) {
    static_assert(sizeof(unsigned int) == 4,
                  "Only Support 32-bit unsigned int.");

    for (shift_val = 0; shift_val < INT_BITS; ++shift_val) {
      auto shift_limit = 1 << shift_val;
      if (shift_limit >= divisor) break;
    }
    uint64_t long_one = 1;
    uint64_t temp_div =
        ((long_one << INT_BITS) * ((long_one << shift_val) - divisor)) /
            divisor +
        1;
    multiplier = temp_div;
  }

  __device__ __forceinline__ uint32_t Div(uint32_t n) const {
    uint32_t t = __umulhi(n, multiplier);
    return (t + n) >> shift_val;
  }

  __device__ __forceinline__ DivModT Divmod(uint32_t n) {
    uint32_t q = Div(n);
    DivModT result = {q, n - q * divisor};
    return result;
  }

  int32_t divisor;
  int32_t shift_val;
  uint32_t multiplier;
};

}  // namespace operators
}  // namespace paddle
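FastDivMod replaces runtime division and modulo by a precomputed multiply-high and shift, set up once per output dimension on the host. A standalone CPU check of the same magic-number scheme (emulating __umulhi with a 64-bit multiply) can be used to convince oneself that the decomposition matches ordinary / and %; this sketch is illustrative and not part of the commit.

#include <cassert>
#include <cstdint>
#include <iostream>

// Host-side reimplementation of the FastDivMod idea for verification:
// q = (high32(n * multiplier) + n) >> shift_val.
struct HostFastDivMod {
  explicit HostFastDivMod(uint32_t d) : divisor(d) {
    for (shift_val = 0; shift_val < 32; ++shift_val) {
      if ((uint64_t{1} << shift_val) >= divisor) break;
    }
    uint64_t one = 1;
    multiplier = ((one << 32) * ((one << shift_val) - divisor)) / divisor + 1;
  }
  uint32_t Div(uint32_t n) const {
    // __umulhi(n, multiplier) on the device; emulated with a 64-bit product here.
    uint32_t t = static_cast<uint32_t>((uint64_t{n} * multiplier) >> 32);
    return (t + n) >> shift_val;
  }
  uint32_t divisor;
  uint32_t shift_val;
  uint32_t multiplier;
};

int main() {
  for (uint32_t d : {3u, 7u, 128u, 1000u}) {
    HostFastDivMod dm(d);
    for (uint32_t n = 0; n < 100000; ++n) {
      assert(dm.Div(n) == n / d);
      assert(n - dm.Div(n) * d == n % d);
    }
  }
  std::cout << "fast div/mod matches / and % on the tested range\n";
}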
paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h

@@ -197,6 +197,7 @@ void LaunchElementwiseCudaKernel(
  OutT *out = (*outs)[0]->data<OutT>();
  // cuda kernel
  auto stream = ctx.stream();
  switch (vec_size) {
    case 4:
      VectorizedKernel<ET, 4><<<grid_size, block_size, 0, stream>>>(
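GetVectorizedSizeImpl, referenced by the broadcast launcher above and defined elsewhere in elementwise_op_impl.cu.h, reports how many elements of T can be loaded at once given a pointer's alignment; the launcher then picks the largest vector width that every same-shape input and the output can satisfy. A hedged sketch of the typical alignment test such a helper performs (not the actual Paddle implementation, which may differ in detail):

#include <cstdint>
#include <iostream>

// Illustrative alignment probe: returns 4, 2, or 1 depending on whether the
// pointer is aligned to 4*sizeof(T), 2*sizeof(T), or neither.
template <typename T>
int GuessVectorizedSize(const T* pointer) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(pointer);
  if (address % (4 * sizeof(T)) == 0) return 4;
  if (address % (2 * sizeof(T)) == 0) return 2;
  return 1;
}

int main() {
  alignas(16) float buffer[8];
  std::cout << GuessVectorizedSize(buffer) << "\n";      // 4: 16-byte aligned
  std::cout << GuessVectorizedSize(buffer + 1) << "\n";  // 1: only 4-byte aligned
  std::cout << GuessVectorizedSize(buffer + 2) << "\n";  // 2: 8-byte aligned
}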