Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
452bcbe2
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
452bcbe2
编写于
1月 26, 2022
作者:
Y
YuanRisheng
提交者:
GitHub
1月 26, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Pten]Move kernel_primitives lib to Pten directory (#39169)
* move kernel_primitives * use pten's errors
上级
bd5c962d
变更
11
隐藏空白更改
内联
并排
Showing 11 changed files with 578 additions and 403 deletions
+578
-403
paddle/fluid/operators/kernel_primitives/functor_primitives.h
...le/fluid/operators/kernel_primitives/functor_primitives.h
+2
-233
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
+2
-53
paddle/pten/kernels/funcs/elementwise_base.h
paddle/pten/kernels/funcs/elementwise_base.h
+2
-2
paddle/pten/kernels/gpu/reduce.h
paddle/pten/kernels/gpu/reduce.h
+2
-2
paddle/pten/kernels/primitive/compute_primitives.h
paddle/pten/kernels/primitive/compute_primitives.h
+51
-24
paddle/pten/kernels/primitive/compute_primitives_xpu2.h
paddle/pten/kernels/primitive/compute_primitives_xpu2.h
+51
-23
paddle/pten/kernels/primitive/datamover_primitives.h
paddle/pten/kernels/primitive/datamover_primitives.h
+76
-33
paddle/pten/kernels/primitive/datamover_primitives_xpu2.h
paddle/pten/kernels/primitive/datamover_primitives_xpu2.h
+57
-24
paddle/pten/kernels/primitive/functor_primitives.h
paddle/pten/kernels/primitive/functor_primitives.h
+255
-0
paddle/pten/kernels/primitive/helper_primitives.h
paddle/pten/kernels/primitive/helper_primitives.h
+11
-9
paddle/pten/kernels/primitive/kernel_primitives.h
paddle/pten/kernels/primitive/kernel_primitives.h
+69
-0
未找到文件。
paddle/fluid/operators/kernel_primitives/functor_primitives.h
浏览文件 @
452bcbe2
...
@@ -13,241 +13,10 @@
...
@@ -13,241 +13,10 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include "paddle/pten/kernels/primitive/functor_primitives.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
kernel_primitives
{
namespace
kernel_primitives
=
pten
::
kps
;
namespace
details
{
static
__device__
__forceinline__
platform
::
float16
Exp
(
platform
::
float16
x
)
{
return
::
Eigen
::
numext
::
exp
(
x
);
}
static
__device__
__forceinline__
float
Exp
(
float
x
)
{
return
expf
(
x
);
}
static
__device__
__forceinline__
double
Exp
(
double
x
)
{
return
exp
(
x
);
}
static
__device__
__forceinline__
platform
::
float16
Log
(
platform
::
float16
x
)
{
return
::
Eigen
::
numext
::
log
(
x
);
}
static
__device__
__forceinline__
float
Log
(
float
x
)
{
return
logf
(
x
);
}
static
__device__
__forceinline__
double
Log
(
double
x
)
{
return
log
(
x
);
}
}
// namespace details
/******************************** Unary Functor *******************************/
/**
* @brief Default unary exp functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
ExpFunctor
{
HOSTDEVICE
inline
ExpFunctor
()
{}
HOSTDEVICE
explicit
inline
ExpFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
details
::
Exp
(
x
));
}
};
/**
* @brief Default unary identity functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
IdentityFunctor
{
HOSTDEVICE
inline
IdentityFunctor
()
{}
HOSTDEVICE
explicit
inline
IdentityFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
x
);
}
};
/**
* @brief Default unary div functor. Divide by a constant
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
DivideFunctor
{
private:
using
MPType
=
typename
::
paddle
::
operators
::
details
::
MPTypeTrait
<
Tx
>::
Type
;
public:
HOSTDEVICE
inline
DivideFunctor
()
{
n_inv
=
static_cast
<
MPType
>
(
1.0
f
);
}
HOSTDEVICE
explicit
inline
DivideFunctor
(
int
n
)
:
n_inv
((
MPType
)(
1.0
/
n
))
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
static_cast
<
MPType
>
(
x
)
*
n_inv
);
}
private:
MPType
n_inv
;
};
/**
* @brief Default inverse functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
InverseFunctor
{
HOSTDEVICE
inline
InverseFunctor
()
{}
HOSTDEVICE
explicit
inline
InverseFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
-
x
);
}
};
/**
* @brief Default unary square functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
SquareFunctor
{
HOSTDEVICE
inline
SquareFunctor
()
{}
HOSTDEVICE
explicit
inline
SquareFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
x
)
*
static_cast
<
Ty
>
(
x
);
}
};
/****************************** Binary Functor ********************************/
/**
* @brief Default binary min functor
*/
template
<
typename
T
>
struct
MinFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
std
::
numeric_limits
<
T
>::
max
());
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
(
b
<
a
)
?
b
:
a
;
}
};
/**
* @brief Default binary max functor
*/
template
<
typename
T
>
struct
MaxFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
std
::
numeric_limits
<
T
>::
lowest
());
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
(
b
>
a
)
?
b
:
a
;
}
};
/**
* @brief Default binary add functor
*/
template
<
typename
T
>
struct
AddFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
0.0
f
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
+
a
;
}
};
/**
* @brief Default binary add functor
*/
template
<
typename
T
>
struct
MulFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
*
a
;
}
};
/**
* @brief Default binary logic or functor
*/
template
<
typename
T
>
struct
LogicalOrFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
false
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
||
a
;
}
};
/**
* @brief Default binary logic and functor
*/
template
<
typename
T
>
struct
LogicalAndFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
true
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
&&
a
;
}
};
/**
* @brief Default binary sub functor
*/
template
<
typename
T
>
struct
SubFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
0.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
a
-
b
;
}
};
/**
* @brief Default binary div functor
*/
template
<
typename
T
,
typename
Enable
=
void
>
struct
DivFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
a
/
b
;
}
};
template
<
typename
T
>
struct
DivFunctor
<
T
,
typename
std
::
enable_if
<
std
::
is_integral
<
T
>::
value
>::
type
>
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
// For int32/int64, need to check whether the divison is zero.
PADDLE_ENFORCE_NE
(
b
,
0
,
platform
::
errors
::
InvalidArgument
(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."
));
return
a
/
b
;
}
};
/**
* @brief Default binary floor divide functor
*/
template
<
typename
T
>
struct
FloorDivFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
PADDLE_ENFORCE_NE
(
b
,
0
,
platform
::
errors
::
InvalidArgument
(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."
));
return
static_cast
<
T
>
(
std
::
trunc
(
a
/
b
));
}
};
}
// namespace kernel_primitives
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
浏览文件 @
452bcbe2
...
@@ -13,61 +13,10 @@
...
@@ -13,61 +13,10 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h"
#define KPStream XPUStream
#define KPDevice paddle::platform::XPUDeviceContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
#define KPStream gpuStream_t
#define KPDevice paddle::platform::CUDADeviceContext
#define _ptr_
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
kernel_primitives
{}
namespace
kernel_primitives
=
pten
::
kps
;
}
}
}
}
paddle/pten/kernels/funcs/elementwise_base.h
浏览文件 @
452bcbe2
...
@@ -22,12 +22,12 @@ limitations under the License. */
...
@@ -22,12 +22,12 @@ limitations under the License. */
#include "paddle/pten/kernels/empty_kernel.h"
#include "paddle/pten/kernels/empty_kernel.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace
kps
=
p
addle
::
operators
::
kernel_primitive
s
;
namespace
kps
=
p
ten
::
kp
s
;
#endif
#endif
...
...
paddle/pten/kernels/gpu/reduce.h
浏览文件 @
452bcbe2
...
@@ -34,13 +34,13 @@ namespace cub = hipcub;
...
@@ -34,13 +34,13 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
...
@@ -51,7 +51,7 @@ namespace cub = hipcub;
...
@@ -51,7 +51,7 @@ namespace cub = hipcub;
#define REDUCE_SPLIT_BOUNDARY 512
#define REDUCE_SPLIT_BOUNDARY 512
#define REDUCE_VEC_SIZE 4
#define REDUCE_VEC_SIZE 4
namespace
kps
=
p
addle
::
operators
::
kernel_primitive
s
;
namespace
kps
=
p
ten
::
kp
s
;
namespace
pten
{
namespace
pten
{
namespace
kernels
{
namespace
kernels
{
...
...
paddle/
fluid/operators/kernel_primitives
/compute_primitives.h
→
paddle/
pten/kernels/primitive
/compute_primitives.h
浏览文件 @
452bcbe2
...
@@ -22,11 +22,10 @@
...
@@ -22,11 +22,10 @@
#endif
#endif
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/
fluid/platform
/float16.h"
#include "paddle/
pten/common
/float16.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
#ifdef __HIPCC__
#ifdef __HIPCC__
...
@@ -48,7 +47,7 @@ class MPTypeTrait {
...
@@ -48,7 +47,7 @@ class MPTypeTrait {
};
};
template
<
>
template
<
>
class
MPTypeTrait
<
p
latform
::
float16
>
{
class
MPTypeTrait
<
p
ten
::
dtype
::
float16
>
{
public:
public:
using
Type
=
float
;
using
Type
=
float
;
};
};
...
@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
...
@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
* in: The register pointer of in, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
...
@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
...
@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in2
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
...
@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
...
@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseTernary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseTernary
(
const
InT
*
in2
,
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
const
InT
*
in3
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
...
@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
...
@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
* ins: A pointers of array consisting of multiple inputs.
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
OpFunc
compute
)
{
OpFunc
compute
)
{
InT
args
[
Arity
];
InT
args
[
Arity
];
#pragma unroll
#pragma unroll
...
@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
...
@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in2
,
OpFunc
compute
)
{
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
#pragma unroll
#pragma unroll
...
@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
...
@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction.
* reduce_last_dim: if the last dim gets involved in reduction.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
details
::
ReduceMode
Mode
>
details
::
ReduceMode
Mode
>
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
ReduceFunctor
reducer
,
ReduceFunctor
reducer
,
bool
reduce_last_dim
)
{
bool
reduce_last_dim
)
{
int
block_index
=
blockDim
.
y
;
int
block_index
=
blockDim
.
y
;
...
@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
...
@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/
fluid/operators/kernel_primitives
/compute_primitives_xpu2.h
→
paddle/
pten/kernels/primitive
/compute_primitives_xpu2.h
浏览文件 @
452bcbe2
...
@@ -13,13 +13,13 @@
...
@@ -13,13 +13,13 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include "paddle/pten/common/float16.h"
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/math.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
// kGlobalMode: block reduce, each block gets an output;
// kGlobalMode: block reduce, each block gets an output;
...
@@ -33,7 +33,7 @@ class MPTypeTrait {
...
@@ -33,7 +33,7 @@ class MPTypeTrait {
};
};
template
<
>
template
<
>
class
MPTypeTrait
<
p
latform
::
float16
>
{
class
MPTypeTrait
<
p
ten
::
dtype
::
float16
>
{
public:
public:
using
Type
=
float
;
using
Type
=
float
;
};
};
...
@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
...
@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
* in: The register pointer of in, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
...
@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
...
@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in2
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
...
@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
...
@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseTernary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseTernary
(
const
InT
*
in2
,
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
const
InT
*
in3
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
...
@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
...
@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
* ins: A pointers of array consisting of multiple inputs.
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
OpFunc
compute
)
{
OpFunc
compute
)
{
__local__
InT
args
[
Arity
];
__local__
InT
args
[
Arity
];
#pragma unroll
#pragma unroll
...
@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
...
@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in2
,
OpFunc
compute
)
{
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
#pragma unroll
#pragma unroll
...
@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
...
@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction.
* reduce_last_dim: if the last dim gets involved in reduction.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
details
::
ReduceMode
Mode
>
details
::
ReduceMode
Mode
>
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
ReduceFunctor
reducer
,
ReduceFunctor
reducer
,
bool
reduce_last_dim
)
{
bool
reduce_last_dim
)
{
if
(
Mode
==
kGlobalMode
)
{
if
(
Mode
==
kGlobalMode
)
{
...
@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
...
@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/
fluid/operators/kernel_primitives
/datamover_primitives.h
→
paddle/
pten/kernels/primitive
/datamover_primitives.h
浏览文件 @
452bcbe2
...
@@ -22,9 +22,8 @@
...
@@ -22,9 +22,8 @@
#endif
#endif
#include "paddle/pten/core/ddim.h"
#include "paddle/pten/core/ddim.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
#define INT_BITS 32
#define INT_BITS 32
...
@@ -103,11 +102,12 @@ struct BroadcastConfig {
...
@@ -103,11 +102,12 @@ struct BroadcastConfig {
strides_in
.
resize
(
dim_size
,
1
);
strides_in
.
resize
(
dim_size
,
1
);
for
(
int
i
=
0
;
i
<
dim_size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
dim_size
;
++
i
)
{
strides_in
[
i
]
=
in_dims
[
i
]
==
1
?
0
:
strides_in
[
i
];
strides_in
[
i
]
=
in_dims
[
i
]
==
1
?
0
:
strides_in
[
i
];
strides_in
[
i
]
=
strides_in
[
i
]
=
(
i
!=
0
&&
strides_in
[
i
]
!=
0
)
(
i
!=
0
&&
strides_in
[
i
]
!=
0
)
?
std
::
accumulate
(
in_dims
.
begin
(),
?
std
::
accumulate
(
in_dims
.
begin
(),
in_dims
.
begin
()
+
i
,
1
,
in_dims
.
begin
()
+
i
,
std
::
multiplies
<
int64_t
>
())
1
,
:
strides_in
[
i
];
std
::
multiplies
<
int64_t
>
())
:
strides_in
[
i
];
}
}
memcpy
(
strides
,
strides_in
.
data
(),
kDims
*
sizeof
(
uint32_t
));
memcpy
(
strides
,
strides_in
.
data
(),
kDims
*
sizeof
(
uint32_t
));
...
@@ -144,11 +144,18 @@ struct BroadcastConfig {
...
@@ -144,11 +144,18 @@ struct BroadcastConfig {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
__device__
__forceinline__
void
ReadData
(
Ty
*
dst
,
int
size_nx
,
int
size_ny
,
const
Tx
*
__restrict__
src
,
int
stride_nx
,
int
stride_ny
)
{
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
thread_offset
=
threadIdx
.
x
;
int
thread_offset
=
threadIdx
.
x
;
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
...
@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously.
* size: The current block needs to load size data continuously.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
T
*
dst
,
const
T
*
__restrict__
src
,
__device__
__forceinline__
void
ReadData
(
T
*
dst
,
const
T
*
__restrict__
src
,
int
num
)
{
int
num
)
{
if
(
IsBoundary
)
{
// blockDim.x * NX > num
if
(
IsBoundary
)
{
// blockDim.x * NX > num
int
thread_offset
=
threadIdx
.
x
*
NX
;
int
thread_offset
=
threadIdx
.
x
*
NX
;
...
@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
...
@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
T
*
dst
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
,
int
stride_nx
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
;
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
;
uint32_t
index_src
=
0
;
uint32_t
index_src
=
0
;
...
@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc(
...
@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc(
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
* the lowest dimension.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
Tx
,
typename
IndexCal
,
typename
Functor
,
bool
IsBoundary
=
false
>
typename
Ty
,
__device__
__forceinline__
void
ReadDataReduce
(
int
NX
,
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
int
block_offset
,
int
NY
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
BlockSize
,
int
stride_ny
,
Functor
func
,
bool
reduce_last_dim
)
{
int
Rank
,
typename
IndexCal
,
typename
Functor
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataReduce
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
int
block_offset
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
,
Functor
func
,
bool
reduce_last_dim
)
{
int
thread_offset
=
0
;
int
thread_offset
=
0
;
int
left_idx
=
0
;
int
left_idx
=
0
;
if
(
reduce_last_dim
)
{
if
(
reduce_last_dim
)
{
...
@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce(
...
@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
* size: The current block needs to load size elements continuously.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
WriteData
(
T
*
dst
,
T
*
__restrict__
src
,
__device__
__forceinline__
void
WriteData
(
T
*
dst
,
T
*
__restrict__
src
,
int
num
)
{
int
num
)
{
if
(
IsBoundary
)
{
if
(
IsBoundary
)
{
int
thread_offset
=
threadIdx
.
x
*
NX
;
int
thread_offset
=
threadIdx
.
x
*
NX
;
...
@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
...
@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
WriteData
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
__device__
__forceinline__
void
WriteData
(
Ty
*
dst
,
int
size_nx
,
int
size_ny
,
const
Tx
*
__restrict__
src
,
int
stride_nx
,
int
stride_ny
)
{
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
thread_offset
=
threadIdx
.
x
;
int
thread_offset
=
threadIdx
.
x
;
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
...
@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
* total_num_output: Total number of original output.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
T
*
dst
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
)
{
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
)
{
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
*
NX
;
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
*
NX
;
uint32_t
index_src
=
0
;
uint32_t
index_src
=
0
;
...
@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc(
...
@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc(
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/
fluid/operators/kernel_primitives
/datamover_primitives_xpu2.h
→
paddle/
pten/kernels/primitive
/datamover_primitives_xpu2.h
浏览文件 @
452bcbe2
...
@@ -17,9 +17,8 @@
...
@@ -17,9 +17,8 @@
#include "xpu/kernel/debug.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/math.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
template
<
typename
T
,
int
VecSize
>
template
<
typename
T
,
int
VecSize
>
...
@@ -105,10 +104,17 @@ struct BroadcastConfig {
...
@@ -105,10 +104,17 @@ struct BroadcastConfig {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadData
(
Ty
*
dst
,
const
Tx
_global_ptr_
*
src
,
__device__
__inline__
void
ReadData
(
Ty
*
dst
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
const
Tx
_global_ptr_
*
src
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
int
thread_offset
=
core_id
();
int
thread_offset
=
core_id
();
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) {
...
@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously.
* size: The current block needs to load size data continuously.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadData
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
__device__
__inline__
void
ReadData
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
num
)
{
int
num
)
{
int
thread_offset
=
core_id
()
*
NX
;
int
thread_offset
=
core_id
()
*
NX
;
__local__
T
in_temp
[
1
];
__local__
T
in_temp
[
1
];
...
@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
...
@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
,
int
stride_nx
,
int
total_num_output
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
uint32_t
thread_offset
=
block_offset
+
core_id
();
uint32_t
thread_offset
=
block_offset
+
core_id
();
uint32_t
index_src
=
0
;
uint32_t
index_src
=
0
;
...
@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
...
@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
* the lowest dimension.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
typename
IndexCal
,
bool
IsBoundary
=
false
>
int
NX
,
__device__
__inline__
void
ReadDataReduce
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
NY
,
int
BlockSize
,
int
Rank
,
typename
IndexCal
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataReduce
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
block_offset
,
int
block_offset
,
const
IndexCal
&
index_cal
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
size_nx
,
int
stride_nx
,
int
stride_ny
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
,
bool
reduce_last_dim
)
{
bool
reduce_last_dim
)
{
__local__
Tx
in_temp
[
1
];
__local__
Tx
in_temp
[
1
];
int
thread_offset
=
0
;
int
thread_offset
=
0
;
...
@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
...
@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
WriteData
(
Ty
_global_ptr_
*
dst
,
const
Tx
*
src
,
__device__
__inline__
void
WriteData
(
Ty
_global_ptr_
*
dst
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
const
Tx
*
src
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
int
thread_offset
=
core_id
();
int
thread_offset
=
core_id
();
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
...
@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
}
}
}
}
in_temp
[
0
]
=
static_cast
<
Ty
>
(
src
[
idx
+
idy
*
NX
]);
in_temp
[
0
]
=
static_cast
<
Ty
>
(
src
[
idx
+
idy
*
NX
]);
LM2GM
(
in_temp
,
dst
+
thread_offset
+
idx
*
stride_nx
+
idy
*
stride_ny
,
LM2GM
(
in_temp
,
dst
+
thread_offset
+
idx
*
stride_nx
+
idy
*
stride_ny
,
sizeof
(
Ty
));
sizeof
(
Ty
));
}
}
}
}
...
@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
...
@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
* total_num_output: Total number of original output.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
)
{
int
total_num_output
)
{
...
@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
...
@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/pten/kernels/primitive/functor_primitives.h
0 → 100644
浏览文件 @
452bcbe2
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace
pten
{
namespace
kps
{
namespace
details
{
static
__device__
__forceinline__
pten
::
dtype
::
float16
Exp
(
pten
::
dtype
::
float16
x
)
{
return
::
Eigen
::
numext
::
exp
(
x
);
}
static
__device__
__forceinline__
float
Exp
(
float
x
)
{
return
expf
(
x
);
}
static
__device__
__forceinline__
double
Exp
(
double
x
)
{
return
exp
(
x
);
}
static
__device__
__forceinline__
pten
::
dtype
::
float16
Log
(
pten
::
dtype
::
float16
x
)
{
return
::
Eigen
::
numext
::
log
(
x
);
}
static
__device__
__forceinline__
float
Log
(
float
x
)
{
return
logf
(
x
);
}
static
__device__
__forceinline__
double
Log
(
double
x
)
{
return
log
(
x
);
}
}
// namespace details
/******************************** Unary Functor *******************************/
/**
* @brief Default unary exp functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
ExpFunctor
{
HOSTDEVICE
inline
ExpFunctor
()
{}
HOSTDEVICE
explicit
inline
ExpFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
details
::
Exp
(
x
));
}
};
/**
* @brief Default unary identity functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
IdentityFunctor
{
HOSTDEVICE
inline
IdentityFunctor
()
{}
HOSTDEVICE
explicit
inline
IdentityFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
x
);
}
};
/**
* @brief Default unary div functor. Divide by a constant
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
DivideFunctor
{
private:
using
MPType
=
typename
::
paddle
::
operators
::
details
::
MPTypeTrait
<
Tx
>::
Type
;
public:
HOSTDEVICE
inline
DivideFunctor
()
{
n_inv
=
static_cast
<
MPType
>
(
1.0
f
);
}
HOSTDEVICE
explicit
inline
DivideFunctor
(
int
n
)
:
n_inv
((
MPType
)(
1.0
/
n
))
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
static_cast
<
MPType
>
(
x
)
*
n_inv
);
}
private:
MPType
n_inv
;
};
/**
* @brief Default inverse functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
InverseFunctor
{
HOSTDEVICE
inline
InverseFunctor
()
{}
HOSTDEVICE
explicit
inline
InverseFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
-
x
);
}
};
/**
* @brief Default unary square functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
SquareFunctor
{
HOSTDEVICE
inline
SquareFunctor
()
{}
HOSTDEVICE
explicit
inline
SquareFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
x
)
*
static_cast
<
Ty
>
(
x
);
}
};
/****************************** Binary Functor ********************************/
/**
* @brief Default binary min functor
*/
template
<
typename
T
>
struct
MinFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
std
::
numeric_limits
<
T
>::
max
());
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
(
b
<
a
)
?
b
:
a
;
}
};
/**
* @brief Default binary max functor
*/
template
<
typename
T
>
struct
MaxFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
std
::
numeric_limits
<
T
>::
lowest
());
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
(
b
>
a
)
?
b
:
a
;
}
};
/**
* @brief Default binary add functor
*/
template
<
typename
T
>
struct
AddFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
0.0
f
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
+
a
;
}
};
/**
* @brief Default binary add functor
*/
template
<
typename
T
>
struct
MulFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
*
a
;
}
};
/**
* @brief Default binary logic or functor
*/
template
<
typename
T
>
struct
LogicalOrFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
false
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
||
a
;
}
};
/**
* @brief Default binary logic and functor
*/
template
<
typename
T
>
struct
LogicalAndFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
true
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
&&
a
;
}
};
/**
* @brief Default binary sub functor
*/
template
<
typename
T
>
struct
SubFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
0.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
a
-
b
;
}
};
/**
* @brief Default binary div functor
*/
template
<
typename
T
,
typename
Enable
=
void
>
struct
DivFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
a
/
b
;
}
};
template
<
typename
T
>
struct
DivFunctor
<
T
,
typename
std
::
enable_if
<
std
::
is_integral
<
T
>::
value
>::
type
>
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
// For int32/int64, need to check whether the divison is zero.
PADDLE_ENFORCE_NE
(
b
,
0
,
pten
::
errors
::
InvalidArgument
(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."
));
return
a
/
b
;
}
};
/**
* @brief Default binary floor divide functor
*/
template
<
typename
T
>
struct
FloorDivFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
PADDLE_ENFORCE_NE
(
b
,
0
,
pten
::
errors
::
InvalidArgument
(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."
));
return
static_cast
<
T
>
(
std
::
trunc
(
a
/
b
));
}
};
}
// namespace kps
}
// namespace pten
paddle/
fluid/operators/kernel_primitives
/helper_primitives.h
→
paddle/
pten/kernels/primitive
/helper_primitives.h
浏览文件 @
452bcbe2
// Copyright (c) 202
1
PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 202
2
PaddlePaddle Authors. All Rights Reserved.
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// you may not use this file except in compliance with the License.
...
@@ -14,9 +14,8 @@
...
@@ -14,9 +14,8 @@
#pragma once
#pragma once
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU2
struct
dim3
{
struct
dim3
{
...
@@ -43,8 +42,12 @@ struct DimConfig {
...
@@ -43,8 +42,12 @@ struct DimConfig {
int
rem_y
;
int
rem_y
;
int
rem_z
;
int
rem_z
;
HOSTDEVICE
explicit
inline
DimConfig
(
int
split_x
,
int
split_y
,
int
split_z
,
HOSTDEVICE
explicit
inline
DimConfig
(
int
split_x
,
int
size_x
,
int
size_y
,
int
size_z
)
{
int
split_y
,
int
split_z
,
int
size_x
,
int
size_y
,
int
size_z
)
{
split_num_x
=
split_x
;
split_num_x
=
split_x
;
split_num_y
=
split_y
;
split_num_y
=
split_y
;
split_num_z
=
split_z
;
split_num_z
=
split_z
;
...
@@ -60,6 +63,5 @@ struct DimConfig {
...
@@ -60,6 +63,5 @@ struct DimConfig {
}
}
};
};
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/pten/kernels/primitive/kernel_primitives.h
0 → 100644
浏览文件 @
452bcbe2
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/kernels/primitive/helper_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h"
#define KPStream XPUStream
#define KPDevice pten::XPUContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives.h"
#include "paddle/pten/kernels/primitive/datamover_primitives.h"
#include "paddle/pten/kernels/primitive/functor_primitives.h"
#define KPStream gpuStream_t
#define KPDevice pten::GPUContext
#define _ptr_
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录