Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
452bcbe2
P
Paddle
项目概览
PaddlePaddle
/
Paddle
大约 1 年 前同步成功
通知
2298
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
452bcbe2
编写于
1月 26, 2022
作者:
Y
YuanRisheng
提交者:
GitHub
1月 26, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[Pten]Move kernel_primitives lib to Pten directory (#39169)
* move kernel_primitives * use pten's errors
上级
bd5c962d
变更
11
隐藏空白更改
内联
并排
Showing
11 changed file
with
578 addition
and
403 deletion
+578
-403
paddle/fluid/operators/kernel_primitives/functor_primitives.h
...le/fluid/operators/kernel_primitives/functor_primitives.h
+2
-233
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
+2
-53
paddle/pten/kernels/funcs/elementwise_base.h
paddle/pten/kernels/funcs/elementwise_base.h
+2
-2
paddle/pten/kernels/gpu/reduce.h
paddle/pten/kernels/gpu/reduce.h
+2
-2
paddle/pten/kernels/primitive/compute_primitives.h
paddle/pten/kernels/primitive/compute_primitives.h
+51
-24
paddle/pten/kernels/primitive/compute_primitives_xpu2.h
paddle/pten/kernels/primitive/compute_primitives_xpu2.h
+51
-23
paddle/pten/kernels/primitive/datamover_primitives.h
paddle/pten/kernels/primitive/datamover_primitives.h
+76
-33
paddle/pten/kernels/primitive/datamover_primitives_xpu2.h
paddle/pten/kernels/primitive/datamover_primitives_xpu2.h
+57
-24
paddle/pten/kernels/primitive/functor_primitives.h
paddle/pten/kernels/primitive/functor_primitives.h
+255
-0
paddle/pten/kernels/primitive/helper_primitives.h
paddle/pten/kernels/primitive/helper_primitives.h
+11
-9
paddle/pten/kernels/primitive/kernel_primitives.h
paddle/pten/kernels/primitive/kernel_primitives.h
+69
-0
未找到文件。
paddle/fluid/operators/kernel_primitives/functor_primitives.h
浏览文件 @
452bcbe2
...
@@ -13,241 +13,10 @@
...
@@ -13,241 +13,10 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include "paddle/pten/kernels/primitive/functor_primitives.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
kernel_primitives
{
namespace
kernel_primitives
=
pten
::
kps
;
namespace
details
{
static
__device__
__forceinline__
platform
::
float16
Exp
(
platform
::
float16
x
)
{
return
::
Eigen
::
numext
::
exp
(
x
);
}
static
__device__
__forceinline__
float
Exp
(
float
x
)
{
return
expf
(
x
);
}
static
__device__
__forceinline__
double
Exp
(
double
x
)
{
return
exp
(
x
);
}
static
__device__
__forceinline__
platform
::
float16
Log
(
platform
::
float16
x
)
{
return
::
Eigen
::
numext
::
log
(
x
);
}
static
__device__
__forceinline__
float
Log
(
float
x
)
{
return
logf
(
x
);
}
static
__device__
__forceinline__
double
Log
(
double
x
)
{
return
log
(
x
);
}
}
// namespace details
/******************************** Unary Functor *******************************/
/**
* @brief Default unary exp functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
ExpFunctor
{
HOSTDEVICE
inline
ExpFunctor
()
{}
HOSTDEVICE
explicit
inline
ExpFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
details
::
Exp
(
x
));
}
};
/**
* @brief Default unary identity functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
IdentityFunctor
{
HOSTDEVICE
inline
IdentityFunctor
()
{}
HOSTDEVICE
explicit
inline
IdentityFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
x
);
}
};
/**
* @brief Default unary div functor. Divide by a constant
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
DivideFunctor
{
private:
using
MPType
=
typename
::
paddle
::
operators
::
details
::
MPTypeTrait
<
Tx
>::
Type
;
public:
HOSTDEVICE
inline
DivideFunctor
()
{
n_inv
=
static_cast
<
MPType
>
(
1.0
f
);
}
HOSTDEVICE
explicit
inline
DivideFunctor
(
int
n
)
:
n_inv
((
MPType
)(
1.0
/
n
))
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
static_cast
<
MPType
>
(
x
)
*
n_inv
);
}
private:
MPType
n_inv
;
};
/**
* @brief Default inverse functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
InverseFunctor
{
HOSTDEVICE
inline
InverseFunctor
()
{}
HOSTDEVICE
explicit
inline
InverseFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
-
x
);
}
};
/**
* @brief Default unary square functor
*/
template
<
typename
Tx
,
typename
Ty
=
Tx
>
struct
SquareFunctor
{
HOSTDEVICE
inline
SquareFunctor
()
{}
HOSTDEVICE
explicit
inline
SquareFunctor
(
int
n
)
{}
HOSTDEVICE
inline
Ty
operator
()(
const
Tx
x
)
const
{
return
static_cast
<
Ty
>
(
x
)
*
static_cast
<
Ty
>
(
x
);
}
};
/****************************** Binary Functor ********************************/
/**
* @brief Default binary min functor
*/
template
<
typename
T
>
struct
MinFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
std
::
numeric_limits
<
T
>::
max
());
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
(
b
<
a
)
?
b
:
a
;
}
};
/**
* @brief Default binary max functor
*/
template
<
typename
T
>
struct
MaxFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
std
::
numeric_limits
<
T
>::
lowest
());
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
(
b
>
a
)
?
b
:
a
;
}
};
/**
* @brief Default binary add functor
*/
template
<
typename
T
>
struct
AddFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
0.0
f
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
+
a
;
}
};
/**
* @brief Default binary add functor
*/
template
<
typename
T
>
struct
MulFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
*
a
;
}
};
/**
* @brief Default binary logic or functor
*/
template
<
typename
T
>
struct
LogicalOrFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
false
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
||
a
;
}
};
/**
* @brief Default binary logic and functor
*/
template
<
typename
T
>
struct
LogicalAndFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
true
);
}
__device__
__forceinline__
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
b
&&
a
;
}
};
/**
* @brief Default binary sub functor
*/
template
<
typename
T
>
struct
SubFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
0.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
a
-
b
;
}
};
/**
* @brief Default binary div functor
*/
template
<
typename
T
,
typename
Enable
=
void
>
struct
DivFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
return
a
/
b
;
}
};
template
<
typename
T
>
struct
DivFunctor
<
T
,
typename
std
::
enable_if
<
std
::
is_integral
<
T
>::
value
>::
type
>
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
// For int32/int64, need to check whether the divison is zero.
PADDLE_ENFORCE_NE
(
b
,
0
,
platform
::
errors
::
InvalidArgument
(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."
));
return
a
/
b
;
}
};
/**
* @brief Default binary floor divide functor
*/
template
<
typename
T
>
struct
FloorDivFunctor
{
inline
T
initial
()
{
return
static_cast
<
T
>
(
1.0
f
);
}
inline
HOSTDEVICE
T
operator
()(
const
T
a
,
const
T
b
)
const
{
PADDLE_ENFORCE_NE
(
b
,
0
,
platform
::
errors
::
InvalidArgument
(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."
));
return
static_cast
<
T
>
(
std
::
trunc
(
a
/
b
));
}
};
}
// namespace kernel_primitives
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
paddle/fluid/operators/kernel_primitives/kernel_primitives.h
浏览文件 @
452bcbe2
...
@@ -13,61 +13,10 @@
...
@@ -13,61 +13,10 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h"
#define KPStream XPUStream
#define KPDevice paddle::platform::XPUDeviceContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
#define KPStream gpuStream_t
#define KPDevice paddle::platform::CUDADeviceContext
#define _ptr_
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
kernel_primitives
{}
namespace
kernel_primitives
=
pten
::
kps
;
}
}
}
}
paddle/pten/kernels/funcs/elementwise_base.h
浏览文件 @
452bcbe2
...
@@ -22,12 +22,12 @@ limitations under the License. */
...
@@ -22,12 +22,12 @@ limitations under the License. */
#include "paddle/pten/kernels/empty_kernel.h"
#include "paddle/pten/kernels/empty_kernel.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace
kps
=
p
addle
::
operators
::
kernel_primitive
s
;
namespace
kps
=
p
ten
::
kp
s
;
#endif
#endif
...
...
paddle/pten/kernels/gpu/reduce.h
浏览文件 @
452bcbe2
...
@@ -34,13 +34,13 @@ namespace cub = hipcub;
...
@@ -34,13 +34,13 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
...
@@ -51,7 +51,7 @@ namespace cub = hipcub;
...
@@ -51,7 +51,7 @@ namespace cub = hipcub;
#define REDUCE_SPLIT_BOUNDARY 512
#define REDUCE_SPLIT_BOUNDARY 512
#define REDUCE_VEC_SIZE 4
#define REDUCE_VEC_SIZE 4
namespace
kps
=
p
addle
::
operators
::
kernel_primitive
s
;
namespace
kps
=
p
ten
::
kp
s
;
namespace
pten
{
namespace
pten
{
namespace
kernels
{
namespace
kernels
{
...
...
paddle/
fluid/operators/kernel_primitives
/compute_primitives.h
→
paddle/
pten/kernels/primitive
/compute_primitives.h
浏览文件 @
452bcbe2
...
@@ -22,11 +22,10 @@
...
@@ -22,11 +22,10 @@
#endif
#endif
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/
fluid/platform
/float16.h"
#include "paddle/
pten/common
/float16.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
#ifdef __HIPCC__
#ifdef __HIPCC__
...
@@ -48,7 +47,7 @@ class MPTypeTrait {
...
@@ -48,7 +47,7 @@ class MPTypeTrait {
};
};
template
<
>
template
<
>
class
MPTypeTrait
<
p
latform
::
float16
>
{
class
MPTypeTrait
<
p
ten
::
dtype
::
float16
>
{
public:
public:
using
Type
=
float
;
using
Type
=
float
;
};
};
...
@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
...
@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
* in: The register pointer of in, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
...
@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
...
@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in2
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
...
@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
...
@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseTernary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseTernary
(
const
InT
*
in2
,
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
const
InT
*
in3
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
...
@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
...
@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
* ins: A pointers of array consisting of multiple inputs.
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
OpFunc
compute
)
{
OpFunc
compute
)
{
InT
args
[
Arity
];
InT
args
[
Arity
];
#pragma unroll
#pragma unroll
...
@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
...
@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in2
,
OpFunc
compute
)
{
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
#pragma unroll
#pragma unroll
...
@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
...
@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction.
* reduce_last_dim: if the last dim gets involved in reduction.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
details
::
ReduceMode
Mode
>
details
::
ReduceMode
Mode
>
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
ReduceFunctor
reducer
,
ReduceFunctor
reducer
,
bool
reduce_last_dim
)
{
bool
reduce_last_dim
)
{
int
block_index
=
blockDim
.
y
;
int
block_index
=
blockDim
.
y
;
...
@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
...
@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/
fluid/operators/kernel_primitives
/compute_primitives_xpu2.h
→
paddle/
pten/kernels/primitive
/compute_primitives_xpu2.h
浏览文件 @
452bcbe2
...
@@ -13,13 +13,13 @@
...
@@ -13,13 +13,13 @@
// limitations under the License.
// limitations under the License.
#pragma once
#pragma once
#include "paddle/pten/common/float16.h"
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/math.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
// kGlobalMode: block reduce, each block gets an output;
// kGlobalMode: block reduce, each block gets an output;
...
@@ -33,7 +33,7 @@ class MPTypeTrait {
...
@@ -33,7 +33,7 @@ class MPTypeTrait {
};
};
template
<
>
template
<
>
class
MPTypeTrait
<
p
latform
::
float16
>
{
class
MPTypeTrait
<
p
ten
::
dtype
::
float16
>
{
public:
public:
using
Type
=
float
;
using
Type
=
float
;
};
};
...
@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
...
@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
* in: The register pointer of in, the size is NX * NY.
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
__device__
__forceinline__
void
ElementwiseUnary
(
OutT
*
out
,
const
InT
*
in
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
idx
++
)
{
...
@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
...
@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseBinary
(
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in2
,
OpFunc
compute
)
{
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
...
@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
...
@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY.
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseTernary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
ElementwiseTernary
(
const
InT
*
in2
,
OutT
*
out
,
const
InT
*
in1
,
const
InT
*
in2
,
const
InT
*
in3
,
OpFunc
compute
)
{
const
InT
*
in3
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
for
(
int
idx
=
0
;
idx
<
NX
*
NY
;
++
idx
)
{
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
out
[
idx
]
=
static_cast
<
OutT
>
(
compute
(
in1
[
idx
],
in2
[
idx
],
in3
[
idx
]));
...
@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
...
@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
* ins: A pointers of array consisting of multiple inputs.
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
* compute: Compute function which was declared like OpFunc<InT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Arity
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
__device__
__forceinline__
void
ElementwiseAny
(
OutT
*
out
,
InT
(
*
ins
)[
NX
*
NY
],
OpFunc
compute
)
{
OpFunc
compute
)
{
__local__
InT
args
[
Arity
];
__local__
InT
args
[
Arity
];
#pragma unroll
#pragma unroll
...
@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
...
@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY.
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
*/
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
InT
,
typename
OutT
,
int
NX
,
int
NY
,
int
BlockSize
,
class
OpFunc
>
class
OpFunc
>
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in1
,
__device__
__forceinline__
void
CycleBinary
(
OutT
*
out
,
const
InT
*
in2
,
OpFunc
compute
)
{
const
InT
*
in1
,
const
InT
*
in2
,
OpFunc
compute
)
{
#pragma unroll
#pragma unroll
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
for
(
int
idx
=
0
;
idx
<
NX
;
idx
++
)
{
#pragma unroll
#pragma unroll
...
@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
...
@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction.
* reduce_last_dim: if the last dim gets involved in reduction.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
class
ReduceFunctor
,
details
::
ReduceMode
Mode
>
details
::
ReduceMode
Mode
>
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
__device__
__forceinline__
void
Reduce
(
T
*
out
,
const
T
*
in
,
ReduceFunctor
reducer
,
ReduceFunctor
reducer
,
bool
reduce_last_dim
)
{
bool
reduce_last_dim
)
{
if
(
Mode
==
kGlobalMode
)
{
if
(
Mode
==
kGlobalMode
)
{
...
@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
...
@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/
fluid/operators/kernel_primitives
/datamover_primitives.h
→
paddle/
pten/kernels/primitive
/datamover_primitives.h
浏览文件 @
452bcbe2
...
@@ -22,9 +22,8 @@
...
@@ -22,9 +22,8 @@
#endif
#endif
#include "paddle/pten/core/ddim.h"
#include "paddle/pten/core/ddim.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
#define INT_BITS 32
#define INT_BITS 32
...
@@ -103,11 +102,12 @@ struct BroadcastConfig {
...
@@ -103,11 +102,12 @@ struct BroadcastConfig {
strides_in
.
resize
(
dim_size
,
1
);
strides_in
.
resize
(
dim_size
,
1
);
for
(
int
i
=
0
;
i
<
dim_size
;
++
i
)
{
for
(
int
i
=
0
;
i
<
dim_size
;
++
i
)
{
strides_in
[
i
]
=
in_dims
[
i
]
==
1
?
0
:
strides_in
[
i
];
strides_in
[
i
]
=
in_dims
[
i
]
==
1
?
0
:
strides_in
[
i
];
strides_in
[
i
]
=
strides_in
[
i
]
=
(
i
!=
0
&&
strides_in
[
i
]
!=
0
)
(
i
!=
0
&&
strides_in
[
i
]
!=
0
)
?
std
::
accumulate
(
in_dims
.
begin
(),
?
std
::
accumulate
(
in_dims
.
begin
(),
in_dims
.
begin
()
+
i
,
1
,
in_dims
.
begin
()
+
i
,
std
::
multiplies
<
int64_t
>
())
1
,
:
strides_in
[
i
];
std
::
multiplies
<
int64_t
>
())
:
strides_in
[
i
];
}
}
memcpy
(
strides
,
strides_in
.
data
(),
kDims
*
sizeof
(
uint32_t
));
memcpy
(
strides
,
strides_in
.
data
(),
kDims
*
sizeof
(
uint32_t
));
...
@@ -144,11 +144,18 @@ struct BroadcastConfig {
...
@@ -144,11 +144,18 @@ struct BroadcastConfig {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
__device__
__forceinline__
void
ReadData
(
Ty
*
dst
,
int
size_nx
,
int
size_ny
,
const
Tx
*
__restrict__
src
,
int
stride_nx
,
int
stride_ny
)
{
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
thread_offset
=
threadIdx
.
x
;
int
thread_offset
=
threadIdx
.
x
;
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
...
@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously.
* size: The current block needs to load size data continuously.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadData
(
T
*
dst
,
const
T
*
__restrict__
src
,
__device__
__forceinline__
void
ReadData
(
T
*
dst
,
const
T
*
__restrict__
src
,
int
num
)
{
int
num
)
{
if
(
IsBoundary
)
{
// blockDim.x * NX > num
if
(
IsBoundary
)
{
// blockDim.x * NX > num
int
thread_offset
=
threadIdx
.
x
*
NX
;
int
thread_offset
=
threadIdx
.
x
*
NX
;
...
@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
...
@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
T
*
dst
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
,
int
stride_nx
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
;
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
;
uint32_t
index_src
=
0
;
uint32_t
index_src
=
0
;
...
@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc(
...
@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc(
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
* the lowest dimension.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
Tx
,
typename
IndexCal
,
typename
Functor
,
bool
IsBoundary
=
false
>
typename
Ty
,
__device__
__forceinline__
void
ReadDataReduce
(
int
NX
,
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
int
block_offset
,
int
NY
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
BlockSize
,
int
stride_ny
,
Functor
func
,
bool
reduce_last_dim
)
{
int
Rank
,
typename
IndexCal
,
typename
Functor
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataReduce
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
int
block_offset
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
,
Functor
func
,
bool
reduce_last_dim
)
{
int
thread_offset
=
0
;
int
thread_offset
=
0
;
int
left_idx
=
0
;
int
left_idx
=
0
;
if
(
reduce_last_dim
)
{
if
(
reduce_last_dim
)
{
...
@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce(
...
@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
* size: The current block needs to load size elements continuously.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
WriteData
(
T
*
dst
,
T
*
__restrict__
src
,
__device__
__forceinline__
void
WriteData
(
T
*
dst
,
T
*
__restrict__
src
,
int
num
)
{
int
num
)
{
if
(
IsBoundary
)
{
if
(
IsBoundary
)
{
int
thread_offset
=
threadIdx
.
x
*
NX
;
int
thread_offset
=
threadIdx
.
x
*
NX
;
...
@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
...
@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
WriteData
(
Ty
*
dst
,
const
Tx
*
__restrict__
src
,
__device__
__forceinline__
void
WriteData
(
Ty
*
dst
,
int
size_nx
,
int
size_ny
,
const
Tx
*
__restrict__
src
,
int
stride_nx
,
int
stride_ny
)
{
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
thread_offset
=
threadIdx
.
x
;
int
thread_offset
=
threadIdx
.
x
;
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
...
@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
* total_num_output: Total number of original output.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__forceinline__
void
ReadDataBc
(
__device__
__forceinline__
void
ReadDataBc
(
T
*
dst
,
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
T
*
dst
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
)
{
const
T
*
__restrict__
src
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
)
{
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
*
NX
;
uint32_t
thread_offset
=
block_offset
+
threadIdx
.
x
*
NX
;
uint32_t
index_src
=
0
;
uint32_t
index_src
=
0
;
...
@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc(
...
@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc(
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/
fluid/operators/kernel_primitives
/datamover_primitives_xpu2.h
→
paddle/
pten/kernels/primitive
/datamover_primitives_xpu2.h
浏览文件 @
452bcbe2
...
@@ -17,9 +17,8 @@
...
@@ -17,9 +17,8 @@
#include "xpu/kernel/debug.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
#include "xpu/kernel/math.h"
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
namespace
details
{
namespace
details
{
template
<
typename
T
,
int
VecSize
>
template
<
typename
T
,
int
VecSize
>
...
@@ -105,10 +104,17 @@ struct BroadcastConfig {
...
@@ -105,10 +104,17 @@ struct BroadcastConfig {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadData
(
Ty
*
dst
,
const
Tx
_global_ptr_
*
src
,
__device__
__inline__
void
ReadData
(
Ty
*
dst
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
const
Tx
_global_ptr_
*
src
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
int
thread_offset
=
core_id
();
int
thread_offset
=
core_id
();
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) {
...
@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously.
* size: The current block needs to load size data continuously.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadData
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
__device__
__inline__
void
ReadData
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
num
)
{
int
num
)
{
int
thread_offset
=
core_id
()
*
NX
;
int
thread_offset
=
core_id
()
*
NX
;
__local__
T
in_temp
[
1
];
__local__
T
in_temp
[
1
];
...
@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
...
@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
,
int
stride_nx
,
int
total_num_output
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
uint32_t
thread_offset
=
block_offset
+
core_id
();
uint32_t
thread_offset
=
block_offset
+
core_id
();
uint32_t
index_src
=
0
;
uint32_t
index_src
=
0
;
...
@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
...
@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
* the lowest dimension.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
typename
IndexCal
,
bool
IsBoundary
=
false
>
int
NX
,
__device__
__inline__
void
ReadDataReduce
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
NY
,
int
BlockSize
,
int
Rank
,
typename
IndexCal
,
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataReduce
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
int
block_offset
,
int
block_offset
,
const
IndexCal
&
index_cal
,
const
IndexCal
&
index_cal
,
int
size_nx
,
int
size_ny
,
int
size_nx
,
int
stride_nx
,
int
stride_ny
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
,
bool
reduce_last_dim
)
{
bool
reduce_last_dim
)
{
__local__
Tx
in_temp
[
1
];
__local__
Tx
in_temp
[
1
];
int
thread_offset
=
0
;
int
thread_offset
=
0
;
...
@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
...
@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
*/
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
template
<
typename
Tx
,
typename
Ty
,
int
NX
,
int
NY
,
int
BlockSize
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
WriteData
(
Ty
_global_ptr_
*
dst
,
const
Tx
*
src
,
__device__
__inline__
void
WriteData
(
Ty
_global_ptr_
*
dst
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
const
Tx
*
src
,
int
size_nx
,
int
size_ny
,
int
stride_nx
,
int
stride_ny
)
{
int
stride_ny
)
{
int
thread_offset
=
core_id
();
int
thread_offset
=
core_id
();
int
left_size_nx
=
size_nx
-
thread_offset
;
int
left_size_nx
=
size_nx
-
thread_offset
;
...
@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
...
@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
}
}
}
}
in_temp
[
0
]
=
static_cast
<
Ty
>
(
src
[
idx
+
idy
*
NX
]);
in_temp
[
0
]
=
static_cast
<
Ty
>
(
src
[
idx
+
idy
*
NX
]);
LM2GM
(
in_temp
,
dst
+
thread_offset
+
idx
*
stride_nx
+
idy
*
stride_ny
,
LM2GM
(
in_temp
,
dst
+
thread_offset
+
idx
*
stride_nx
+
idy
*
stride_ny
,
sizeof
(
Ty
));
sizeof
(
Ty
));
}
}
}
}
...
@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
...
@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
* total_num_output: Total number of original output.
*/
*/
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
template
<
typename
T
,
int
NX
,
int
NY
,
int
BlockSize
,
int
Rank
,
bool
IsBoundary
=
false
>
bool
IsBoundary
=
false
>
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
__device__
__inline__
void
ReadDataBc
(
T
*
dst
,
const
T
_global_ptr_
*
src
,
uint32_t
block_offset
,
uint32_t
block_offset
,
details
::
BroadcastConfig
<
Rank
>
config
,
details
::
BroadcastConfig
<
Rank
>
config
,
int
total_num_output
)
{
int
total_num_output
)
{
...
@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
...
@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
}
}
}
}
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/pten/kernels/primitive/functor_primitives.h
0 → 100644
浏览文件 @
452bcbe2
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace
pten
{
namespace
kps
{
namespace details {

// Type-dispatched exp(): the float16 overload routes through Eigen's numext
// so the half-precision implementation is used, while float/double map to
// the matching C math-library calls (expf / exp).
static __device__ __forceinline__ pten::dtype::float16 Exp(
    pten::dtype::float16 x) {
  return ::Eigen::numext::exp(x);
}

static __device__ __forceinline__ float Exp(float x) { return expf(x); }

static __device__ __forceinline__ double Exp(double x) { return exp(x); }

// Type-dispatched log(), same overload scheme as Exp() above.
static __device__ __forceinline__ pten::dtype::float16 Log(
    pten::dtype::float16 x) {
  return ::Eigen::numext::log(x);
}

static __device__ __forceinline__ float Log(float x) { return logf(x); }

static __device__ __forceinline__ double Log(double x) { return log(x); }

}  // namespace details
/******************************** Unary Functor *******************************/
/**
 * @brief Default unary exp functor: returns exp(x) cast to Ty.
 *
 * The int constructor ignores its argument; it exists only so this functor
 * is constructible the same way as functors (e.g. DivideFunctor) whose
 * construction does use an element count.
 */
template <typename Tx, typename Ty = Tx>
struct ExpFunctor {
  HOSTDEVICE inline ExpFunctor() {}

  HOSTDEVICE explicit inline ExpFunctor(int n) {}

  HOSTDEVICE inline Ty operator()(const Tx x) const {
    // details::Exp dispatches to the overload matching Tx
    // (float16 / float / double).
    return static_cast<Ty>(details::Exp(x));
  }
};
/**
 * @brief Default unary identity functor: returns its argument converted
 * to Ty, without any other transformation.
 *
 * The int constructor ignores its argument; it exists only so this functor
 * is constructible the same way as functors that take an element count.
 */
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
  HOSTDEVICE inline IdentityFunctor() {}

  HOSTDEVICE explicit inline IdentityFunctor(int n) {}

  HOSTDEVICE inline Ty operator()(const Tx value) const {
    const Ty converted = static_cast<Ty>(value);
    return converted;
  }
};
/**
 * @brief Default unary div functor: divides every input by the constant n
 * supplied at construction, by multiplying with a precomputed reciprocal.
 *
 * The multiplication is carried out in MPType (the accumulation type that
 * MPTypeTrait selects for Tx) and the result is cast back to Ty.
 */
template <typename Tx, typename Ty = Tx>
struct DivideFunctor {
 private:
  using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;

 public:
  // Default construction divides by 1, i.e. n_inv == 1.
  HOSTDEVICE inline DivideFunctor() : n_inv(static_cast<MPType>(1.0f)) {}

  // Store 1/n once so operator() costs a single multiply per element.
  // (static_cast replaces the original C-style cast.)
  HOSTDEVICE explicit inline DivideFunctor(int n)
      : n_inv(static_cast<MPType>(1.0 / n)) {}

  HOSTDEVICE inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
  }

 private:
  MPType n_inv;  // precomputed reciprocal of the divisor
};
/**
 * @brief Default inverse functor.
 *
 * NOTE(review): despite the name, this computes the arithmetic negation
 * (-x) cast to Ty — it is additive inverse, not a reciprocal. Callers rely
 * on this behavior, so it is documented rather than changed.
 */
template <typename Tx, typename Ty = Tx>
struct InverseFunctor {
  HOSTDEVICE inline InverseFunctor() {}

  // The int argument is ignored; kept for constructor-shape compatibility
  // with the other unary functors in this file.
  HOSTDEVICE explicit inline InverseFunctor(int n) {}

  HOSTDEVICE inline Ty operator()(const Tx x) const {
    return static_cast<Ty>(-x);
  }
};
/**
 * @brief Default unary square functor: converts the argument to Ty and
 * returns its square (computed in Ty).
 */
template <typename Tx, typename Ty = Tx>
struct SquareFunctor {
  HOSTDEVICE inline SquareFunctor() {}

  // The int argument is ignored; kept for constructor-shape compatibility
  // with the other unary functors in this file.
  HOSTDEVICE explicit inline SquareFunctor(int n) {}

  HOSTDEVICE inline Ty operator()(const Tx x) const {
    // Convert once, then multiply — same value as
    // static_cast<Ty>(x) * static_cast<Ty>(x).
    const Ty v = static_cast<Ty>(x);
    return v * v;
  }
};
/****************************** Binary Functor ********************************/
/**
 * @brief Default binary min functor.
 */
template <typename T>
struct MinFunctor {
  // Identity element for a min-reduction: the largest representable T, so
  // every real element compares smaller than it.
  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }

  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return (b < a) ? b : a;
  }
};
/**
 * @brief Default binary max functor.
 */
template <typename T>
struct MaxFunctor {
  // Identity element for a max-reduction: the smallest representable T, so
  // every real element compares larger than it.
  inline T initial() {
    return static_cast<T>(std::numeric_limits<T>::lowest());
  }

  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return (b > a) ? b : a;
  }
};
/**
 * @brief Default binary add functor.
 */
template <typename T>
struct AddFunctor {
  // Identity element for a sum-reduction: zero.
  inline T initial() { return static_cast<T>(0.0f); }

  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b + a;
  }
};
/**
 * @brief Default binary multiply functor.
 */
template <typename T>
struct MulFunctor {
  // Identity element for a product-reduction: one.
  inline T initial() { return static_cast<T>(1.0f); }

  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b * a;
  }
};
/**
 * @brief Default binary logical-or functor.
 */
template <typename T>
struct LogicalOrFunctor {
  // Identity element for an any-reduction: false.
  inline T initial() { return static_cast<T>(false); }

  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b || a;
  }
};
/**
 * @brief Default binary logical-and functor.
 */
template <typename T>
struct LogicalAndFunctor {
  // Identity element for an all-reduction: true.
  inline T initial() { return static_cast<T>(true); }

  __device__ __forceinline__ T operator()(const T a, const T b) const {
    return b && a;
  }
};
/**
 * @brief Default binary subtract functor: returns a - b (argument order
 * matters, unlike the commutative functors above).
 */
template <typename T>
struct SubFunctor {
  inline T initial() { return static_cast<T>(0.0f); }

  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
};
/**
 * @brief Default binary div functor: returns a / b.
 *
 * This generic version performs no divide-by-zero check; the integral
 * specialization of DivFunctor adds one via PADDLE_ENFORCE_NE.
 */
template <typename T, typename Enable = void>
struct DivFunctor {
  inline T initial() { return static_cast<T>(1.0f); }

  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
};
// Specialization for integral T: unlike floating point (which yields
// inf/nan), integer division by zero is undefined behavior, so the divisor
// is validated before dividing.
template <typename T>
struct DivFunctor<T,
                  typename std::enable_if<std::is_integral<T>::value>::type> {
  inline T initial() { return static_cast<T>(1.0f); }

  inline HOSTDEVICE T operator()(const T a, const T b) const {
    // For int32/int64, need to check whether the divison is zero.
    PADDLE_ENFORCE_NE(b,
                      0,
                      pten::errors::InvalidArgument(
                          "Integer division by zero encountered "
                          "in (floor) divide. Please check the input value."));
    return a / b;
  }
};
/**
 * @brief Default binary floor-divide functor: truncates the quotient a / b
 * toward zero via std::trunc, after rejecting a zero divisor.
 */
template <typename T>
struct FloorDivFunctor {
  inline T initial() { return static_cast<T>(1.0f); }

  inline HOSTDEVICE T operator()(const T a, const T b) const {
    PADDLE_ENFORCE_NE(b,
                      0,
                      pten::errors::InvalidArgument(
                          "Integer division by zero encountered "
                          "in (floor) divide. Please check the input value."));
    return static_cast<T>(std::trunc(a / b));
  }
};
}
// namespace kps
}
// namespace pten
paddle/
fluid/operators/kernel_primitives
/helper_primitives.h
→
paddle/
pten/kernels/primitive
/helper_primitives.h
浏览文件 @
452bcbe2
// Copyright (c) 202
1
PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 202
2
PaddlePaddle Authors. All Rights Reserved.
//
//
// Licensed under the Apache License, Version 2.0 (the "License");
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// you may not use this file except in compliance with the License.
...
@@ -14,9 +14,8 @@
...
@@ -14,9 +14,8 @@
#pragma once
#pragma once
namespace
paddle
{
namespace
pten
{
namespace
operators
{
namespace
kps
{
namespace
kernel_primitives
{
#ifdef PADDLE_WITH_XPU2
#ifdef PADDLE_WITH_XPU2
struct
dim3
{
struct
dim3
{
...
@@ -43,8 +42,12 @@ struct DimConfig {
...
@@ -43,8 +42,12 @@ struct DimConfig {
int
rem_y
;
int
rem_y
;
int
rem_z
;
int
rem_z
;
HOSTDEVICE
explicit
inline
DimConfig
(
int
split_x
,
int
split_y
,
int
split_z
,
HOSTDEVICE
explicit
inline
DimConfig
(
int
split_x
,
int
size_x
,
int
size_y
,
int
size_z
)
{
int
split_y
,
int
split_z
,
int
size_x
,
int
size_y
,
int
size_z
)
{
split_num_x
=
split_x
;
split_num_x
=
split_x
;
split_num_y
=
split_y
;
split_num_y
=
split_y
;
split_num_z
=
split_z
;
split_num_z
=
split_z
;
...
@@ -60,6 +63,5 @@ struct DimConfig {
...
@@ -60,6 +63,5 @@ struct DimConfig {
}
}
};
};
}
// namespace kernel_primitives
}
// namespace kps
}
// namespace operators
}
// namespace pten
}
// namespace paddle
paddle/pten/kernels/primitive/kernel_primitives.h
0 → 100644
浏览文件 @
452bcbe2
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once

#include "paddle/pten/kernels/primitive/helper_primitives.h"

#ifdef PADDLE_WITH_XPU2
// XPU2 build: pull in the XPU implementations of the kernel primitives and
// map the generic KPS names onto the XPU runtime.
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h"

#define KPStream XPUStream
#define KPDevice pten::XPUContext
#define _ptr_ _global_ptr_
// CUDA-specific qualifiers are remapped for XPU: __forceinline__ degrades
// to __inline__ and __restrict__ expands to nothing.
#define __forceinline__ __inline__
#define __restrict__

// Thread/block/grid coordinates: the y/z components are hard-coded to 0
// here; only the x dimension maps onto the XPU core/cluster hierarchy.
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
// GPU (CUDA/HIP) build: map the generic KPS names directly onto the CUDA
// built-in thread/block/grid variables.
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives.h"
#include "paddle/pten/kernels/primitive/datamover_primitives.h"
#include "paddle/pten/kernels/primitive/functor_primitives.h"

#define KPStream gpuStream_t
#define KPDevice pten::GPUContext
// On GPU the "pointer space" qualifier is a no-op.
#define _ptr_

#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录