Unverified commit 452bcbe2, authored by YuanRisheng and committed by GitHub

[Pten]Move kernel_primitives lib to Pten directory (#39169)

* move kernel_primitives

* use pten's errors
Parent bd5c962d
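For readers unfamiliar with the migration pattern in this commit, below is a minimal, hypothetical sketch (not code from this PR) of why a namespace alias keeps legacy fluid call sites compiling after the implementation moves into pten::kps; the stand-in AddFunctor only mimics the real one:

// Hypothetical stand-in for the moved library: functors now live in pten::kps
// (in the real tree they come from
// paddle/pten/kernels/primitive/kernel_primitives.h).
namespace pten {
namespace kps {
template <typename T>
struct AddFunctor {
  inline T initial() { return static_cast<T>(0.0f); }
  inline T operator()(const T a, const T b) const { return b + a; }
};
}  // namespace kps
}  // namespace pten

// The fluid-side header keeps only an alias, so existing operator code that
// spells paddle::operators::kernel_primitives is unaffected.
namespace paddle {
namespace operators {
namespace kernel_primitives = pten::kps;
}  // namespace operators
}  // namespace paddle

int main() {
  // Legacy-style usage still resolves to the pten::kps implementation.
  paddle::operators::kernel_primitives::AddFunctor<int> add;
  return add(add.initial(), 3) == 3 ? 0 : 1;
}

The actual aliases introduced by the commit appear in the fluid headers changed below.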
......@@ -13,241 +13,10 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
#include "paddle/pten/kernels/primitive/functor_primitives.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace details {
static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) {
return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float Exp(float x) { return expf(x); }
static __device__ __forceinline__ double Exp(double x) { return exp(x); }
static __device__ __forceinline__ platform::float16 Log(platform::float16 x) {
return ::Eigen::numext::log(x);
}
static __device__ __forceinline__ float Log(float x) { return logf(x); }
static __device__ __forceinline__ double Log(double x) { return log(x); }
} // namespace details
/******************************** Unary Functor *******************************/
/**
* @brief Default unary exp functor
*/
template <typename Tx, typename Ty = Tx>
struct ExpFunctor {
HOSTDEVICE inline ExpFunctor() {}
HOSTDEVICE explicit inline ExpFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(details::Exp(x));
}
};
/**
* @brief Default unary identity functor
*/
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
HOSTDEVICE inline IdentityFunctor() {}
HOSTDEVICE explicit inline IdentityFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x);
}
};
/**
* @brief Default unary div functor. Divide by a constant
*/
template <typename Tx, typename Ty = Tx>
struct DivideFunctor {
private:
using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
public:
HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
}
private:
MPType n_inv;
};
/**
* @brief Default inverse functor
*/
template <typename Tx, typename Ty = Tx>
struct InverseFunctor {
HOSTDEVICE inline InverseFunctor() {}
HOSTDEVICE explicit inline InverseFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(-x);
}
};
/**
* @brief Default unary square functor
*/
template <typename Tx, typename Ty = Tx>
struct SquareFunctor {
HOSTDEVICE inline SquareFunctor() {}
HOSTDEVICE explicit inline SquareFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x) * static_cast<Ty>(x);
}
};
/****************************** Binary Functor ********************************/
/**
* @brief Default binary min functor
*/
template <typename T>
struct MinFunctor {
inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b < a) ? b : a;
}
};
/**
* @brief Default binary max functor
*/
template <typename T>
struct MaxFunctor {
inline T initial() {
return static_cast<T>(std::numeric_limits<T>::lowest());
}
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b > a) ? b : a;
}
};
/**
* @brief Default binary add functor
*/
template <typename T>
struct AddFunctor {
inline T initial() { return static_cast<T>(0.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b + a;
}
};
/**
* @brief Default binary mul functor
*/
template <typename T>
struct MulFunctor {
inline T initial() { return static_cast<T>(1.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b * a;
}
};
/**
* @brief Default binary logic or functor
*/
template <typename T>
struct LogicalOrFunctor {
inline T initial() { return static_cast<T>(false); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b || a;
}
};
/**
* @brief Default binary logic and functor
*/
template <typename T>
struct LogicalAndFunctor {
inline T initial() { return static_cast<T>(true); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b && a;
}
};
/**
* @brief Default binary sub functor
*/
template <typename T>
struct SubFunctor {
inline T initial() { return static_cast<T>(0.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
};
/**
* @brief Default binary div functor
*/
template <typename T, typename Enable = void>
struct DivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
};
template <typename T>
struct DivFunctor<T,
typename std::enable_if<std::is_integral<T>::value>::type> {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
// For int32/int64, we need to check whether the divisor is zero.
PADDLE_ENFORCE_NE(b, 0,
platform::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return a / b;
}
};
/**
* @brief Default binary floor divide functor
*/
template <typename T>
struct FloorDivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE_NE(b, 0,
platform::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return static_cast<T>(std::trunc(a / b));
}
};
} // namespace kernel_primitives
namespace kernel_primitives = pten::kps;
} // namespace operators
} // namespace paddle
......@@ -13,61 +13,10 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h"
#define KPStream XPUStream
#define KPDevice paddle::platform::XPUDeviceContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
#define KPStream gpuStream_t
#define KPDevice paddle::platform::CUDADeviceContext
#define _ptr_
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {}
namespace kernel_primitives = pten::kps;
}
}
......@@ -22,12 +22,12 @@ limitations under the License. */
#include "paddle/pten/kernels/empty_kernel.h"
#if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace kps = paddle::operators::kernel_primitives;
namespace kps = pten::kps;
#endif
......
......@@ -34,13 +34,13 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/core/array.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
......@@ -51,7 +51,7 @@ namespace cub = hipcub;
#define REDUCE_SPLIT_BOUNDARY 512
#define REDUCE_VEC_SIZE 4
namespace kps = paddle::operators::kernel_primitives;
namespace kps = pten::kps;
namespace pten {
namespace kernels {
......
......@@ -22,11 +22,10 @@
#endif
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/common/float16.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace pten {
namespace kps {
namespace details {
#ifdef __HIPCC__
......@@ -48,7 +47,7 @@ class MPTypeTrait {
};
template <>
class MPTypeTrait<platform::float16> {
class MPTypeTrait<pten::dtype::float16> {
public:
using Type = float;
};
......@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
__device__ __forceinline__ void ElementwiseUnary(OutT* out,
const InT* in,
OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX * NY; idx++) {
......@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
__device__ __forceinline__ void ElementwiseBinary(OutT* out,
const InT* in1,
const InT* in2,
OpFunc compute) {
#pragma unroll
......@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
const InT* in2,
const InT* in3,
OpFunc compute) {
__device__ __forceinline__ void ElementwiseTernary(
OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX * NY; ++idx) {
out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
......@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
* ins: A pointer to an array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
int Arity,
class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
__device__ __forceinline__ void ElementwiseAny(OutT* out,
InT (*ins)[NX * NY],
OpFunc compute) {
InT args[Arity];
#pragma unroll
......@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
const InT* in2, OpFunc compute) {
__device__ __forceinline__ void CycleBinary(OutT* out,
const InT* in1,
const InT* in2,
OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX; idx++) {
#pragma unroll
......@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction.
*/
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
template <typename T,
int NX,
int NY,
int BlockSize,
class ReduceFunctor,
details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in,
__device__ __forceinline__ void Reduce(T* out,
const T* in,
ReduceFunctor reducer,
bool reduce_last_dim) {
int block_index = blockDim.y;
......@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
}
}
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
} // namespace kps
} // namespace pten
......@@ -13,13 +13,13 @@
// limitations under the License.
#pragma once
#include "paddle/pten/common/float16.h"
#include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace pten {
namespace kps {
namespace details {
// kGlobalMode: block reduce, each block gets an output;
......@@ -33,7 +33,7 @@ class MPTypeTrait {
};
template <>
class MPTypeTrait<platform::float16> {
class MPTypeTrait<pten::dtype::float16> {
public:
using Type = float;
};
......@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
__device__ __forceinline__ void ElementwiseUnary(OutT* out,
const InT* in,
OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX * NY; idx++) {
......@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
__device__ __forceinline__ void ElementwiseBinary(OutT* out,
const InT* in1,
const InT* in2,
OpFunc compute) {
#pragma unroll
......@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
const InT* in2,
const InT* in3,
OpFunc compute) {
__device__ __forceinline__ void ElementwiseTernary(
OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX * NY; ++idx) {
out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
......@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
* ins: A pointer to an array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
int Arity,
class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
__device__ __forceinline__ void ElementwiseAny(OutT* out,
InT (*ins)[NX * NY],
OpFunc compute) {
__local__ InT args[Arity];
#pragma unroll
......@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT, typename OutT, int NX, int NY, int BlockSize,
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
const InT* in2, OpFunc compute) {
__device__ __forceinline__ void CycleBinary(OutT* out,
const InT* in1,
const InT* in2,
OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX; idx++) {
#pragma unroll
......@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction.
*/
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor,
template <typename T,
int NX,
int NY,
int BlockSize,
class ReduceFunctor,
details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in,
__device__ __forceinline__ void Reduce(T* out,
const T* in,
ReduceFunctor reducer,
bool reduce_last_dim) {
if (Mode == kGlobalMode) {
......@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
}
}
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
} // namespace kps
} // namespace pten
......@@ -22,9 +22,8 @@
#endif
#include "paddle/pten/core/ddim.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace pten {
namespace kps {
namespace details {
#define INT_BITS 32
......@@ -103,11 +102,12 @@ struct BroadcastConfig {
strides_in.resize(dim_size, 1);
for (int i = 0; i < dim_size; ++i) {
strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i];
strides_in[i] =
(i != 0 && strides_in[i] != 0)
? std::accumulate(in_dims.begin(), in_dims.begin() + i, 1,
std::multiplies<int64_t>())
: strides_in[i];
strides_in[i] = (i != 0 && strides_in[i] != 0)
? std::accumulate(in_dims.begin(),
in_dims.begin() + i,
1,
std::multiplies<int64_t>())
: strides_in[i];
}
memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t));
......@@ -144,11 +144,18 @@ struct BroadcastConfig {
* stride_nx: Each read of one element strides stride_nx elements in the last dim.
* stride_ny: Each read of one element strides stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
__device__ __forceinline__ void ReadData(Ty* dst,
const Tx* __restrict__ src,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny) {
int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset;
......@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
__device__ __forceinline__ void ReadData(T* dst,
const T* __restrict__ src,
int num) {
if (IsBoundary) { // blockDim.x * NX > num
int thread_offset = threadIdx.x * NX;
......@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
* stride_nx: Each read of one element strides stride_nx elements in the last dim.
* stride_ny: Each read of one element strides stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx,
T* dst,
const T* __restrict__ src,
uint32_t block_offset,
details::BroadcastConfig<Rank> config,
int total_num_output,
int stride_nx,
int stride_ny) {
uint32_t thread_offset = block_offset + threadIdx.x;
uint32_t index_src = 0;
......@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc(
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, int Rank,
typename IndexCal, typename Functor, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataReduce(
Ty* dst, const Tx* __restrict__ src, int block_offset,
const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx,
int stride_ny, Functor func, bool reduce_last_dim) {
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
int Rank,
typename IndexCal,
typename Functor,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataReduce(Ty* dst,
const Tx* __restrict__ src,
int block_offset,
const IndexCal& index_cal,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny,
Functor func,
bool reduce_last_dim) {
int thread_offset = 0;
int left_idx = 0;
if (reduce_last_dim) {
......@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
__device__ __forceinline__ void WriteData(T* dst,
T* __restrict__ src,
int num) {
if (IsBoundary) {
int thread_offset = threadIdx.x * NX;
......@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
* stride_nx: Each read of one element strides stride_nx elements in the last dim.
* stride_ny: Each read of one element strides stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src,
int size_nx, int size_ny,
int stride_nx, int stride_ny) {
__device__ __forceinline__ void WriteData(Ty* dst,
const Tx* __restrict__ src,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny) {
int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset;
......@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset,
details::BroadcastConfig<Rank> config, int total_num_output) {
T* dst,
const T* __restrict__ src,
uint32_t block_offset,
details::BroadcastConfig<Rank> config,
int total_num_output) {
uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t index_src = 0;
......@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc(
}
}
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
} // namespace kps
} // namespace pten
......@@ -17,9 +17,8 @@
#include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h"
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace pten {
namespace kps {
namespace details {
template <typename T, int VecSize>
......@@ -105,10 +104,17 @@ struct BroadcastConfig {
* stride_nx: Each read of one element strides stride_nx elements in the last dim.
* stride_ny: Each read of one element strides stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src,
int size_nx, int size_ny, int stride_nx,
__device__ __inline__ void ReadData(Ty* dst,
const Tx _global_ptr_* src,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny) {
int thread_offset = core_id();
int left_size_nx = size_nx - thread_offset;
......@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
__device__ __inline__ void ReadData(T* dst,
const T _global_ptr_* src,
int num) {
int thread_offset = core_id() * NX;
__local__ T in_temp[1];
......@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
* stride_nx: Each read of one element strides stride_nx elements in the last dim.
* stride_ny: Each read of one element strides stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
__device__ __inline__ void ReadDataBc(T* dst,
const T _global_ptr_* src,
uint32_t block_offset,
details::BroadcastConfig<Rank> config,
int total_num_output, int stride_nx,
int total_num_output,
int stride_nx,
int stride_ny) {
uint32_t thread_offset = block_offset + core_id();
uint32_t index_src = 0;
......@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
* reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
typename IndexCal, bool IsBoundary = false>
__device__ __inline__ void ReadDataReduce(T* dst, const T _global_ptr_* src,
template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
typename IndexCal,
bool IsBoundary = false>
__device__ __inline__ void ReadDataReduce(T* dst,
const T _global_ptr_* src,
int block_offset,
const IndexCal& index_cal,
int size_nx, int size_ny,
int stride_nx, int stride_ny,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny,
bool reduce_last_dim) {
__local__ T in_temp[1];
int thread_offset = 0;
......@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read of one element strides stride_nx elements in the last dim.
* stride_ny: Each read of one element strides stride_ny elements in the first dim.
*/
template <typename Tx, typename Ty, int NX, int NY, int BlockSize,
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
int size_nx, int size_ny, int stride_nx,
__device__ __inline__ void WriteData(Ty _global_ptr_* dst,
const Tx* src,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny) {
int thread_offset = core_id();
int left_size_nx = size_nx - thread_offset;
......@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
}
}
in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny,
LM2GM(in_temp,
dst + thread_offset + idx * stride_nx + idy * stride_ny,
sizeof(Ty));
}
}
......@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, int Rank,
template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
__device__ __inline__ void ReadDataBc(T* dst,
const T _global_ptr_* src,
uint32_t block_offset,
details::BroadcastConfig<Rank> config,
int total_num_output) {
......@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
}
}
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
} // namespace kps
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace pten {
namespace kps {
namespace details {
static __device__ __forceinline__ pten::dtype::float16 Exp(
pten::dtype::float16 x) {
return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float Exp(float x) { return expf(x); }
static __device__ __forceinline__ double Exp(double x) { return exp(x); }
static __device__ __forceinline__ pten::dtype::float16 Log(
pten::dtype::float16 x) {
return ::Eigen::numext::log(x);
}
static __device__ __forceinline__ float Log(float x) { return logf(x); }
static __device__ __forceinline__ double Log(double x) { return log(x); }
} // namespace details
/******************************** Unary Functor *******************************/
/**
* @brief Default unary exp functor
*/
template <typename Tx, typename Ty = Tx>
struct ExpFunctor {
HOSTDEVICE inline ExpFunctor() {}
HOSTDEVICE explicit inline ExpFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(details::Exp(x));
}
};
/**
* @brief Default unary identity functor
*/
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
HOSTDEVICE inline IdentityFunctor() {}
HOSTDEVICE explicit inline IdentityFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x);
}
};
/**
* @brief Default unary div functor. Divide by a constant
*/
template <typename Tx, typename Ty = Tx>
struct DivideFunctor {
private:
using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
public:
HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
}
private:
MPType n_inv;
};
/**
* @brief Default inverse functor
*/
template <typename Tx, typename Ty = Tx>
struct InverseFunctor {
HOSTDEVICE inline InverseFunctor() {}
HOSTDEVICE explicit inline InverseFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(-x);
}
};
/**
* @brief Default unary square functor
*/
template <typename Tx, typename Ty = Tx>
struct SquareFunctor {
HOSTDEVICE inline SquareFunctor() {}
HOSTDEVICE explicit inline SquareFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x) * static_cast<Ty>(x);
}
};
/****************************** Binary Functor ********************************/
/**
* @brief Default binary min functor
*/
template <typename T>
struct MinFunctor {
inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b < a) ? b : a;
}
};
/**
* @brief Default binary max functor
*/
template <typename T>
struct MaxFunctor {
inline T initial() {
return static_cast<T>(std::numeric_limits<T>::lowest());
}
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b > a) ? b : a;
}
};
/**
* @brief Default binary add functor
*/
template <typename T>
struct AddFunctor {
inline T initial() { return static_cast<T>(0.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b + a;
}
};
/**
* @brief Default binary mul functor
*/
template <typename T>
struct MulFunctor {
inline T initial() { return static_cast<T>(1.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b * a;
}
};
/**
* @brief Default binary logic or functor
*/
template <typename T>
struct LogicalOrFunctor {
inline T initial() { return static_cast<T>(false); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b || a;
}
};
/**
* @brief Default binary logic and functor
*/
template <typename T>
struct LogicalAndFunctor {
inline T initial() { return static_cast<T>(true); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b && a;
}
};
/**
* @brief Default binary sub functor
*/
template <typename T>
struct SubFunctor {
inline T initial() { return static_cast<T>(0.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
};
/**
* @brief Default binary div functor
*/
template <typename T, typename Enable = void>
struct DivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
};
template <typename T>
struct DivFunctor<T,
typename std::enable_if<std::is_integral<T>::value>::type> {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
// For int32/int64, we need to check whether the divisor is zero.
PADDLE_ENFORCE_NE(b,
0,
pten::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return a / b;
}
};
/**
* @brief Default binary floor divide functor
*/
template <typename T>
struct FloorDivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE_NE(b,
0,
pten::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return static_cast<T>(std::trunc(a / b));
}
};
} // namespace kps
} // namespace pten
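As a brief, hypothetical host-side sketch (not part of this commit), the contract these reduction functors follow is: initial() supplies the identity element that seeds an accumulator, and operator() folds in one more value; the integral DivFunctor specialization additionally rejects a zero divisor through PADDLE_ENFORCE_NE. Assuming a translation unit compiled with nvcc/hipcc that can include this header:

#include "paddle/pten/kernels/primitive/functor_primitives.h"

// Hypothetical host-side illustration (not from this PR) of the functor
// contract; SubFunctor and DivFunctor are HOSTDEVICE, so they are callable
// from host code here.
inline int DemoFunctorContract() {
  pten::kps::SubFunctor<int> sub;
  int acc = sub.initial();  // identity element: 0
  acc = sub(acc, 5);        // fold one value: 0 - 5 == -5

  pten::kps::DivFunctor<int> div;
  // div(10, 0) would raise pten::errors::InvalidArgument via PADDLE_ENFORCE_NE.
  return div(10, 2) + acc;  // 5 + (-5) == 0
}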
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -14,9 +14,8 @@
#pragma once
namespace paddle {
namespace operators {
namespace kernel_primitives {
namespace pten {
namespace kps {
#ifdef PADDLE_WITH_XPU2
struct dim3 {
......@@ -43,8 +42,12 @@ struct DimConfig {
int rem_y;
int rem_z;
HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z,
int size_x, int size_y, int size_z) {
HOSTDEVICE explicit inline DimConfig(int split_x,
int split_y,
int split_z,
int size_x,
int size_y,
int size_z) {
split_num_x = split_x;
split_num_y = split_y;
split_num_z = split_z;
......@@ -60,6 +63,5 @@ struct DimConfig {
}
};
} // namespace kernel_primitives
} // namespace operators
} // namespace paddle
} // namespace kps
} // namespace pten
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/kernels/primitive/helper_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h"
#define KPStream XPUStream
#define KPDevice pten::XPUContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives.h"
#include "paddle/pten/kernels/primitive/datamover_primitives.h"
#include "paddle/pten/kernels/primitive/functor_primitives.h"
#define KPStream gpuStream_t
#define KPDevice pten::GPUContext
#define _ptr_
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
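To see what the platform-neutral macros above buy, here is a rough, hypothetical grid-stride kernel (not part of this commit), assuming a CUDA build where the macros expand to threadIdx.x / blockDim.x / blockIdx.x / gridDim.x as defined above; under PADDLE_WITH_XPU2 the same spellings would map to core_id() / core_num() / cluster_id() / cluster_num() instead:

#include "paddle/pten/kernels/primitive/kernel_primitives.h"

// Hypothetical example kernel: every index is spelled with the neutral macros,
// so the body has no direct dependence on CUDA- or XPU-specific identifiers.
template <typename T>
__global__ void ScaleKernel(_ptr_ T* out, const _ptr_ T* in, T scale, int n) {
  int tid = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
  int stride = GRID_NUM_X * BLOCK_NUM_X;
  for (int i = tid; i < n; i += stride) {
    out[i] = in[i] * scale;
  }
}

// On the GPU path it would be launched on a KPStream (gpuStream_t), e.g.
//   ScaleKernel<float><<<grid, block, 0, stream>>>(out, in, scale, n);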