Unverified commit 452bcbe2 authored by YuanRisheng, committed by GitHub

[Pten]Move kernel_primitives lib to Pten directory (#39169)

* move kernel_primitives

* use pten's errors
Parent bd5c962d
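Note on compatibility: the old fluid headers are kept as thin shims that include the new pten headers and declare `namespace kernel_primitives = pten::kps;`, so existing operator code keeps compiling unchanged. A minimal sketch of such a call site (the helper function is illustrative, not part of this commit, and the file is assumed to be compiled as CUDA):

#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"

namespace paddle {
namespace operators {

// Resolves through the alias to pten::kps::AddFunctor<T>; nothing else in the
// old call site has to change.
template <typename T>
T AddIdentity() {
  kernel_primitives::AddFunctor<T> add;
  return add.initial();  // static_cast<T>(0.0f)
}

}  // namespace operators
}  // namespace paddle

New code in this diff instead uses the alias `namespace kps = pten::kps;` directly, as the updated elementwise and reduce kernels below do.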
...@@ -13,241 +13,10 @@ ...@@ -13,241 +13,10 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/pten/kernels/primitive/functor_primitives.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace kernel_primitives { namespace kernel_primitives = pten::kps;
namespace details {
static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) {
return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float Exp(float x) { return expf(x); }
static __device__ __forceinline__ double Exp(double x) { return exp(x); }
static __device__ __forceinline__ platform::float16 Log(platform::float16 x) {
return ::Eigen::numext::log(x);
}
static __device__ __forceinline__ float Log(float x) { return logf(x); }
static __device__ __forceinline__ double Log(double x) { return log(x); }
} // namespace details
/******************************** Unary Functor *******************************/
/**
* @brief Default unary exp functor
*/
template <typename Tx, typename Ty = Tx>
struct ExpFunctor {
HOSTDEVICE inline ExpFunctor() {}
HOSTDEVICE explicit inline ExpFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(details::Exp(x));
}
};
/**
* @brief Default unary identity functor
*/
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
HOSTDEVICE inline IdentityFunctor() {}
HOSTDEVICE explicit inline IdentityFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x);
}
};
/**
* @brief Default unary div functor. Divide by a constant
*/
template <typename Tx, typename Ty = Tx>
struct DivideFunctor {
private:
using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
public:
HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
}
private:
MPType n_inv;
};
/**
* @brief Default inverse functor
*/
template <typename Tx, typename Ty = Tx>
struct InverseFunctor {
HOSTDEVICE inline InverseFunctor() {}
HOSTDEVICE explicit inline InverseFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(-x);
}
};
/**
* @brief Default unary square functor
*/
template <typename Tx, typename Ty = Tx>
struct SquareFunctor {
HOSTDEVICE inline SquareFunctor() {}
HOSTDEVICE explicit inline SquareFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x) * static_cast<Ty>(x);
}
};
/****************************** Binary Functor ********************************/
/**
* @brief Default binary min functor
*/
template <typename T>
struct MinFunctor {
inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b < a) ? b : a;
}
};
/**
* @brief Default binary max functor
*/
template <typename T>
struct MaxFunctor {
inline T initial() {
return static_cast<T>(std::numeric_limits<T>::lowest());
}
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b > a) ? b : a;
}
};
/**
* @brief Default binary add functor
*/
template <typename T>
struct AddFunctor {
inline T initial() { return static_cast<T>(0.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b + a;
}
};
/**
 * @brief Default binary mul functor
*/
template <typename T>
struct MulFunctor {
inline T initial() { return static_cast<T>(1.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b * a;
}
};
/**
* @brief Default binary logic or functor
*/
template <typename T>
struct LogicalOrFunctor {
inline T initial() { return static_cast<T>(false); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b || a;
}
};
/**
* @brief Default binary logic and functor
*/
template <typename T>
struct LogicalAndFunctor {
inline T initial() { return static_cast<T>(true); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b && a;
}
};
/**
* @brief Default binary sub functor
*/
template <typename T>
struct SubFunctor {
inline T initial() { return static_cast<T>(0.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
};
/**
* @brief Default binary div functor
*/
template <typename T, typename Enable = void>
struct DivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
};
template <typename T>
struct DivFunctor<T,
typename std::enable_if<std::is_integral<T>::value>::type> {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
// For int32/int64, need to check whether the divisor is zero.
PADDLE_ENFORCE_NE(b, 0,
platform::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return a / b;
}
};
/**
* @brief Default binary floor divide functor
*/
template <typename T>
struct FloorDivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE_NE(b, 0,
platform::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return static_cast<T>(std::trunc(a / b));
}
};
} // namespace kernel_primitives
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
...@@ -13,61 +13,10 @@ ...@@ -13,61 +13,10 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" #include "paddle/pten/kernels/primitive/kernel_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h"
#define KPStream XPUStream
#define KPDevice paddle::platform::XPUDeviceContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h"
#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h"
#define KPStream gpuStream_t
#define KPDevice paddle::platform::CUDADeviceContext
#define _ptr_
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace kernel_primitives {} namespace kernel_primitives = pten::kps;
} }
} }
...@@ -22,12 +22,12 @@ limitations under the License. */ ...@@ -22,12 +22,12 @@ limitations under the License. */
#include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/empty_kernel.h"
#if defined(__NVCC__) || defined(__HIPCC__) #if defined(__NVCC__) || defined(__HIPCC__)
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/function_traits.h" #include "paddle/fluid/platform/function_traits.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
namespace kps = paddle::operators::kernel_primitives; namespace kps = pten::kps;
#endif #endif
......
...@@ -34,13 +34,13 @@ namespace cub = hipcub; ...@@ -34,13 +34,13 @@ namespace cub = hipcub;
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/platform/fast_divmod.h"
#include "paddle/fluid/string/string_helper.h" #include "paddle/fluid/string/string_helper.h"
#include "paddle/pten/core/array.h" #include "paddle/pten/core/array.h"
#include "paddle/pten/core/enforce.h" #include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/primitive/kernel_primitives.h"
#include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/gpu/gpu_context.h" #include "paddle/pten/backends/gpu/gpu_context.h"
...@@ -51,7 +51,7 @@ namespace cub = hipcub; ...@@ -51,7 +51,7 @@ namespace cub = hipcub;
#define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_SPLIT_BOUNDARY 512
#define REDUCE_VEC_SIZE 4 #define REDUCE_VEC_SIZE 4
namespace kps = paddle::operators::kernel_primitives; namespace kps = pten::kps;
namespace pten { namespace pten {
namespace kernels { namespace kernels {
......
...@@ -22,11 +22,10 @@ ...@@ -22,11 +22,10 @@
#endif #endif
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/pten/common/float16.h"
namespace paddle { namespace pten {
namespace operators { namespace kps {
namespace kernel_primitives {
namespace details { namespace details {
#ifdef __HIPCC__ #ifdef __HIPCC__
...@@ -48,7 +47,7 @@ class MPTypeTrait { ...@@ -48,7 +47,7 @@ class MPTypeTrait {
}; };
template <> template <>
class MPTypeTrait<platform::float16> { class MPTypeTrait<pten::dtype::float16> {
public: public:
using Type = float; using Type = float;
}; };
...@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { ...@@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) {
* in: The register pointer of in, the size is NX * NY. * in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>(). * compute: Compute function which was declared like OpFunc<InT, OutT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, __device__ __forceinline__ void ElementwiseUnary(OutT* out,
const InT* in,
OpFunc compute) { OpFunc compute) {
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX * NY; idx++) { for (int idx = 0; idx < NX * NY; idx++) {
...@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, ...@@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, __device__ __forceinline__ void ElementwiseBinary(OutT* out,
const InT* in1,
const InT* in2, const InT* in2,
OpFunc compute) { OpFunc compute) {
#pragma unroll #pragma unroll
...@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, ...@@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY. * in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, __device__ __forceinline__ void ElementwiseTernary(
const InT* in2, OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
const InT* in3,
OpFunc compute) {
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX * NY; ++idx) { for (int idx = 0; idx < NX * NY; ++idx) {
out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx])); out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
...@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, ...@@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
 * ins: An array of register pointers, one per input. * ins: An array of register pointers, one per input.
* compute: Compute function which was declared like OpFunc<InT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
int Arity,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], __device__ __forceinline__ void ElementwiseAny(OutT* out,
InT (*ins)[NX * NY],
OpFunc compute) { OpFunc compute) {
InT args[Arity]; InT args[Arity];
#pragma unroll #pragma unroll
...@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], ...@@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>(). * compute: Compute function which was declared like OpFunc<InT, OutT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, __device__ __forceinline__ void CycleBinary(OutT* out,
const InT* in2, OpFunc compute) { const InT* in1,
const InT* in2,
OpFunc compute) {
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX; idx++) { for (int idx = 0; idx < NX; idx++) {
#pragma unroll #pragma unroll
...@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, ...@@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>(). * reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction. * reduce_last_dim: if the last dim gets involved in reduction.
*/ */
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor, template <typename T,
int NX,
int NY,
int BlockSize,
class ReduceFunctor,
details::ReduceMode Mode> details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in, __device__ __forceinline__ void Reduce(T* out,
const T* in,
ReduceFunctor reducer, ReduceFunctor reducer,
bool reduce_last_dim) { bool reduce_last_dim) {
int block_index = blockDim.y; int block_index = blockDim.y;
...@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, ...@@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
} }
} }
} // namespace kernel_primitives } // namespace kps
} // namespace operators } // namespace pten
} // namespace paddle
...@@ -13,13 +13,13 @@ ...@@ -13,13 +13,13 @@
// limitations under the License. // limitations under the License.
#pragma once #pragma once
#include "paddle/pten/common/float16.h"
#include "xpu/kernel/cluster_header.h" #include "xpu/kernel/cluster_header.h"
#include "xpu/kernel/debug.h" #include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h" #include "xpu/kernel/math.h"
namespace paddle { namespace pten {
namespace operators { namespace kps {
namespace kernel_primitives {
namespace details { namespace details {
// kGlobalMode: block reduce, each block gets an output; // kGlobalMode: block reduce, each block gets an output;
...@@ -33,7 +33,7 @@ class MPTypeTrait { ...@@ -33,7 +33,7 @@ class MPTypeTrait {
}; };
template <> template <>
class MPTypeTrait<platform::float16> { class MPTypeTrait<pten::dtype::float16> {
public: public:
using Type = float; using Type = float;
}; };
...@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) { ...@@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) {
* in: The register pointer of in, the size is NX * NY. * in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>(). * compute: Compute function which was declared like OpFunc<InT, OutT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, __device__ __forceinline__ void ElementwiseUnary(OutT* out,
const InT* in,
OpFunc compute) { OpFunc compute) {
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX * NY; idx++) { for (int idx = 0; idx < NX * NY; idx++) {
...@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, ...@@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in,
* in2: The register pointer of second input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, __device__ __forceinline__ void ElementwiseBinary(OutT* out,
const InT* in1,
const InT* in2, const InT* in2,
OpFunc compute) { OpFunc compute) {
#pragma unroll #pragma unroll
...@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, ...@@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1,
* in3: The register pointer of third input, size is NX * NY. * in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, __device__ __forceinline__ void ElementwiseTernary(
const InT* in2, OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
const InT* in3,
OpFunc compute) {
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX * NY; ++idx) { for (int idx = 0; idx < NX * NY; ++idx) {
out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx])); out[idx] = static_cast<OutT>(compute(in1[idx], in2[idx], in3[idx]));
...@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, ...@@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1,
 * ins: An array of register pointers, one per input. * ins: An array of register pointers, one per input.
* compute: Compute function which was declared like OpFunc<InT>(). * compute: Compute function which was declared like OpFunc<InT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, int Arity, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
int Arity,
class OpFunc> class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], __device__ __forceinline__ void ElementwiseAny(OutT* out,
InT (*ins)[NX * NY],
OpFunc compute) { OpFunc compute) {
__local__ InT args[Arity]; __local__ InT args[Arity];
#pragma unroll #pragma unroll
...@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], ...@@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY],
* in2: The register pointer of second input, size is NX * NY. * in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>(). * compute: Compute function which was declared like OpFunc<InT, OutT>().
*/ */
template <typename InT, typename OutT, int NX, int NY, int BlockSize, template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc> class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, __device__ __forceinline__ void CycleBinary(OutT* out,
const InT* in2, OpFunc compute) { const InT* in1,
const InT* in2,
OpFunc compute) {
#pragma unroll #pragma unroll
for (int idx = 0; idx < NX; idx++) { for (int idx = 0; idx < NX; idx++) {
#pragma unroll #pragma unroll
...@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, ...@@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1,
* reducer: Compute function which was declared like ReduceFunctor<InT>(). * reducer: Compute function which was declared like ReduceFunctor<InT>().
* reduce_last_dim: if the last dim gets involved in reduction. * reduce_last_dim: if the last dim gets involved in reduction.
*/ */
template <typename T, int NX, int NY, int BlockSize, class ReduceFunctor, template <typename T,
int NX,
int NY,
int BlockSize,
class ReduceFunctor,
details::ReduceMode Mode> details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out, const T* in, __device__ __forceinline__ void Reduce(T* out,
const T* in,
ReduceFunctor reducer, ReduceFunctor reducer,
bool reduce_last_dim) { bool reduce_last_dim) {
if (Mode == kGlobalMode) { if (Mode == kGlobalMode) {
...@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, ...@@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in,
} }
} }
} // namespace kernel_primitives } // namespace kps
} // namespace operators } // namespace pten
} // namespace paddle
...@@ -22,9 +22,8 @@ ...@@ -22,9 +22,8 @@
#endif #endif
#include "paddle/pten/core/ddim.h" #include "paddle/pten/core/ddim.h"
namespace paddle { namespace pten {
namespace operators { namespace kps {
namespace kernel_primitives {
namespace details { namespace details {
#define INT_BITS 32 #define INT_BITS 32
...@@ -103,11 +102,12 @@ struct BroadcastConfig { ...@@ -103,11 +102,12 @@ struct BroadcastConfig {
strides_in.resize(dim_size, 1); strides_in.resize(dim_size, 1);
for (int i = 0; i < dim_size; ++i) { for (int i = 0; i < dim_size; ++i) {
strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i]; strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i];
strides_in[i] = strides_in[i] = (i != 0 && strides_in[i] != 0)
(i != 0 && strides_in[i] != 0) ? std::accumulate(in_dims.begin(),
? std::accumulate(in_dims.begin(), in_dims.begin() + i, 1, in_dims.begin() + i,
std::multiplies<int64_t>()) 1,
: strides_in[i]; std::multiplies<int64_t>())
: strides_in[i];
} }
memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t));
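For reference, the reformatted stride computation above zeroes the stride of every broadcast (size-1) input dimension and otherwise uses the product of the preceding input dimensions. A standalone host-side sketch of the same logic on a concrete shape (the helper name is illustrative, not part of this commit):

#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Mirrors the strides_in loop in BroadcastConfig. For in_dims = {4, 1, 3}:
// dim 0 -> 1, dim 1 -> 0 (broadcast), dim 2 -> 4 (= 4 * 1).
std::vector<int64_t> BroadcastStrides(const std::vector<int64_t>& in_dims) {
  std::vector<int64_t> strides(in_dims.size(), 1);
  for (size_t i = 0; i < in_dims.size(); ++i) {
    strides[i] = in_dims[i] == 1 ? 0 : strides[i];
    strides[i] = (i != 0 && strides[i] != 0)
                     ? std::accumulate(in_dims.begin(), in_dims.begin() + i,
                                       int64_t{1}, std::multiplies<int64_t>())
                     : strides[i];
  }
  return strides;  // {1, 0, 4} for {4, 1, 3}
}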
...@@ -144,11 +144,18 @@ struct BroadcastConfig { ...@@ -144,11 +144,18 @@ struct BroadcastConfig {
* stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, __device__ __forceinline__ void ReadData(Ty* dst,
int size_nx, int size_ny, const Tx* __restrict__ src,
int stride_nx, int stride_ny) { int size_nx,
int size_ny,
int stride_nx,
int stride_ny) {
int thread_offset = threadIdx.x; int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset; int left_size_nx = size_nx - thread_offset;
...@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { ...@@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously. * size: The current block needs to load size data continuously.
*/ */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false> template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, __device__ __forceinline__ void ReadData(T* dst,
const T* __restrict__ src,
int num) { int num) {
if (IsBoundary) { // blockDim.x * NX > num if (IsBoundary) { // blockDim.x * NX > num
int thread_offset = threadIdx.x * NX; int thread_offset = threadIdx.x * NX;
...@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, ...@@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src,
* stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename T, int NX, int NY, int BlockSize, int Rank, template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc( __device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset, T* dst,
details::BroadcastConfig<Rank> config, int total_num_output, int stride_nx, const T* __restrict__ src,
uint32_t block_offset,
details::BroadcastConfig<Rank> config,
int total_num_output,
int stride_nx,
int stride_ny) { int stride_ny) {
uint32_t thread_offset = block_offset + threadIdx.x; uint32_t thread_offset = block_offset + threadIdx.x;
uint32_t index_src = 0; uint32_t index_src = 0;
...@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc( ...@@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc(
* reduce_last_dim: Used to indicate whether the dimension of reduce contains * reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension. * the lowest dimension.
*/ */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, int Rank, template <typename Tx,
typename IndexCal, typename Functor, bool IsBoundary = false> typename Ty,
__device__ __forceinline__ void ReadDataReduce( int NX,
Ty* dst, const Tx* __restrict__ src, int block_offset, int NY,
const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, int BlockSize,
int stride_ny, Functor func, bool reduce_last_dim) { int Rank,
typename IndexCal,
typename Functor,
bool IsBoundary = false>
__device__ __forceinline__ void ReadDataReduce(Ty* dst,
const Tx* __restrict__ src,
int block_offset,
const IndexCal& index_cal,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny,
Functor func,
bool reduce_last_dim) {
int thread_offset = 0; int thread_offset = 0;
int left_idx = 0; int left_idx = 0;
if (reduce_last_dim) { if (reduce_last_dim) {
...@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce( ...@@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously. * size: The current block needs to load size elements continuously.
*/ */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false> template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, __device__ __forceinline__ void WriteData(T* dst,
T* __restrict__ src,
int num) { int num) {
if (IsBoundary) { if (IsBoundary) {
int thread_offset = threadIdx.x * NX; int thread_offset = threadIdx.x * NX;
...@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, ...@@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src,
* stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, __device__ __forceinline__ void WriteData(Ty* dst,
int size_nx, int size_ny, const Tx* __restrict__ src,
int stride_nx, int stride_ny) { int size_nx,
int size_ny,
int stride_nx,
int stride_ny) {
int thread_offset = threadIdx.x; int thread_offset = threadIdx.x;
int left_size_nx = size_nx - thread_offset; int left_size_nx = size_nx - thread_offset;
...@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) { ...@@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data. * coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output. * total_num_output: Total number of original output.
*/ */
template <typename T, int NX, int NY, int BlockSize, int Rank, template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc( __device__ __forceinline__ void ReadDataBc(
T* dst, const T* __restrict__ src, uint32_t block_offset, T* dst,
details::BroadcastConfig<Rank> config, int total_num_output) { const T* __restrict__ src,
uint32_t block_offset,
details::BroadcastConfig<Rank> config,
int total_num_output) {
uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t thread_offset = block_offset + threadIdx.x * NX;
uint32_t index_src = 0; uint32_t index_src = 0;
...@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc( ...@@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc(
} }
} }
} // namespace kernel_primitives } // namespace kps
} // namespace operators } // namespace pten
} // namespace paddle
...@@ -17,9 +17,8 @@ ...@@ -17,9 +17,8 @@
#include "xpu/kernel/debug.h" #include "xpu/kernel/debug.h"
#include "xpu/kernel/math.h" #include "xpu/kernel/math.h"
namespace paddle { namespace pten {
namespace operators { namespace kps {
namespace kernel_primitives {
namespace details { namespace details {
template <typename T, int VecSize> template <typename T, int VecSize>
...@@ -105,10 +104,17 @@ struct BroadcastConfig { ...@@ -105,10 +104,17 @@ struct BroadcastConfig {
* stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src, __device__ __inline__ void ReadData(Ty* dst,
int size_nx, int size_ny, int stride_nx, const Tx _global_ptr_* src,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny) { int stride_ny) {
int thread_offset = core_id(); int thread_offset = core_id();
int left_size_nx = size_nx - thread_offset; int left_size_nx = size_nx - thread_offset;
...@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) { ...@@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) {
* size: The current block needs to load size data continuously. * size: The current block needs to load size data continuously.
*/ */
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false> template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, __device__ __inline__ void ReadData(T* dst,
const T _global_ptr_* src,
int num) { int num) {
int thread_offset = core_id() * NX; int thread_offset = core_id() * NX;
__local__ T in_temp[1]; __local__ T in_temp[1];
...@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, ...@@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src,
* stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename T, int NX, int NY, int BlockSize, int Rank, template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, __device__ __inline__ void ReadDataBc(T* dst,
const T _global_ptr_* src,
uint32_t block_offset, uint32_t block_offset,
details::BroadcastConfig<Rank> config, details::BroadcastConfig<Rank> config,
int total_num_output, int stride_nx, int total_num_output,
int stride_nx,
int stride_ny) { int stride_ny) {
uint32_t thread_offset = block_offset + core_id(); uint32_t thread_offset = block_offset + core_id();
uint32_t index_src = 0; uint32_t index_src = 0;
...@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, ...@@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
* reduce_last_dim: Used to indicate whether the dimension of reduce contains * reduce_last_dim: Used to indicate whether the dimension of reduce contains
* the lowest dimension. * the lowest dimension.
*/ */
template <typename T, int NX, int NY, int BlockSize, int Rank, template <typename T,
typename IndexCal, bool IsBoundary = false> int NX,
__device__ __inline__ void ReadDataReduce(T* dst, const T _global_ptr_* src, int NY,
int BlockSize,
int Rank,
typename IndexCal,
bool IsBoundary = false>
__device__ __inline__ void ReadDataReduce(T* dst,
const T _global_ptr_* src,
int block_offset, int block_offset,
const IndexCal& index_cal, const IndexCal& index_cal,
int size_nx, int size_ny, int size_nx,
int stride_nx, int stride_ny, int size_ny,
int stride_nx,
int stride_ny,
bool reduce_last_dim) { bool reduce_last_dim) {
  __local__ T in_temp[1]; __local__ T in_temp[1];
int thread_offset = 0; int thread_offset = 0;
...@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { ...@@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim. * stride_ny: Each read one element stride stride_ny elements in the first dim.
*/ */
template <typename Tx, typename Ty, int NX, int NY, int BlockSize, template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, __device__ __inline__ void WriteData(Ty _global_ptr_* dst,
int size_nx, int size_ny, int stride_nx, const Tx* src,
int size_nx,
int size_ny,
int stride_nx,
int stride_ny) { int stride_ny) {
int thread_offset = core_id(); int thread_offset = core_id();
int left_size_nx = size_nx - thread_offset; int left_size_nx = size_nx - thread_offset;
...@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, ...@@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src,
} }
} }
in_temp[0] = static_cast<Ty>(src[idx + idy * NX]); in_temp[0] = static_cast<Ty>(src[idx + idy * NX]);
LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny, LM2GM(in_temp,
dst + thread_offset + idx * stride_nx + idy * stride_ny,
sizeof(Ty)); sizeof(Ty));
} }
} }
...@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { ...@@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data. * coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output. * total_num_output: Total number of original output.
*/ */
template <typename T, int NX, int NY, int BlockSize, int Rank, template <typename T,
int NX,
int NY,
int BlockSize,
int Rank,
bool IsBoundary = false> bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, __device__ __inline__ void ReadDataBc(T* dst,
const T _global_ptr_* src,
uint32_t block_offset, uint32_t block_offset,
details::BroadcastConfig<Rank> config, details::BroadcastConfig<Rank> config,
int total_num_output) { int total_num_output) {
...@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, ...@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
} }
} }
} // namespace kernel_primitives } // namespace kps
} // namespace operators } // namespace pten
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/pten/common/float16.h"
#include "paddle/pten/core/enforce.h"
#include "paddle/pten/kernels/funcs/eigen/extensions.h"
namespace pten {
namespace kps {
namespace details {
static __device__ __forceinline__ pten::dtype::float16 Exp(
pten::dtype::float16 x) {
return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float Exp(float x) { return expf(x); }
static __device__ __forceinline__ double Exp(double x) { return exp(x); }
static __device__ __forceinline__ pten::dtype::float16 Log(
pten::dtype::float16 x) {
return ::Eigen::numext::log(x);
}
static __device__ __forceinline__ float Log(float x) { return logf(x); }
static __device__ __forceinline__ double Log(double x) { return log(x); }
} // namespace details
/******************************** Unary Functor *******************************/
/**
* @brief Default unary exp functor
*/
template <typename Tx, typename Ty = Tx>
struct ExpFunctor {
HOSTDEVICE inline ExpFunctor() {}
HOSTDEVICE explicit inline ExpFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(details::Exp(x));
}
};
/**
* @brief Default unary identity functor
*/
template <typename Tx, typename Ty = Tx>
struct IdentityFunctor {
HOSTDEVICE inline IdentityFunctor() {}
HOSTDEVICE explicit inline IdentityFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x);
}
};
/**
* @brief Default unary div functor. Divide by a constant
*/
template <typename Tx, typename Ty = Tx>
struct DivideFunctor {
private:
using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
public:
HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
}
private:
MPType n_inv;
};
/**
* @brief Default inverse functor
*/
template <typename Tx, typename Ty = Tx>
struct InverseFunctor {
HOSTDEVICE inline InverseFunctor() {}
HOSTDEVICE explicit inline InverseFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(-x);
}
};
/**
* @brief Default unary square functor
*/
template <typename Tx, typename Ty = Tx>
struct SquareFunctor {
HOSTDEVICE inline SquareFunctor() {}
HOSTDEVICE explicit inline SquareFunctor(int n) {}
HOSTDEVICE inline Ty operator()(const Tx x) const {
return static_cast<Ty>(x) * static_cast<Ty>(x);
}
};
/****************************** Binary Functor ********************************/
/**
* @brief Default binary min functor
*/
template <typename T>
struct MinFunctor {
inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b < a) ? b : a;
}
};
/**
* @brief Default binary max functor
*/
template <typename T>
struct MaxFunctor {
inline T initial() {
return static_cast<T>(std::numeric_limits<T>::lowest());
}
__device__ __forceinline__ T operator()(const T a, const T b) const {
return (b > a) ? b : a;
}
};
/**
* @brief Default binary add functor
*/
template <typename T>
struct AddFunctor {
inline T initial() { return static_cast<T>(0.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b + a;
}
};
/**
 * @brief Default binary mul functor
*/
template <typename T>
struct MulFunctor {
inline T initial() { return static_cast<T>(1.0f); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b * a;
}
};
/**
* @brief Default binary logic or functor
*/
template <typename T>
struct LogicalOrFunctor {
inline T initial() { return static_cast<T>(false); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b || a;
}
};
/**
* @brief Default binary logic and functor
*/
template <typename T>
struct LogicalAndFunctor {
inline T initial() { return static_cast<T>(true); }
__device__ __forceinline__ T operator()(const T a, const T b) const {
return b && a;
}
};
/**
* @brief Default binary sub functor
*/
template <typename T>
struct SubFunctor {
inline T initial() { return static_cast<T>(0.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
};
/**
* @brief Default binary div functor
*/
template <typename T, typename Enable = void>
struct DivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
};
template <typename T>
struct DivFunctor<T,
typename std::enable_if<std::is_integral<T>::value>::type> {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
// For int32/int64, need to check whether the divisor is zero.
PADDLE_ENFORCE_NE(b,
0,
pten::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return a / b;
}
};
/**
* @brief Default binary floor divide functor
*/
template <typename T>
struct FloorDivFunctor {
inline T initial() { return static_cast<T>(1.0f); }
inline HOSTDEVICE T operator()(const T a, const T b) const {
PADDLE_ENFORCE_NE(b,
0,
pten::errors::InvalidArgument(
"Integer division by zero encountered "
"in (floor) divide. Please check the input value."));
return static_cast<T>(std::trunc(a / b));
}
};
} // namespace kps
} // namespace pten
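The functors above are intended to be passed to the compute primitives as the OpFunc/ReduceFunctor parameters. A minimal sketch of one such combination, assuming the new include path from this commit and a launch grid sized so every block has data to process (the kernel itself is illustrative, not part of the diff):

#include "paddle/pten/kernels/primitive/kernel_primitives.h"

namespace kps = pten::kps;

// Squares `num` elements of `x` into `y`; each thread handles VecSize values.
// ReadData/WriteData index the calling thread internally via threadIdx.x, so
// only the block offset is passed in.
template <typename T, int VecSize>
__global__ void SquareKernel(const T* x, T* y, int num) {
  int block_offset = blockIdx.x * blockDim.x * VecSize;
  if (block_offset >= num) return;
  T in[VecSize];
  T out[VecSize];
  kps::ReadData<T, VecSize, 1, 1, true>(in, x + block_offset, num - block_offset);
  kps::ElementwiseUnary<T, T, VecSize, 1, 1>(out, in, kps::SquareFunctor<T>());
  kps::WriteData<T, VecSize, 1, 1, true>(y + block_offset, out, num - block_offset);
}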
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. // Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
// //
// Licensed under the Apache License, Version 2.0 (the "License"); // Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License. // you may not use this file except in compliance with the License.
...@@ -14,9 +14,8 @@ ...@@ -14,9 +14,8 @@
#pragma once #pragma once
namespace paddle { namespace pten {
namespace operators { namespace kps {
namespace kernel_primitives {
#ifdef PADDLE_WITH_XPU2 #ifdef PADDLE_WITH_XPU2
struct dim3 { struct dim3 {
...@@ -43,8 +42,12 @@ struct DimConfig { ...@@ -43,8 +42,12 @@ struct DimConfig {
int rem_y; int rem_y;
int rem_z; int rem_z;
HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z, HOSTDEVICE explicit inline DimConfig(int split_x,
int size_x, int size_y, int size_z) { int split_y,
int split_z,
int size_x,
int size_y,
int size_z) {
split_num_x = split_x; split_num_x = split_x;
split_num_y = split_y; split_num_y = split_y;
split_num_z = split_z; split_num_z = split_z;
...@@ -60,6 +63,5 @@ struct DimConfig { ...@@ -60,6 +63,5 @@ struct DimConfig {
} }
}; };
} // namespace kernel_primitives } // namespace kps
} // namespace operators } // namespace pten
} // namespace paddle
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/kernels/primitive/helper_primitives.h"
#ifdef PADDLE_WITH_XPU2
#include "paddle/pten/backends/xpu/xpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h"
#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h"
#define KPStream XPUStream
#define KPDevice pten::XPUContext
#define _ptr_ _global_ptr_
#define __forceinline__ __inline__
#define __restrict__
#define THREAD_ID_X core_id()
#define THREAD_ID_Y 0
#define THREAD_ID_Z 0
#define BLOCK_NUM_X core_num()
#define BLOCK_NUM_Y 0
#define BLOCK_NUM_Z 0
#define BLOCK_ID_X cluster_id()
#define BLOCK_ID_Y 0
#define BLOCK_ID_Z 0
#define GRID_NUM_X cluster_num()
#define GRID_NUM_Y 0
#define GRID_NUM_Z 0
#else
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/kernels/primitive/compute_primitives.h"
#include "paddle/pten/kernels/primitive/datamover_primitives.h"
#include "paddle/pten/kernels/primitive/functor_primitives.h"
#define KPStream gpuStream_t
#define KPDevice pten::GPUContext
#define _ptr_
#define THREAD_ID_X threadIdx.x
#define THREAD_ID_Y threadIdx.y
#define THREAD_ID_Z threadIdx.z
#define BLOCK_NUM_X blockDim.x
#define BLOCK_NUM_Y blockDim.y
#define BLOCK_NUM_Z blockDim.z
#define BLOCK_ID_X blockIdx.x
#define BLOCK_ID_Y blockIdx.y
#define BLOCK_ID_Z blockIdx.z
#define GRID_NUM_X gridDim.x
#define GRID_NUM_Y gridDim.y
#define GRID_NUM_Z gridDim.z
#endif
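Because this header defines KPStream, KPDevice, _ptr_ and the THREAD_ID_X/BLOCK_NUM_X/BLOCK_ID_X/GRID_NUM_X family per backend, a kernel body written only against these macros can target both the CUDA/HIP and XPU2 builds. A hedged sketch (the kernel is illustrative, not part of this commit, and the __global__ qualifier is assumed to be accepted by both toolchains):

#include "paddle/pten/kernels/primitive/kernel_primitives.h"

// Grid-stride fill written only against the portability macros; on CUDA/HIP
// they expand to threadIdx/blockDim/blockIdx/gridDim, on XPU2 to
// core_id()/core_num()/cluster_id()/cluster_num(), and _ptr_ adds the XPU
// global-memory qualifier.
template <typename T>
__global__ void FillKernel(T _ptr_* out, T value, int num) {
  int offset = BLOCK_ID_X * BLOCK_NUM_X + THREAD_ID_X;
  int stride = GRID_NUM_X * BLOCK_NUM_X;
  for (int i = offset; i < num; i += stride) {
    out[i] = value;
  }
}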