diff --git a/paddle/fluid/operators/kernel_primitives/functor_primitives.h b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
index 03610d4589058e074f64940741df34bd8f66e379..15bb01a865d402f8da3fb7ed4178548c8da46b40 100644
--- a/paddle/fluid/operators/kernel_primitives/functor_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/functor_primitives.h
@@ -13,241 +13,10 @@
 // limitations under the License.
 
 #pragma once
-
-#include "paddle/fluid/operators/amp/fp16_type_traits.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/pten/kernels/funcs/eigen/extensions.h"
+#include "paddle/pten/kernels/primitive/functor_primitives.h"
 
 namespace paddle {
 namespace operators {
-namespace kernel_primitives {
-namespace details {
-
-static __device__ __forceinline__ platform::float16 Exp(platform::float16 x) {
-  return ::Eigen::numext::exp(x);
-}
-
-static __device__ __forceinline__ float Exp(float x) { return expf(x); }
-
-static __device__ __forceinline__ double Exp(double x) { return exp(x); }
-
-static __device__ __forceinline__ platform::float16 Log(platform::float16 x) {
-  return ::Eigen::numext::log(x);
-}
-
-static __device__ __forceinline__ float Log(float x) { return logf(x); }
-
-static __device__ __forceinline__ double Log(double x) { return log(x); }
-
-}  // namespace details
-
-/******************************** Unary Functor *******************************/
-
-/**
- * @brief Default unary exp functor
- */
-template <typename Tx, typename Ty = Tx>
-struct ExpFunctor {
-  HOSTDEVICE inline ExpFunctor() {}
-
-  HOSTDEVICE explicit inline ExpFunctor(int n) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(details::Exp(x));
-  }
-};
-
-/**
- * @brief Default unary identity functor
- */
-template <typename Tx, typename Ty = Tx>
-struct IdentityFunctor {
-  HOSTDEVICE inline IdentityFunctor() {}
-
-  HOSTDEVICE explicit inline IdentityFunctor(int n) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(x);
-  }
-};
-
-/**
- * @brief Default unary div functor. Divide by a constant
- */
-template <typename Tx, typename Ty = Tx>
-struct DivideFunctor {
- private:
-  using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
-
- public:
-  HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
-
-  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
-  }
-
- private:
-  MPType n_inv;
-};
-
-/**
- * @brief Default inverse functor
- */
-template <typename Tx, typename Ty = Tx>
-struct InverseFunctor {
-  HOSTDEVICE inline InverseFunctor() {}
-
-  HOSTDEVICE explicit inline InverseFunctor(int n) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(-x);
-  }
-};
-
-/**
- * @brief Default unary square functor
- */
-template <typename Tx, typename Ty = Tx>
-struct SquareFunctor {
-  HOSTDEVICE inline SquareFunctor() {}
-
-  HOSTDEVICE explicit inline SquareFunctor(int n) {}
-
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(x) * static_cast<Ty>(x);
-  }
-};
-
-/****************************** Binary Functor ********************************/
-
-/**
- * @brief Default binary min functor
- */
-template <typename T>
-struct MinFunctor {
-  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
-
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return (b < a) ? b : a;
-  }
-};
-
-/**
- * @brief Default binary max functor
- */
-template <typename T>
-struct MaxFunctor {
-  inline T initial() {
-    return static_cast<T>(std::numeric_limits<T>::lowest());
-  }
-
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return (b > a) ? b : a;
-  }
-};
-
-/**
- * @brief Default binary add functor
- */
-template <typename T>
-struct AddFunctor {
-  inline T initial() { return static_cast<T>(0.0f); }
-
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b + a;
-  }
-};
-
-/**
- * @brief Default binary add functor
- */
-template <typename T>
-struct MulFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
-
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b * a;
-  }
-};
-
-/**
- * @brief Default binary logic or functor
- */
-template <typename T>
-struct LogicalOrFunctor {
-  inline T initial() { return static_cast<T>(false); }
-
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b || a;
-  }
-};
-
-/**
- * @brief Default binary logic and functor
- */
-template <typename T>
-struct LogicalAndFunctor {
-  inline T initial() { return static_cast<T>(true); }
-
-  __device__ __forceinline__ T operator()(const T a, const T b) const {
-    return b && a;
-  }
-};
-
-/**
- * @brief Default binary sub functor
- */
-template <typename T>
-struct SubFunctor {
-  inline T initial() { return static_cast<T>(0.0f); }
-
-  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
-};
-
-/**
- * @brief Default binary div functor
- */
-template <typename T, typename Enable = void>
-struct DivFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
-
-  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
-};
-
-template <typename T>
-struct DivFunctor<T,
-                  typename std::enable_if<std::is_integral<T>::value>::type> {
-  inline T initial() { return static_cast<T>(1.0f); }
-
-  inline HOSTDEVICE T operator()(const T a, const T b) const {
-    // For int32/int64, need to check whether the divison is zero.
-    PADDLE_ENFORCE_NE(b, 0,
-                      platform::errors::InvalidArgument(
-                          "Integer division by zero encountered "
-                          "in (floor) divide. Please check the input value."));
-    return a / b;
-  }
-};
-
-/**
- * @brief Default binary floor divide functor
- */
-template <typename T>
-struct FloorDivFunctor {
-  inline T initial() { return static_cast<T>(1.0f); }
-
-  inline HOSTDEVICE T operator()(const T a, const T b) const {
-    PADDLE_ENFORCE_NE(b, 0,
-                      platform::errors::InvalidArgument(
-                          "Integer division by zero encountered "
-                          "in (floor) divide. Please check the input value."));
-    return static_cast<T>(std::trunc(a / b));
-  }
-};
-
-}  // namespace kernel_primitives
+namespace kernel_primitives = pten::kps;
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
index 558f8c81c66428ca0561806b8021f09261e32e3b..4ec3741bc91bb58a183ee9a2ff106461c6d71d05 100644
--- a/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
+++ b/paddle/fluid/operators/kernel_primitives/kernel_primitives.h
@@ -13,61 +13,10 @@
 // limitations under the License.
#pragma once -#include "paddle/fluid/operators/kernel_primitives/helper_primitives.h" -#ifdef PADDLE_WITH_XPU2 -#include "paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h" -#include "paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h" -#include "paddle/fluid/operators/kernel_primitives/functor_primitives_xpu2.h" - -#define KPStream XPUStream -#define KPDevice paddle::platform::XPUDeviceContext -#define _ptr_ _global_ptr_ -#define __forceinline__ __inline__ -#define __restrict__ - -#define THREAD_ID_X core_id() -#define THREAD_ID_Y 0 -#define THREAD_ID_Z 0 - -#define BLOCK_NUM_X core_num() -#define BLOCK_NUM_Y 0 -#define BLOCK_NUM_Z 0 - -#define BLOCK_ID_X cluster_id() -#define BLOCK_ID_Y 0 -#define BLOCK_ID_Z 0 - -#define GRID_NUM_X cluster_num() -#define GRID_NUM_Y 0 -#define GRID_NUM_Z 0 -#else -#include "paddle/fluid/operators/kernel_primitives/compute_primitives.h" -#include "paddle/fluid/operators/kernel_primitives/datamover_primitives.h" -#include "paddle/fluid/operators/kernel_primitives/functor_primitives.h" - -#define KPStream gpuStream_t -#define KPDevice paddle::platform::CUDADeviceContext -#define _ptr_ - -#define THREAD_ID_X threadIdx.x -#define THREAD_ID_Y threadIdx.y -#define THREAD_ID_Z threadIdx.z - -#define BLOCK_NUM_X blockDim.x -#define BLOCK_NUM_Y blockDim.y -#define BLOCK_NUM_Z blockDim.z - -#define BLOCK_ID_X blockIdx.x -#define BLOCK_ID_Y blockIdx.y -#define BLOCK_ID_Z blockIdx.z - -#define GRID_NUM_X gridDim.x -#define GRID_NUM_Y gridDim.y -#define GRID_NUM_Z gridDim.z -#endif +#include "paddle/pten/kernels/primitive/kernel_primitives.h" namespace paddle { namespace operators { -namespace kernel_primitives {} +namespace kernel_primitives = pten::kps; } } diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 206ad151c5a9b72f329e99a711b62022fffb9395..9ea27fd9c5b8d5f9b9a4d6fb0d6cb608d13f5984 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -22,12 +22,12 @@ limitations under the License. 
*/ #include "paddle/pten/kernels/empty_kernel.h" #if defined(__NVCC__) || defined(__HIPCC__) -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/aligned_vector.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/function_traits.h" +#include "paddle/pten/kernels/primitive/kernel_primitives.h" -namespace kps = paddle::operators::kernel_primitives; +namespace kps = pten::kps; #endif diff --git a/paddle/pten/kernels/gpu/reduce.h b/paddle/pten/kernels/gpu/reduce.h index 10badf00a1e246a36b0e0b37525ac9ffba028e92..d864c76ea197408e4d035c816a32d5bb5ccb71c1 100644 --- a/paddle/pten/kernels/gpu/reduce.h +++ b/paddle/pten/kernels/gpu/reduce.h @@ -34,13 +34,13 @@ namespace cub = hipcub; #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/amp/fp16_type_traits.h" -#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/fast_divmod.h" #include "paddle/fluid/string/string_helper.h" #include "paddle/pten/core/array.h" #include "paddle/pten/core/enforce.h" +#include "paddle/pten/kernels/primitive/kernel_primitives.h" #include "paddle/pten/api/ext/dispatch.h" #include "paddle/pten/backends/gpu/gpu_context.h" @@ -51,7 +51,7 @@ namespace cub = hipcub; #define REDUCE_SPLIT_BOUNDARY 512 #define REDUCE_VEC_SIZE 4 -namespace kps = paddle::operators::kernel_primitives; +namespace kps = pten::kps; namespace pten { namespace kernels { diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives.h b/paddle/pten/kernels/primitive/compute_primitives.h similarity index 87% rename from paddle/fluid/operators/kernel_primitives/compute_primitives.h rename to paddle/pten/kernels/primitive/compute_primitives.h index 2320b9e0b2fbf47610365155558b869bd5d77b38..ac812c9c9f3eb3d8d97ef595ca3d1bdff3177e41 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives.h +++ b/paddle/pten/kernels/primitive/compute_primitives.h @@ -22,11 +22,10 @@ #endif #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" -#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/common/float16.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { #ifdef __HIPCC__ @@ -48,7 +47,7 @@ class MPTypeTrait { }; template <> -class MPTypeTrait { +class MPTypeTrait { public: using Type = float; }; @@ -158,9 +157,14 @@ __device__ __forceinline__ T BlockYReduce(T val, ReduceOp reducer) { * in: The register pointer of in, the size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, +__device__ __forceinline__ void ElementwiseUnary(OutT* out, + const InT* in, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; idx++) { @@ -193,9 +197,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). 
*/ -template -__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, +__device__ __forceinline__ void ElementwiseBinary(OutT* out, + const InT* in1, const InT* in2, OpFunc compute) { #pragma unroll @@ -231,12 +240,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in3: The register pointer of third input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, - const InT* in2, - const InT* in3, - OpFunc compute) { +__device__ __forceinline__ void ElementwiseTernary( + OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); @@ -268,9 +279,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, * ins: A pointers of array consisting of multiple inputs. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], +__device__ __forceinline__ void ElementwiseAny(OutT* out, + InT (*ins)[NX * NY], OpFunc compute) { InT args[Arity]; #pragma unroll @@ -309,10 +326,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, - const InT* in2, OpFunc compute) { +__device__ __forceinline__ void CycleBinary(OutT* out, + const InT* in1, + const InT* in2, + OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX; idx++) { #pragma unroll @@ -350,9 +373,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * reducer: Compute function which was declared like ReduceFunctor(). * reduce_last_dim: if the last dim gets involved in reduction. */ -template -__device__ __forceinline__ void Reduce(T* out, const T* in, +__device__ __forceinline__ void Reduce(T* out, + const T* in, ReduceFunctor reducer, bool reduce_last_dim) { int block_index = blockDim.y; @@ -386,6 +414,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h b/paddle/pten/kernels/primitive/compute_primitives_xpu2.h similarity index 85% rename from paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h rename to paddle/pten/kernels/primitive/compute_primitives_xpu2.h index 32355915809161ae1a4dcc275eba8a28966fb92e..d7282c089fc9cc332abc132941188c7804e68f80 100644 --- a/paddle/fluid/operators/kernel_primitives/compute_primitives_xpu2.h +++ b/paddle/pten/kernels/primitive/compute_primitives_xpu2.h @@ -13,13 +13,13 @@ // limitations under the License. 
#pragma once +#include "paddle/pten/common/float16.h" #include "xpu/kernel/cluster_header.h" #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { // kGlobalMode: block reduce, each block gets an output; @@ -33,7 +33,7 @@ class MPTypeTrait { }; template <> -class MPTypeTrait { +class MPTypeTrait { public: using Type = float; }; @@ -102,9 +102,14 @@ __device__ void BlockXReduce(T* data, OpFunc reducer) { * in: The register pointer of in, the size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, +__device__ __forceinline__ void ElementwiseUnary(OutT* out, + const InT* in, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; idx++) { @@ -137,9 +142,14 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out, const InT* in, * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, +__device__ __forceinline__ void ElementwiseBinary(OutT* out, + const InT* in1, const InT* in2, OpFunc compute) { #pragma unroll @@ -175,12 +185,14 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out, const InT* in1, * in3: The register pointer of third input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, - const InT* in2, - const InT* in3, - OpFunc compute) { +__device__ __forceinline__ void ElementwiseTernary( + OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX * NY; ++idx) { out[idx] = static_cast(compute(in1[idx], in2[idx], in3[idx])); @@ -212,9 +224,15 @@ __device__ __forceinline__ void ElementwiseTernary(OutT* out, const InT* in1, * ins: A pointers of array consisting of multiple inputs. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], +__device__ __forceinline__ void ElementwiseAny(OutT* out, + InT (*ins)[NX * NY], OpFunc compute) { __local__ InT args[Arity]; #pragma unroll @@ -253,10 +271,16 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out, InT (*ins)[NX * NY], * in2: The register pointer of second input, size is NX * NY. * compute: Compute function which was declared like OpFunc(). */ -template -__device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, - const InT* in2, OpFunc compute) { +__device__ __forceinline__ void CycleBinary(OutT* out, + const InT* in1, + const InT* in2, + OpFunc compute) { #pragma unroll for (int idx = 0; idx < NX; idx++) { #pragma unroll @@ -294,9 +318,14 @@ __device__ __forceinline__ void CycleBinary(OutT* out, const InT* in1, * reducer: Compute function which was declared like ReduceFunctor(). * reduce_last_dim: if the last dim gets involved in reduction. 
*/ -template -__device__ __forceinline__ void Reduce(T* out, const T* in, +__device__ __forceinline__ void Reduce(T* out, + const T* in, ReduceFunctor reducer, bool reduce_last_dim) { if (Mode == kGlobalMode) { @@ -319,6 +348,5 @@ __device__ __forceinline__ void Reduce(T* out, const T* in, } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h b/paddle/pten/kernels/primitive/datamover_primitives.h similarity index 87% rename from paddle/fluid/operators/kernel_primitives/datamover_primitives.h rename to paddle/pten/kernels/primitive/datamover_primitives.h index 45697073cbf85b436a4db33b0a2d49d8b805fd63..2a8006f3ecbc427c3e0cf36a08457c2ecd5f84df 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives.h +++ b/paddle/pten/kernels/primitive/datamover_primitives.h @@ -22,9 +22,8 @@ #endif #include "paddle/pten/core/ddim.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { #define INT_BITS 32 @@ -103,11 +102,12 @@ struct BroadcastConfig { strides_in.resize(dim_size, 1); for (int i = 0; i < dim_size; ++i) { strides_in[i] = in_dims[i] == 1 ? 0 : strides_in[i]; - strides_in[i] = - (i != 0 && strides_in[i] != 0) - ? std::accumulate(in_dims.begin(), in_dims.begin() + i, 1, - std::multiplies()) - : strides_in[i]; + strides_in[i] = (i != 0 && strides_in[i] != 0) + ? std::accumulate(in_dims.begin(), + in_dims.begin() + i, + 1, + std::multiplies()) + : strides_in[i]; } memcpy(strides, strides_in.data(), kDims * sizeof(uint32_t)); @@ -144,11 +144,18 @@ struct BroadcastConfig { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __forceinline__ void ReadData(Ty* dst, const Tx* __restrict__ src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __forceinline__ void ReadData(Ty* dst, + const Tx* __restrict__ src, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny) { int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; @@ -244,7 +251,8 @@ __device__ __forceinline__ void Init(T* dst, T init_data) { * size: The current block needs to load size data continuously. */ template -__device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, +__device__ __forceinline__ void ReadData(T* dst, + const T* __restrict__ src, int num) { if (IsBoundary) { // blockDim.x * NX > num int thread_offset = threadIdx.x * NX; @@ -299,11 +307,19 @@ __device__ __forceinline__ void ReadData(T* dst, const T* __restrict__ src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template __device__ __forceinline__ void ReadDataBc( - T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output, int stride_nx, + T* dst, + const T* __restrict__ src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output, + int stride_nx, int stride_ny) { uint32_t thread_offset = block_offset + threadIdx.x; uint32_t index_src = 0; @@ -361,12 +377,25 @@ __device__ __forceinline__ void ReadDataBc( * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. 
*/ -template -__device__ __forceinline__ void ReadDataReduce( - Ty* dst, const Tx* __restrict__ src, int block_offset, - const IndexCal& index_cal, int size_nx, int size_ny, int stride_nx, - int stride_ny, Functor func, bool reduce_last_dim) { +template +__device__ __forceinline__ void ReadDataReduce(Ty* dst, + const Tx* __restrict__ src, + int block_offset, + const IndexCal& index_cal, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny, + Functor func, + bool reduce_last_dim) { int thread_offset = 0; int left_idx = 0; if (reduce_last_dim) { @@ -430,7 +459,8 @@ __device__ __forceinline__ void ReadDataReduce( * size: The current block needs to load size elements continuously. */ template -__device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, +__device__ __forceinline__ void WriteData(T* dst, + T* __restrict__ src, int num) { if (IsBoundary) { int thread_offset = threadIdx.x * NX; @@ -483,11 +513,18 @@ __device__ __forceinline__ void WriteData(T* dst, T* __restrict__ src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __forceinline__ void WriteData(Ty* dst, const Tx* __restrict__ src, - int size_nx, int size_ny, - int stride_nx, int stride_ny) { +__device__ __forceinline__ void WriteData(Ty* dst, + const Tx* __restrict__ src, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny) { int thread_offset = threadIdx.x; int left_size_nx = size_nx - thread_offset; @@ -589,11 +626,18 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. */ -template __device__ __forceinline__ void ReadDataBc( - T* dst, const T* __restrict__ src, uint32_t block_offset, - details::BroadcastConfig config, int total_num_output) { + T* dst, + const T* __restrict__ src, + uint32_t block_offset, + details::BroadcastConfig config, + int total_num_output) { uint32_t thread_offset = block_offset + threadIdx.x * NX; uint32_t index_src = 0; @@ -616,6 +660,5 @@ __device__ __forceinline__ void ReadDataBc( } } -} // namespace kernel_primitives -} // namespace operators -} // namespace paddle +} // namespace kps +} // namespace pten diff --git a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h b/paddle/pten/kernels/primitive/datamover_primitives_xpu2.h similarity index 90% rename from paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h rename to paddle/pten/kernels/primitive/datamover_primitives_xpu2.h index 333899535894e0939086817c9fd6caad992f807f..d6586368c804126f896ba476cc1679d54a4c6eb8 100644 --- a/paddle/fluid/operators/kernel_primitives/datamover_primitives_xpu2.h +++ b/paddle/pten/kernels/primitive/datamover_primitives_xpu2.h @@ -17,9 +17,8 @@ #include "xpu/kernel/debug.h" #include "xpu/kernel/math.h" -namespace paddle { -namespace operators { -namespace kernel_primitives { +namespace pten { +namespace kps { namespace details { template @@ -105,10 +104,17 @@ struct BroadcastConfig { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. 
*/ -template -__device__ __inline__ void ReadData(Ty* dst, const Tx _global_ptr_* src, - int size_nx, int size_ny, int stride_nx, +__device__ __inline__ void ReadData(Ty* dst, + const Tx _global_ptr_* src, + int size_nx, + int size_ny, + int stride_nx, int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; @@ -205,7 +211,8 @@ __device__ __inline__ void Init(T* dst, T init_data) { * size: The current block needs to load size data continuously. */ template -__device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadData(T* dst, + const T _global_ptr_* src, int num) { int thread_offset = core_id() * NX; __local__ T in_temp[1]; @@ -247,12 +254,18 @@ __device__ __inline__ void ReadData(T* dst, const T _global_ptr_* src, * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, +__device__ __inline__ void ReadDataBc(T* dst, + const T _global_ptr_* src, uint32_t block_offset, details::BroadcastConfig config, - int total_num_output, int stride_nx, + int total_num_output, + int stride_nx, int stride_ny) { uint32_t thread_offset = block_offset + core_id(); uint32_t index_src = 0; @@ -307,13 +320,21 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src, * reduce_last_dim: Used to indicate whether the dimension of reduce contains * the lowest dimension. */ -template -__device__ __inline__ void ReadDataReduce(T* dst, const T _global_ptr_* src, +template +__device__ __inline__ void ReadDataReduce(T* dst, + const T _global_ptr_* src, int block_offset, const IndexCal& index_cal, - int size_nx, int size_ny, - int stride_nx, int stride_ny, + int size_nx, + int size_ny, + int stride_nx, + int stride_ny, bool reduce_last_dim) { __local__ Tx in_temp[1]; int thread_offset = 0; @@ -423,10 +444,17 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) { * stride_nx: Each read one element stride stride_nx elements in the last dim. * stride_ny: Each read one element stride stride_ny elements in the first dim. */ -template -__device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, - int size_nx, int size_ny, int stride_nx, +__device__ __inline__ void WriteData(Ty _global_ptr_* dst, + const Tx* src, + int size_nx, + int size_ny, + int stride_nx, int stride_ny) { int thread_offset = core_id(); int left_size_nx = size_nx - thread_offset; @@ -483,7 +511,8 @@ __device__ __inline__ void WriteData(Ty _global_ptr_* dst, const Tx* src, } } in_temp[0] = static_cast(src[idx + idy * NX]); - LM2GM(in_temp, dst + thread_offset + idx * stride_nx + idy * stride_ny, + LM2GM(in_temp, + dst + thread_offset + idx * stride_nx + idy * stride_ny, sizeof(Ty)); } } @@ -537,9 +566,14 @@ __device__ __inline__ void Init(T* dst, T* init_data, int num) { * coordinate mapping relationship between output data and input data. * total_num_output: Total number of original output. 
 */
-template <typename T, int NX, int NY, int BlockSize, int Rank,
-          bool IsBoundary = false>
-__device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
+template <typename T,
+          int NX,
+          int NY,
+          int BlockSize,
+          int Rank,
+          bool IsBoundary = false>
+__device__ __inline__ void ReadDataBc(T* dst,
+                                      const T _global_ptr_* src,
                                       uint32_t block_offset,
                                       details::BroadcastConfig<Rank> config,
                                       int total_num_output) {
@@ -562,6 +596,5 @@ __device__ __inline__ void ReadDataBc(T* dst, const T _global_ptr_* src,
   }
 }
 
-}  // namespace kernel_primitives
-}  // namespace operators
-}  // namespace paddle
+}  // namespace kps
+}  // namespace pten
diff --git a/paddle/pten/kernels/primitive/functor_primitives.h b/paddle/pten/kernels/primitive/functor_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..8d62d622701342e058a57ff31d12410e78eb1306
--- /dev/null
+++ b/paddle/pten/kernels/primitive/functor_primitives.h
@@ -0,0 +1,255 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/operators/amp/fp16_type_traits.h"
+#include "paddle/pten/common/float16.h"
+#include "paddle/pten/core/enforce.h"
+#include "paddle/pten/kernels/funcs/eigen/extensions.h"
+
+namespace pten {
+namespace kps {
+namespace details {
+
+static __device__ __forceinline__ pten::dtype::float16 Exp(
+    pten::dtype::float16 x) {
+  return ::Eigen::numext::exp(x);
+}
+
+static __device__ __forceinline__ float Exp(float x) { return expf(x); }
+
+static __device__ __forceinline__ double Exp(double x) { return exp(x); }
+
+static __device__ __forceinline__ pten::dtype::float16 Log(
+    pten::dtype::float16 x) {
+  return ::Eigen::numext::log(x);
+}
+
+static __device__ __forceinline__ float Log(float x) { return logf(x); }
+
+static __device__ __forceinline__ double Log(double x) { return log(x); }
+
+}  // namespace details
+
+/******************************** Unary Functor *******************************/
+
+/**
+ * @brief Default unary exp functor
+ */
+template <typename Tx, typename Ty = Tx>
+struct ExpFunctor {
+  HOSTDEVICE inline ExpFunctor() {}
+
+  HOSTDEVICE explicit inline ExpFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(details::Exp(x));
+  }
+};
+
+/**
+ * @brief Default unary identity functor
+ */
+template <typename Tx, typename Ty = Tx>
+struct IdentityFunctor {
+  HOSTDEVICE inline IdentityFunctor() {}
+
+  HOSTDEVICE explicit inline IdentityFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(x);
+  }
+};
+
+/**
+ * @brief Default unary div functor. Divide by a constant
+ */
+template <typename Tx, typename Ty = Tx>
+struct DivideFunctor {
+ private:
+  using MPType = typename ::paddle::operators::details::MPTypeTrait<Tx>::Type;
+
+ public:
+  HOSTDEVICE inline DivideFunctor() { n_inv = static_cast<MPType>(1.0f); }
+
+  HOSTDEVICE explicit inline DivideFunctor(int n) : n_inv((MPType)(1.0 / n)) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(static_cast<MPType>(x) * n_inv);
+  }
+
+ private:
+  MPType n_inv;
+};
+
+/**
+ * @brief Default inverse functor
+ */
+template <typename Tx, typename Ty = Tx>
+struct InverseFunctor {
+  HOSTDEVICE inline InverseFunctor() {}
+
+  HOSTDEVICE explicit inline InverseFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(-x);
+  }
+};
+
+/**
+ * @brief Default unary square functor
+ */
+template <typename Tx, typename Ty = Tx>
+struct SquareFunctor {
+  HOSTDEVICE inline SquareFunctor() {}
+
+  HOSTDEVICE explicit inline SquareFunctor(int n) {}
+
+  HOSTDEVICE inline Ty operator()(const Tx x) const {
+    return static_cast<Ty>(x) * static_cast<Ty>(x);
+  }
+};
+
+/****************************** Binary Functor ********************************/
+
+/**
+ * @brief Default binary min functor
+ */
+template <typename T>
+struct MinFunctor {
+  inline T initial() { return static_cast<T>(std::numeric_limits<T>::max()); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return (b < a) ? b : a;
+  }
+};
+
+/**
+ * @brief Default binary max functor
+ */
+template <typename T>
+struct MaxFunctor {
+  inline T initial() {
+    return static_cast<T>(std::numeric_limits<T>::lowest());
+  }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return (b > a) ? b : a;
+  }
+};
+
+/**
+ * @brief Default binary add functor
+ */
+template <typename T>
+struct AddFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b + a;
+  }
+};
+
+/**
+ * @brief Default binary mul functor
+ */
+template <typename T>
+struct MulFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b * a;
+  }
+};
+
+/**
+ * @brief Default binary logic or functor
+ */
+template <typename T>
+struct LogicalOrFunctor {
+  inline T initial() { return static_cast<T>(false); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b || a;
+  }
+};
+
+/**
+ * @brief Default binary logic and functor
+ */
+template <typename T>
+struct LogicalAndFunctor {
+  inline T initial() { return static_cast<T>(true); }
+
+  __device__ __forceinline__ T operator()(const T a, const T b) const {
+    return b && a;
+  }
+};
+
+/**
+ * @brief Default binary sub functor
+ */
+template <typename T>
+struct SubFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a - b; }
+};
+
+/**
+ * @brief Default binary div functor
+ */
+template <typename T, typename Enable = void>
+struct DivFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
+};
+
+template <typename T>
+struct DivFunctor<T,
+                  typename std::enable_if<std::is_integral<T>::value>::type> {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    // For int32/int64, need to check whether the divisor is zero.
+    PADDLE_ENFORCE_NE(b,
+                      0,
+                      pten::errors::InvalidArgument(
+                          "Integer division by zero encountered "
+                          "in (floor) divide. Please check the input value."));
+    return a / b;
+  }
+};
+
+/**
+ * @brief Default binary floor divide functor
+ */
+template <typename T>
+struct FloorDivFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    PADDLE_ENFORCE_NE(b,
+                      0,
+                      pten::errors::InvalidArgument(
+                          "Integer division by zero encountered "
+                          "in (floor) divide. Please check the input value."));
+    return static_cast<T>(std::trunc(a / b));
+  }
+};
+
+}  // namespace kps
+}  // namespace pten
diff --git a/paddle/fluid/operators/kernel_primitives/helper_primitives.h b/paddle/pten/kernels/primitive/helper_primitives.h
similarity index 73%
rename from paddle/fluid/operators/kernel_primitives/helper_primitives.h
rename to paddle/pten/kernels/primitive/helper_primitives.h
index 48ac1509d1f6e8cd3c6ecf06ac0f3445dac39a51..26d431d46abae651e854820b0c7b43afadf148b6 100644
--- a/paddle/fluid/operators/kernel_primitives/helper_primitives.h
+++ b/paddle/pten/kernels/primitive/helper_primitives.h
@@ -1,4 +1,4 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
@@ -14,9 +14,8 @@
 
 #pragma once
 
-namespace paddle {
-namespace operators {
-namespace kernel_primitives {
+namespace pten {
+namespace kps {
 
 #ifdef PADDLE_WITH_XPU2
 struct dim3 {
@@ -43,8 +42,12 @@ struct DimConfig {
   int rem_y;
   int rem_z;
 
-  HOSTDEVICE explicit inline DimConfig(int split_x, int split_y, int split_z,
-                                       int size_x, int size_y, int size_z) {
+  HOSTDEVICE explicit inline DimConfig(int split_x,
+                                       int split_y,
+                                       int split_z,
+                                       int size_x,
+                                       int size_y,
+                                       int size_z) {
     split_num_x = split_x;
    split_num_y = split_y;
    split_num_z = split_z;
@@ -60,6 +63,5 @@ struct DimConfig {
   }
 };
 
-}  // namespace kernel_primitives
-}  // namespace operators
-}  // namespace paddle
+}  // namespace kps
+}  // namespace pten
diff --git a/paddle/pten/kernels/primitive/kernel_primitives.h b/paddle/pten/kernels/primitive/kernel_primitives.h
new file mode 100644
index 0000000000000000000000000000000000000000..6067fa59d57ba6f400500805bff7aea80f17926d
--- /dev/null
+++ b/paddle/pten/kernels/primitive/kernel_primitives.h
@@ -0,0 +1,69 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+ +#pragma once +#include "paddle/pten/kernels/primitive/helper_primitives.h" +#ifdef PADDLE_WITH_XPU2 +#include "paddle/pten/backends/xpu/xpu_context.h" +#include "paddle/pten/kernels/primitive/compute_primitives_xpu2.h" +#include "paddle/pten/kernels/primitive/datamover_primitives_xpu2.h" +#include "paddle/pten/kernels/primitive/functor_primitives_xpu2.h" + +#define KPStream XPUStream +#define KPDevice pten::XPUContext +#define _ptr_ _global_ptr_ +#define __forceinline__ __inline__ +#define __restrict__ + +#define THREAD_ID_X core_id() +#define THREAD_ID_Y 0 +#define THREAD_ID_Z 0 + +#define BLOCK_NUM_X core_num() +#define BLOCK_NUM_Y 0 +#define BLOCK_NUM_Z 0 + +#define BLOCK_ID_X cluster_id() +#define BLOCK_ID_Y 0 +#define BLOCK_ID_Z 0 + +#define GRID_NUM_X cluster_num() +#define GRID_NUM_Y 0 +#define GRID_NUM_Z 0 +#else +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/primitive/compute_primitives.h" +#include "paddle/pten/kernels/primitive/datamover_primitives.h" +#include "paddle/pten/kernels/primitive/functor_primitives.h" + +#define KPStream gpuStream_t +#define KPDevice pten::GPUContext +#define _ptr_ + +#define THREAD_ID_X threadIdx.x +#define THREAD_ID_Y threadIdx.y +#define THREAD_ID_Z threadIdx.z + +#define BLOCK_NUM_X blockDim.x +#define BLOCK_NUM_Y blockDim.y +#define BLOCK_NUM_Z blockDim.z + +#define BLOCK_ID_X blockIdx.x +#define BLOCK_ID_Y blockIdx.y +#define BLOCK_ID_Z blockIdx.z + +#define GRID_NUM_X gridDim.x +#define GRID_NUM_Y gridDim.y +#define GRID_NUM_Z gridDim.z +#endif
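
Usage sketch (illustrative, not part of the patch): after this move, new pten
kernels reach the primitives through pten::kps directly, while existing fluid
code keeps compiling through the paddle::operators::kernel_primitives alias
left behind above. The device function below is a hypothetical example written
against the ElementwiseBinary and AddFunctor signatures shown in this diff,
assuming the GPU (non-XPU2) branch of kernel_primitives.h; the name AddVectors
is not part of either codebase.

#include "paddle/pten/kernels/primitive/kernel_primitives.h"

namespace kps = pten::kps;  // same alias used by elementwise_base.h / reduce.h

template <typename T, int NX, int NY, int BlockSize>
__device__ void AddVectors(T* out, const T* lhs, const T* rhs) {
  // Each thread combines its NX * NY register elements with the default
  // add functor, i.e. out[i] = lhs[i] + rhs[i].
  kps::ElementwiseBinary<T, T, NX, NY, BlockSize, kps::AddFunctor<T>>(
      out, lhs, rhs, kps::AddFunctor<T>());
}

Legacy call sites are unaffected, since for them
paddle::operators::kernel_primitives::AddFunctor<T> now resolves to the same
pten::kps::AddFunctor<T> through the namespace alias.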