Unverified commit 1a0cd447, authored by niuliling123, committed by GitHub

Delete the template parameter BLockSize in Kernel Primitive API (#45220)

Parent: 3a7b1810
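
This commit removes the BlockSize template parameter from every Kernel Primitive (kps) API and drops the corresponding argument at each call site. A representative before/after pair, taken from the dropout kernel in the first hunk below:

// before: kps::ReadData<T, NX, NY, BlockSize, IsBoundary>
kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
// after:  kps::ReadData<T, NX, NY, IsBoundary>
kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);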
......@@ -112,17 +112,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
auto dst_functor =
DstMaskFunctor<T, float>(1.0f - dropout_prob, is_upscale_in_train);
for (; fix < main_offset; fix += stride) {
kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
kps::WriteData<T, kCount, 1, 1, false>(dst + fix, &dst_mask[0], deal_size);
kps::WriteData<T, kCount, 1, false>(dst + fix, &dst_mask[0], deal_size);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
&mask_result[0], &dst_mask[kCount], Cast());
kps::WriteData<MaskType, kCount, 1, 1, false>(
kps::WriteData<MaskType, kCount, 1, false>(
mask + fix, &mask_result[0], deal_size);
if (fix > idx * kCount + 1) {
__syncthreads();
......@@ -130,17 +130,17 @@ __global__ void VectorizedRandomGenerator(const size_t n,
}
int remainder = n - fix;
if (remainder > 0) {
kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorTernary<T, float, T, DstMaskFunctor<T, float>>(
&dst_mask[0], &dst_mask[0], &rands[0], dst_functor, kCount);
kps::WriteData<T, kCount, 1, 1, true>(dst + fix, &dst_mask[0], remainder);
kps::WriteData<T, kCount, 1, true>(dst + fix, &dst_mask[0], remainder);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
&mask_result[0], &dst_mask[kCount], Cast());
kps::WriteData<MaskType, kCount, 1, 1, true>(
kps::WriteData<MaskType, kCount, 1, true>(
mask + fix, &mask_result[0], remainder);
__syncthreads();
}
......@@ -233,17 +233,17 @@ __global__ void VectorizedGeneratorMask(const size_t n,
auto mask_functor = MaskFunctor<T, float>(1.0f - dropout_prob);
for (; fix < main_offset; fix += stride) {
kps::ReadData<T, kCount, 1, 1, false>(&dst_mask[0], src + fix, deal_size);
kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
kps::ReadData<T, kCount, 1, false>(&dst_mask[0], src + fix, deal_size);
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
&dst_mask[0], &rands[0], mask_functor, kCount);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
&mask_result[0], &dst_mask[0], Cast());
kps::WriteData<MaskType, kCount, 1, 1, false>(
kps::WriteData<MaskType, kCount, 1, false>(
mask + fix, &mask_result[0], deal_size);
if (fix > idx * kCount + 1) {
__syncthreads();
......@@ -251,16 +251,16 @@ __global__ void VectorizedGeneratorMask(const size_t n,
}
int remainder = n - fix;
if (remainder > 0) {
kps::ReadData<T, kCount, 1, 1, true>(&dst_mask[0], src + fix, remainder);
kps::ElementwiseRandom<SType, float, kCount, 1, Rand>(
kps::ReadData<T, kCount, 1, true>(&dst_mask[0], src + fix, remainder);
kps::ElementwiseRandom<SType, float, kCount, Rand>(
&rands[0], Rand(), &state);
// dst
kps::OperatorBinary<float, T, MaskFunctor<T, float>>(
&dst_mask[0], &rands[0], mask_functor, kCount);
// mask
kps::ElementwiseUnary<T, MaskType, kCount, 1, 1, Cast>(
kps::ElementwiseUnary<T, MaskType, kCount, 1, Cast>(
&mask_result[0], &dst_mask[0], Cast());
kps::WriteData<MaskType, kCount, 1, 1, true>(
kps::WriteData<MaskType, kCount, 1, true>(
mask + fix, &mask_result[0], remainder);
__syncthreads();
}
......
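The kernels above follow a common tiling pattern: full tiles are processed with IsBoundary = false and the tail with IsBoundary = true. A minimal standalone sketch of that pattern, written against the new BlockSize-free signatures; the kernel name VectorizedApply and the Functor parameter are placeholders (not part of this commit), while BLOCK_ID_X, BLOCK_NUM_X and GRID_NUM_X are the thread-indexing macros used elsewhere in this diff:

// assumes the kernel primitive headers from this PR are included and that
// kps aliases the primitive namespace, as in the files above
template <typename T, int VecSize, typename Functor>
__global__ void VectorizedApply(const T* src, T* dst, size_t n, Functor func) {
  const int block_tile = BLOCK_NUM_X * VecSize;  // elements per block per step
  const size_t stride = static_cast<size_t>(GRID_NUM_X) * block_tile;
  size_t fix = static_cast<size_t>(BLOCK_ID_X) * block_tile;
  const size_t main_offset = n / stride * stride;  // end of the full-tile region
  T in[VecSize];
  T out[VecSize];
  for (; fix < main_offset; fix += stride) {
    // full tile: no per-element bounds check (IsBoundary = false)
    kps::ReadData<T, VecSize, 1, false>(&in[0], src + fix, block_tile);
    kps::ElementwiseUnary<T, T, VecSize, 1, Functor>(&out[0], &in[0], func);
    kps::WriteData<T, VecSize, 1, false>(dst + fix, &out[0], block_tile);
  }
  if (fix < n) {
    // tail tile: bounds-checked access (IsBoundary = true)
    int remainder = static_cast<int>(n - fix);
    kps::ReadData<T, VecSize, 1, true>(&in[0], src + fix, remainder);
    kps::ElementwiseUnary<T, T, VecSize, 1, Functor>(&out[0], &in[0], func);
    kps::WriteData<T, VecSize, 1, true>(dst + fix, &out[0], remainder);
  }
}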
......@@ -73,24 +73,23 @@ __global__ void BroadcastKernelBinary(
// load in0
if (use_broadcast[0]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
arg0, in0, fix, configlists[0], numel);
} else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg0, in0 + fix, num);
}
// load in1
if (use_broadcast[1]) {
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD, 1>(
kernel_primitives::ReadDataBc<InT, VecSize, DATA_PER_THREAD>(
arg1, in1, fix, configlists[1], numel);
} else {
kernel_primitives::ReadData<InT, VecSize, 1, 1>(arg1, in1 + fix, num);
kernel_primitives::ReadData<InT, VecSize, 1>(arg1, in1 + fix, num);
}
// compute
kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
kernel_primitives::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
result, arg0, arg1, func);
// store
kernel_primitives::WriteData<OutT, VecSize, 1, 1, true>(
out + fix, result, num);
kernel_primitives::WriteData<OutT, VecSize, 1, true>(out + fix, result, num);
}
// bias add forward impl for "[m, n] + [n] = [m, n]"
......
......@@ -266,10 +266,10 @@ __device__ __forceinline__ void LoadData(
// numel : whole num of output
// num: how many elements will be dealt with this time
if (need_broadcast) {
kps::ReadDataBc<T, VecSize, 1, 1, IsBoundary>(
kps::ReadDataBc<T, VecSize, 1, IsBoundary>(
dst, src, block_offset, config, numel, read_lens);
} else {
kps::ReadData<T, VecSize, 1, 1, IsBoundary>(
kps::ReadData<T, VecSize, 1, IsBoundary>(
dst, src + block_offset, num, read_lens);
}
}
......
......@@ -278,11 +278,10 @@ __global__ void DistributionKernel(size_t size,
MT args[kCount];
T result[kCount];
for (size_t i = idx; i < size; i += total_thread * kCount) {
kps::ElementwiseRandom<SType, MT, kCount, 1, DistOp>(
&args[0], dist, &state);
kps::ElementwiseUnary<MT, T, kCount, 1, 1, TransformOp>(
kps::ElementwiseRandom<SType, MT, kCount, DistOp>(&args[0], dist, &state);
kps::ElementwiseUnary<MT, T, kCount, 1, TransformOp>(
&result[0], &args[0], trans);
kps::WriteData<T, T, kCount, 1, 1, true>(
kps::WriteData<T, T, kCount, 1, true>(
out_data + i, &result[0], size - i, 1, stride, 1);
__syncthreads();
}
......
......@@ -519,13 +519,13 @@ struct Loader {
kps::Init<Type, ArgsT, Index, VecSize>(
args, static_cast<Type>(1.0f), read_lens);
if (is_boundary) {
kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, true>(
kps::ReadData<Type, VecSize, 1, ArgsT, Index, true>(
args,
reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
num,
read_lens);
} else {
kps::ReadData<Type, VecSize, 1, 1, ArgsT, Index, false>(
kps::ReadData<Type, VecSize, 1, ArgsT, Index, false>(
args,
reinterpret_cast<const _ptr_ Type *>(in[Index]) + offset,
num,
......@@ -595,7 +595,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, Arity, true> {
InT (*args)[VecSize],
OutT *result,
int read_lens) {
kps::ElementwiseAny<InT, OutT, VecSize, 1, 1, Arity, Functor>(
kps::ElementwiseAny<InT, OutT, VecSize, 1, Arity, Functor>(
result, args, func);
}
};
......@@ -606,7 +606,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 0, false> {
InT (*args)[VecSize],
OutT *result,
int read_lens) {
kps::ElementwiseConstant<InT, OutT, VecSize, 1, 1, Functor>(result, func);
kps::ElementwiseConstant<InT, OutT, VecSize, 1, Functor>(result, func);
}
};
......@@ -616,7 +616,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 1, false> {
InT (*args)[VecSize],
OutT *result,
int read_lens) {
kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Functor>(
kps::ElementwiseUnary<InT, OutT, VecSize, 1, Functor>(
result, args[0], func);
}
};
......@@ -627,7 +627,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 2, false> {
InT (*args)[VecSize],
OutT *result,
int read_lens) {
kps::ElementwiseBinary<InT, OutT, VecSize, 1, 1, Functor>(
kps::ElementwiseBinary<InT, OutT, VecSize, 1, Functor>(
result, args[0], args[1], func, read_lens);
}
};
......@@ -638,7 +638,7 @@ struct ElementwisePrimitiveCaller<InT, OutT, VecSize, Functor, 3, false> {
InT (*args)[VecSize],
OutT *result,
int read_lens) {
kps::ElementwiseTernary<InT, OutT, VecSize, 1, 1, Functor>(
kps::ElementwiseTernary<InT, OutT, VecSize, 1, Functor>(
result, args[0], args[1], args[2], func);
}
};
......@@ -703,7 +703,7 @@ struct ElementwiseWriteDataCallerBc {
}
#pragma unroll
for (int i = 0; i < NumOuts; ++i) {
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
kps::WriteData<OutT, VecSize, 1, IsBoundary>(
outs[i] + block_offset, dst[i], num, read_lens);
}
}
......@@ -716,7 +716,7 @@ struct ElementwiseWriteDataCallerBc<OutT, VecSize, IsBoundary, 1> {
kps::IndexType block_offset,
int num,
int read_lens) {
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(
kps::WriteData<OutT, VecSize, 1, IsBoundary>(
outs[0] + block_offset, src, num, read_lens);
}
};
......
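All of the Arity-specialized callers above now invoke the compute primitives without a BlockSize argument. A minimal sketch of the binary form, using kps::AddFunctor (which appears later in this diff); the surrounding kernel and the VecSize constant are assumed:

// inside a __global__ / __device__ function, per-thread register tiles
constexpr int VecSize = 4;
float lhs[VecSize], rhs[VecSize], result[VecSize];
// new form: ElementwiseBinary<InT, OutT, NX, NY, Functor>
kps::ElementwiseBinary<float, float, VecSize, 1, kps::AddFunctor<float>>(
    result, lhs, rhs, kps::AddFunctor<float>());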
......@@ -36,18 +36,18 @@ __global__ void VectorizedIndexKernel(T *out,
size_t args[VecSize];
T result[VecSize];
for (; data_offset < main_offset; data_offset += stride) {
kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
&result[0], &args[0], func);
kps::WriteData<T, VecSize, 1, 1, false>(
kps::WriteData<T, VecSize, 1, false>(
out + data_offset, &result[0], BLOCK_NUM_X * VecSize);
}
size_t num = numel - data_offset;
if (num > 0) {
kps::InitWithDataIndex<size_t, VecSize, 1, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, 1, Functor>(
kps::InitWithDataIndex<size_t, VecSize, 1>(&args[0], data_offset);
kps::ElementwiseUnary<size_t, T, VecSize, 1, Functor>(
&result[0], &args[0], func);
kps::WriteData<T, VecSize, 1, 1, true>(out + data_offset, &result[0], num);
kps::WriteData<T, VecSize, 1, true>(out + data_offset, &result[0], num);
}
}
......
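VectorizedIndexKernel above fills a register tile with global element indices via InitWithDataIndex and then maps each index through Functor. A minimal sketch of a conforming functor (IndexToValueFunctor is a hypothetical name, not part of this commit) that would turn the kernel into an arange-style fill:

template <typename T>
struct IndexToValueFunctor {
  T start;
  T step;
  // ElementwiseUnary calls this once per register element, here with the
  // element's global index produced by InitWithDataIndex
  __device__ __forceinline__ T operator()(const size_t& idx) const {
    return start + static_cast<T>(idx) * step;
  }
};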
......@@ -712,7 +712,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
1,
REDUCE_VEC_SIZE,
1,
1,
Calculator,
kps::IdentityFunctor<Tx>,
false>(&input_reg[0],
......@@ -725,12 +724,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
stride,
kps::IdentityFunctor<Tx>(),
reduce_last_dim);
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, 1, TransformOp>(
kps::ElementwiseUnary<Tx, MPType, REDUCE_VEC_SIZE, 1, TransformOp>(
&input_compute[0], &input_reg[0], transformer);
kps::Reduce<MPType,
REDUCE_VEC_SIZE,
1,
1,
ReduceOp,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim);
......@@ -742,7 +740,6 @@ __global__ void ReduceAnyKernel(const Tx* x,
1,
REDUCE_VEC_SIZE,
1,
1,
Calculator,
TransformOp,
true>(&input_compute[0],
......@@ -758,12 +755,11 @@ __global__ void ReduceAnyKernel(const Tx* x,
kps::Reduce<MPType,
REDUCE_VEC_SIZE,
1,
1,
ReduceOp,
kps::details::ReduceMode::kLocalMode>(
&reduce_var, &input_compute[0], reducer, reduce_last_dim);
kps::Reduce<MPType, 1, 1, 1, ReduceOp, kps::details::kGlobalMode>(
kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::kGlobalMode>(
&reduce_var, &reduce_var, reducer, reduce_last_dim);
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(reduce_num);
......@@ -807,27 +803,22 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
MPType reduce_var = init;
MPType reduce_compute = init;
for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
kps::ReadData<Tx, Tx, 1, 1, 1, false>(&reduce_input,
input + loop_idx * left_num + idx,
block.BlockDimX(),
1,
1,
left_num);
kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
kps::ReadData<Tx, Tx, 1, 1, false>(&reduce_input,
input + loop_idx * left_num + idx,
block.BlockDimX(),
1,
1,
left_num);
kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
&reduce_compute, &reduce_input, transformer);
kps::Reduce<MPType,
1,
1,
1,
ReduceOp,
kps::details::ReduceMode::kLocalMode>(
kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false);
}
if (is_mean) {
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, false>(
kps::WriteData<Ty, 1, 1, false>(
y + store_offset + idx, &result, block.BlockDimX());
}
......@@ -835,20 +826,15 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
MPType reduce_var = init;
MPType reduce_compute = init;
for (int loop_idx = 0; loop_idx < loop_size; ++loop_idx) {
kps::ReadData<Tx, Tx, 1, 1, 1, true>(&reduce_input,
input + loop_idx * left_num + idx,
dim.rem_x,
1,
1,
left_num);
kps::ElementwiseUnary<Tx, MPType, 1, 1, 1, TransformOp>(
kps::ReadData<Tx, Tx, 1, 1, true>(&reduce_input,
input + loop_idx * left_num + idx,
dim.rem_x,
1,
1,
left_num);
kps::ElementwiseUnary<Tx, MPType, 1, 1, TransformOp>(
&reduce_compute, &reduce_input, transformer);
kps::Reduce<MPType,
1,
1,
1,
ReduceOp,
kps::details::ReduceMode::kLocalMode>(
kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::ReduceMode::kLocalMode>(
&reduce_var, &reduce_compute, reducer, false);
}
......@@ -856,8 +842,7 @@ __global__ void ReduceHigherDimKernel(const Tx* x,
reduce_var = reduce_var / static_cast<MPType>(mean_div);
}
Ty result = static_cast<Ty>(reduce_var);
kps::WriteData<Ty, 1, 1, 1, true>(
y + store_offset + idx, &result, dim.rem_x);
kps::WriteData<Ty, 1, 1, true>(y + store_offset + idx, &result, dim.rem_x);
}
}
......
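The reduction above is two-staged, and both stages drop the BlockSize parameter: a per-thread register reduce in kLocalMode, then a cross-thread combine in kGlobalMode. A condensed restatement of those two calls with explanatory comments, names as in ReduceAnyKernel above:

// stage 1: fold the REDUCE_VEC_SIZE values held in this thread's registers
kps::Reduce<MPType,
            REDUCE_VEC_SIZE,
            1,
            ReduceOp,
            kps::details::ReduceMode::kLocalMode>(
    &reduce_var, &input_compute[0], reducer, reduce_last_dim);
// stage 2: combine the per-thread partial results across the block
kps::Reduce<MPType, 1, 1, ReduceOp, kps::details::kGlobalMode>(
    &reduce_var, &reduce_var, reducer, reduce_last_dim);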
......@@ -71,21 +71,21 @@ __device__ void GetBlockCountImpl(const InT *in,
int store_fix = BLOCK_ID_X + repeat * GRID_NUM_X;
kps::Init<InT, VecSize>(&in_data[0], static_cast<InT>(0.0f));
kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
kps::ElementwiseUnary<InT, OutT, VecSize, 1, 1, Cast>(
kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
kps::ElementwiseUnary<InT, OutT, VecSize, 1, Cast>(
&temp[0], &in_data[0], Cast());
kps::Reduce<OutT, VecSize, 1, 1, Add, Mode::kLocalMode>(
kps::Reduce<OutT, VecSize, 1, Add, Mode::kLocalMode>(
&result, &temp[0], Add(), true);
kps::Reduce<OutT, 1, 1, 1, Add, Mode::kGlobalMode>(
kps::Reduce<OutT, 1, 1, Add, Mode::kGlobalMode>(
&result, &result, Add(), true);
if (store_fix == 0) {
// first block's fix_size = 0;
OutT tmp = static_cast<OutT>(0.0f);
kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix, &tmp, 1);
kps::WriteData<OutT, 1, 1, true>(out + store_fix, &tmp, 1);
}
// store num of this block
kps::WriteData<OutT, 1, 1, 1, true>(out + store_fix + 1, &result, 1);
kps::WriteData<OutT, 1, 1, true>(out + store_fix + 1, &result, 1);
}
// Count how many data is not zero in current block
......@@ -132,12 +132,12 @@ __device__ void CumsumImpl(
// set pre_cumsum
kps::Init<OutT, VecSize>(&temp[0], *pre_cumsum);
// load data to arg
kps::ReadData<InT, InT, VecSize, 1, 1, IsBoundary>(
kps::ReadData<InT, InT, VecSize, 1, IsBoundary>(
&arg[0], in, num, 1, BLOCK_NUM_X, 1);
// block cumsum
kps::Cumsum<InT, OutT, 1, Functor>(&result[0], &arg[0], func);
kps::Cumsum<InT, OutT, Functor>(&result[0], &arg[0], func);
// result = cumsum_result + pre_cumsum
kps::ElementwiseBinary<OutT, OutT, VecSize, 1, 1, Functor>(
kps::ElementwiseBinary<OutT, OutT, VecSize, 1, Functor>(
&result[0], &result[0], &temp[0], func);
// get the last prefix sum
if ((THREAD_ID_X == BLOCK_NUM_X - 1) && !IsBoundary) {
......@@ -146,7 +146,7 @@ __device__ void CumsumImpl(
__syncthreads();
// update pre_cumsum
*pre_cumsum = max_thread_data;
kps::WriteData<OutT, OutT, VecSize, 1, 1, IsBoundary>(
kps::WriteData<OutT, OutT, VecSize, 1, IsBoundary>(
out, &result[0], num, 1, BLOCK_NUM_X, 1);
}
......@@ -189,7 +189,7 @@ struct SelectCaller {
int64_t in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
// set index
kps::InitWithDataIndex<int64_t, VecSize, 1, 1>(&in_data[0], data_offset);
kps::InitWithDataIndex<int64_t, VecSize, 1>(&in_data[0], data_offset);
// Get store data according to mask_idt
kps::OperatorTernary<MT, int64_t, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
......@@ -215,7 +215,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 1> {
int num) {
InT in_data[VecSize];
OutT store_data[VecSize * phi::DDim::kMaxRank];
kps::ReadData<InT, VecSize, 1, 1, IsBoundary>(&in_data[0], in, num);
kps::ReadData<InT, VecSize, 1, IsBoundary>(&in_data[0], in, num);
// Get store data according to mask_idt
kps::OperatorTernary<MT, InT, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
......@@ -244,7 +244,7 @@ struct SelectCaller<OutT, MT, InT, Functor, VecSize, IsBoundary, 2> {
kps::details::ReadData<InT>(&in_data[0], in + thread_fix, store_num);
kps::OperatorTernary<MT, InT, OutT, Functor>(
store_data, mask_data, &in_data[0], func, VecSize);
kps::WriteData<OutT, VecSize, 1, 1, IsBoundary>(out, &store_data[0], num);
kps::WriteData<OutT, VecSize, 1, IsBoundary>(out, &store_data[0], num);
}
};
......@@ -285,16 +285,16 @@ __device__ void SelectKernelImpl(OutT *out,
kps::Init<IdT, kCVecSize>(&num_thread[0], init_idx);
kps::Init<MT, VecSize>(&mask_data[0], init_mask);
// Load mask
kps::ReadData<MT, VecSize, 1, 1, IsBoundary>(&mask_data[0], mask, num);
kps::ReadData<MT, VecSize, 1, IsBoundary>(&mask_data[0], mask, num);
// Cast from MT to int
kps::ElementwiseUnary<MT, IdT, VecSize, 1, 1, Cast>(
kps::ElementwiseUnary<MT, IdT, VecSize, 1, Cast>(
&mask_idt[0], &mask_data[0], Cast());
// Get the num of thread only num_thread[1] has data
kps::Reduce<IdT, VecSize, 1, 1, Add, Mode::kLocalMode>(
kps::Reduce<IdT, VecSize, 1, Add, Mode::kLocalMode>(
&num_thread[0], &mask_idt[0], Add(), true);
// Get cumsum_thread, the prefix sum of num_thread; cumsum_thread[0] is used
// to compute thread_fix below
kps::Cumsum<IdT, IdT, 1, Add>(&cumsum_thread[0], &num_thread[0], Add());
kps::Cumsum<IdT, IdT, Add>(&cumsum_thread[0], &num_thread[0], Add());
// get thread_fix
int thread_fix =
(static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);
......
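For reference on the Cumsum step above (now a two-type template), a hedged reading of how the write offset is derived; the comments are an interpretation, the calls are as in SelectKernelImpl:

// cumsum_thread[0] presumably holds the inclusive prefix sum of the
// per-thread counts; subtracting this thread's own count then gives its
// exclusive start offset into the compacted output
kps::Cumsum<IdT, IdT, Add>(&cumsum_thread[0], &num_thread[0], Add());
int thread_fix =
    (static_cast<int>(cumsum_thread[0] - num_thread[0]) * store_rank);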
......@@ -311,9 +311,9 @@ __global__ void WarpSoftmaxForward(T* softmax,
const VecT* src_v =
reinterpret_cast<const VecT*>(&src[(first_batch + i) * stride]);
VecT* reg_v = reinterpret_cast<VecT*>(&src_data[i][0][0]);
kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
&reg_v[0], &src_v[0], idx_max_v[i], 0, kWarpSize, 1);
kps::ElementwiseUnary<T, AccT, kVItem, 1, 1, DataTransFunctor<T, AccT>>(
kps::ElementwiseUnary<T, AccT, kVItem, 1, DataTransFunctor<T, AccT>>(
&sub_data[i][0][0], &src_data[i][0][0], DataTransFunctor<T, AccT>());
}
......@@ -321,7 +321,6 @@ __global__ void WarpSoftmaxForward(T* softmax,
kps::Reduce<AccT,
kVItem,
kBatchSize,
1,
ReduceMaxFunctor<AccT>,
kMode::kLocalMode>(
&max[0], &sub_data[0][0][0], ReduceMaxFunctor<AccT>(), true);
......@@ -330,15 +329,14 @@ __global__ void WarpSoftmaxForward(T* softmax,
// compute sum
#pragma unroll
for (int i = 0; i < kBatchSize; ++i) {
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
&sub_data[i][0][0], &sub_data[i][0][0], UnarySubFunctor<AccT>(max[i]));
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpFunctor<AccT>>(
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpFunctor<AccT>>(
&exp_data[i][0][0], &sub_data[i][0][0], ExpFunctor<AccT>());
}
kps::Reduce<AccT,
kVItem,
kBatchSize,
1,
kps::AddFunctor<AccT>,
kMode::kLocalMode>(
&sum[0], &exp_data[0][0][0], kps::AddFunctor<AccT>(), true);
......@@ -351,15 +349,15 @@ __global__ void WarpSoftmaxForward(T* softmax,
reinterpret_cast<VecT*>(&softmax[(first_batch + i) * stride]);
VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
if (LogMode) {
kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnarySubFunctor<AccT>>(
kps::ElementwiseUnary<AccT, T, kVItem, 1, UnarySubFunctor<AccT>>(
&out_tmp[i][0][0],
&sub_data[i][0][0],
UnarySubFunctor<AccT>(std::log(sum[i])));
} else {
kps::ElementwiseUnary<AccT, T, kVItem, 1, 1, UnaryDivFunctor<AccT>>(
kps::ElementwiseUnary<AccT, T, kVItem, 1, UnaryDivFunctor<AccT>>(
&out_tmp[i][0][0], &exp_data[i][0][0], UnaryDivFunctor<AccT>(sum[i]));
}
kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
&softmax_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
}
}
......@@ -417,9 +415,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
int ptr = (first_batch + i) * stride;
const VecT* src_v = reinterpret_cast<const VecT*>(&src[ptr]);
const VecT* grad_v = reinterpret_cast<const VecT*>(&grad[ptr]);
kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
&src_reg[i][0], &src_v[0], idx_max_v[i], 0, kWarpSize, flag);
kps::ReadData<VecT, VecT, kLoopsV, 1, 1, true>(
kps::ReadData<VecT, VecT, kLoopsV, 1, true>(
&grad_reg[i][0], &grad_v[0], idx_max_v[i], 0, kWarpSize, flag);
}
......@@ -430,9 +428,9 @@ __global__ void WarpSoftmaxBackward(T* dst,
const T* grad_ptr = reinterpret_cast<const T*>(&grad_reg[0][0]);
constexpr int kStep = kBatchSize * kLoopsV * kVSize;
constexpr int kVItem = kLoopsV * kVSize;
kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
&src_tmp[0][0][0], &src_ptr[0], DataTransFunctor<T, AccT>());
kps::ElementwiseUnary<T, AccT, kStep, 1, 1, DataTransFunctor<T, AccT>>(
kps::ElementwiseUnary<T, AccT, kStep, 1, DataTransFunctor<T, AccT>>(
&grad_tmp[0][0][0], &grad_ptr[0], DataTransFunctor<T, AccT>());
// compute sum
......@@ -444,17 +442,15 @@ __global__ void WarpSoftmaxBackward(T* dst,
kps::Reduce<AccT,
kVItem,
kBatchSize,
1,
kps::AddFunctor<AccT>,
kps::details::ReduceMode::kLocalMode>(
&sum[0], &grad_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
} else {
kps::ElementwiseBinary<AccT, AccT, kStep, 1, 1, kps::MulFunctor<AccT>>(
kps::ElementwiseBinary<AccT, AccT, kStep, 1, kps::MulFunctor<AccT>>(
&sum_tmp[0][0][0], &gradptr[0], &srcptr[0], kps::MulFunctor<AccT>());
kps::Reduce<AccT,
kVItem,
kBatchSize,
1,
kps::AddFunctor<AccT>,
kps::details::ReduceMode::kLocalMode>(
&sum[0], &sum_tmp[0][0][0], kps::AddFunctor<AccT>(), true);
......@@ -470,17 +466,17 @@ __global__ void WarpSoftmaxBackward(T* dst,
AccT* gradptr = reinterpret_cast<AccT*>(&grad_tmp[i][0][0]);
AccT* srcptr = reinterpret_cast<AccT*>(&src_tmp[i][0][0]);
if (LogMode) {
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, ExpMulFunctor<AccT>>(
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpMulFunctor<AccT>>(
&out[i][0][0], &srcptr[0], ExpMulFunctor<AccT>(sum[i]));
kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::SubFunctor<AccT>>(
kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::SubFunctor<AccT>>(
&out_tmp[i][0][0],
&gradptr[0],
&out[i][0][0],
kps::SubFunctor<AccT>());
} else {
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, 1, UnarySubFunctor<AccT>>(
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
&out[i][0][0], &gradptr[0], UnarySubFunctor<AccT>(sum[i]));
kps::ElementwiseBinary<AccT, T, kVItem, 1, 1, kps::MulFunctor<AccT>>(
kps::ElementwiseBinary<AccT, T, kVItem, 1, kps::MulFunctor<AccT>>(
&out_tmp[i][0][0],
&srcptr[0],
&out[i][0][0],
......@@ -488,7 +484,7 @@ __global__ void WarpSoftmaxBackward(T* dst,
}
VecT* dst_v = reinterpret_cast<VecT*>(&dst[(first_batch + i) * stride]);
VecT* reg_v = reinterpret_cast<VecT*>(&out_tmp[i][0][0]);
kps::WriteData<VecT, VecT, kLoopsV, 1, 1, true>(
kps::WriteData<VecT, VecT, kLoopsV, 1, true>(
&dst_v[0], &reg_v[0], idx_max_v[i], 0, kWarpSize, 1);
}
}
......@@ -636,7 +632,7 @@ __global__ void NormalSoftmaxForward(
}
if (blockDim.y > 1) {
kps::Reduce<AccT, 1, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
kps::Reduce<AccT, 1, 1, kps::MaxFunctor<AccT>, kMode::kGlobalMode>(
&max_value, &max_value, kps::MaxFunctor<AccT>(), false);
}
......@@ -647,7 +643,7 @@ __global__ void NormalSoftmaxForward(
sum += std::exp(value - max_value);
}
if (blockDim.y > 1) {
kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
&sum, &sum, kps::AddFunctor<AccT>(), false);
}
......@@ -695,7 +691,7 @@ __global__ void NormalSoftmaxBackward(T* input_grad,
}
}
if (blockDim.y > 1) {
kps::Reduce<AccT, 1, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
kps::Reduce<AccT, 1, 1, kps::AddFunctor<AccT>, kMode::kGlobalMode>(
&sum, &sum, kps::AddFunctor<AccT>(), false);
}
......
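Per logical row, WarpSoftmaxForward above reduces to the sequence max → subtract → exp → sum → divide (or log-softmax in LogMode). A condensed single-row sketch using the new template lists and the functor names from the diff; kVItem and AccT are as defined in that file:

AccT row[kVItem];   // one row of values, already converted to AccT
AccT max_value;
AccT sum;
// row maximum
kps::Reduce<AccT, kVItem, 1, ReduceMaxFunctor<AccT>, kMode::kLocalMode>(
    &max_value, &row[0], ReduceMaxFunctor<AccT>(), true);
// exp(x - max), computed in place
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnarySubFunctor<AccT>>(
    &row[0], &row[0], UnarySubFunctor<AccT>(max_value));
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, ExpFunctor<AccT>>(
    &row[0], &row[0], ExpFunctor<AccT>());
// row sum, then normalize
kps::Reduce<AccT, kVItem, 1, kps::AddFunctor<AccT>, kMode::kLocalMode>(
    &sum, &row[0], kps::AddFunctor<AccT>(), true);
kps::ElementwiseUnary<AccT, AccT, kVItem, 1, UnaryDivFunctor<AccT>>(
    &row[0], &row[0], UnaryDivFunctor<AccT>(sum));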
......@@ -200,7 +200,6 @@ __device__ inline int GetLastPow2(int n) {
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
......@@ -215,12 +214,7 @@ __device__ inline int GetLastPow2(int n) {
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out,
const InT* in,
OpFunc compute) {
......@@ -239,7 +233,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
......@@ -255,12 +248,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out,
const InT* in1,
const InT* in2,
......@@ -271,12 +259,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
}
}
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(
OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) {
#pragma unroll
......@@ -294,7 +277,6 @@ __device__ __forceinline__ void ElementwiseBinary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
......@@ -312,12 +294,7 @@ __device__ __forceinline__ void ElementwiseBinary(
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(
OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
#pragma unroll
......@@ -335,7 +312,6 @@ __device__ __forceinline__ void ElementwiseTernary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Arity: The size of ins.
* OpFunc: Compute functor which has an operator() as following:
......@@ -351,13 +327,7 @@ __device__ __forceinline__ void ElementwiseTernary(
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
int Arity,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out,
InT (*ins)[NX * NY],
OpFunc compute) {
......@@ -382,7 +352,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
......@@ -398,12 +367,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out,
const InT* in1,
const InT* in2,
......@@ -428,7 +392,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
......@@ -448,7 +411,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
template <typename T,
int NX,
int NY,
int BlockSize,
class ReduceFunctor,
details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out,
......@@ -494,7 +456,6 @@ __device__ __forceinline__ void Reduce(T* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
......@@ -509,12 +470,7 @@ __device__ __forceinline__ void Reduce(T* out,
* out: The register pointer of out, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX * NY; idx++) {
......@@ -532,7 +488,6 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
* hiprandStatePhilox4_32_10_t.
* OutT: the type of out register.
* ReturnsCount: The number of random data generated by OpFunc.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename T>
......@@ -549,11 +504,7 @@ __device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
* compute: Compute function which was declared like OpFunc<T>().
*/
template <typename StateType,
typename OutT,
int ReturnsCount,
int BlockSize,
class OpFunc>
template <typename StateType, typename OutT, int ReturnsCount, class OpFunc>
__device__ __forceinline__ void ElementwiseRandom(OutT* out,
OpFunc compute,
StateType* state) {
......@@ -571,7 +522,6 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
* @template paraments
* InT: the type of input register.
* OutT: the type of out register.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
* OpFunc: Compute functor which has an operator() as following
* template <typename T>
......@@ -589,7 +539,7 @@ __device__ __forceinline__ void ElementwiseRandom(OutT* out,
*/
#define SHARED_SIZE_LIMIT 512
template <typename InT, typename OutT, int BlockSize, class OpFunc>
template <typename InT, typename OutT, class OpFunc>
__device__ __forceinline__ void Cumsum(OutT* out,
const InT* in,
OpFunc compute) {
......@@ -632,7 +582,6 @@ __device__ __forceinline__ void Cumsum(OutT* out,
* @template paraments
* InT: the type of input register.
* OutT: the type of out register.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
*
* @param
......@@ -645,7 +594,7 @@ __device__ __forceinline__ void Cumsum(OutT* out,
#define SHARED_SIZE_LIMIT 1024
// each thread load 2 data from global memory so SHARED_SIZE_LIMIT must
// larger than blockDim.x * 2
template <typename InT, typename OutT, int BlockSize>
template <typename InT, typename OutT>
__device__ __forceinline__ void Sort(OutT* out,
const InT* in,
int num,
......@@ -689,7 +638,6 @@ __device__ __forceinline__ void Sort(OutT* out,
* InT: The type of input register.
* OutT: The type of out register.
* IndexType: The type of index.
* BlockSize: Identifies the current device thread index method. Currently only
* GPU was supported.
*
* @param
......@@ -701,7 +649,7 @@ __device__ __forceinline__ void Sort(OutT* out,
* monotonic_type: if monotonic_type = 1 then sorted in ascending order, else
* sorted in descending order.
*/
template <typename InT, typename OutT, typename IndexType, int BlockSize>
template <typename InT, typename OutT, typename IndexType>
__device__ __forceinline__ void Sort(OutT* out,
IndexType* out_index,
const InT* in,
......
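The header above documents the functor contract each compute primitive expects. A minimal sketch of a conforming unary functor and the new call form; SquareFunctor and kNX are hypothetical names for illustration:

template <typename InT, typename OutT>
struct SquareFunctor {
  __device__ __forceinline__ OutT operator()(const InT& a) const {
    return static_cast<OutT>(a) * static_cast<OutT>(a);
  }
};

// inside a kernel: each thread owns an NX * NY register tile
constexpr int kNX = 4;
float in[kNX];
float out[kNX];
kps::ElementwiseUnary<float, float, kNX, 1, SquareFunctor<float, float>>(
    out, in, SquareFunctor<float, float>());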
......@@ -89,7 +89,6 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT, typename OutT>
......@@ -104,12 +103,7 @@ __device__ void BlockXReduce(T* out, const T* data, OpFunc reducer) {
* in: The register pointer of in, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseUnary(OutT* out,
const InT* in,
OpFunc compute) {
......@@ -128,7 +122,6 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns computed by each thread.
* NY: The number of data rows computed by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following:
* template <typename InT>
......@@ -144,12 +137,7 @@ __device__ __forceinline__ void ElementwiseUnary(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(OutT* out,
const InT* in1,
const InT* in2,
......@@ -160,12 +148,7 @@ __device__ __forceinline__ void ElementwiseBinary(OutT* out,
}
}
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseBinary(
OutT* out, const InT* in1, const InT* in2, OpFunc compute, int read_lens) {
for (int idx = 0; idx < read_lens; ++idx) {
......@@ -182,7 +165,6 @@ __device__ __forceinline__ void ElementwiseBinary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
......@@ -200,12 +182,7 @@ __device__ __forceinline__ void ElementwiseBinary(
* in3: The register pointer of third input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseTernary(
OutT* out, const InT* in1, const InT* in2, const InT* in3, OpFunc compute) {
#pragma unroll
......@@ -223,7 +200,6 @@ __device__ __forceinline__ void ElementwiseTernary(
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Arity: The size of ins
* OpFunc: Compute functor which has an operator() as following:
......@@ -239,13 +215,7 @@ __device__ __forceinline__ void ElementwiseTernary(
* ins: A pointers of array consisting of multiple inputs.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
int Arity,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, int Arity, class OpFunc>
__device__ __forceinline__ void ElementwiseAny(OutT* out,
InT (*ins)[NX * NY],
OpFunc compute) {
......@@ -270,7 +240,6 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT, typename OutT>
......@@ -286,12 +255,7 @@ __device__ __forceinline__ void ElementwiseAny(OutT* out,
* in2: The register pointer of second input, size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT, OutT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void CycleBinary(OutT* out,
const InT* in1,
const InT* in2,
......@@ -316,7 +280,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
* T: The type of data.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* ReduceFunctor: Compute functor which has an operator() as following
* template <typename InT>
......@@ -336,7 +299,6 @@ __device__ __forceinline__ void CycleBinary(OutT* out,
template <typename T,
int NX,
int NY,
int BlockSize,
class ReduceFunctor,
details::ReduceMode Mode>
__device__ __forceinline__ void Reduce(T* out,
......@@ -369,7 +331,6 @@ __device__ __forceinline__ void Reduce(T* out,
* OutT: The data type of out.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* OpFunc: Compute functor which has an operator() as following
* template <typename InT>
......@@ -384,12 +345,7 @@ __device__ __forceinline__ void Reduce(T* out,
* out: The register pointer of out, the size is NX * NY.
* compute: Compute function which was declared like OpFunc<InT>().
*/
template <typename InT,
typename OutT,
int NX,
int NY,
int BlockSize,
class OpFunc>
template <typename InT, typename OutT, int NX, int NY, class OpFunc>
__device__ __forceinline__ void ElementwiseConstant(OutT* out, OpFunc compute) {
#pragma unroll
for (int idx = 0; idx < NX * NY; idx++) {
......
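After this change the XPU template lists above are identical to the GPU ones earlier in the diff, so a kernel body written against kps compiles for both backends unchanged; for example, the call from the sketch above is valid against either header:

kps::ElementwiseUnary<float, float, kNX, 1, SquareFunctor<float, float>>(
    out, in, SquareFunctor<float, float>());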
......@@ -144,7 +144,6 @@ __device__ __forceinline__ void ReadData(T* dst,
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -161,12 +160,7 @@ __device__ __forceinline__ void ReadData(T* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(Ty* dst,
const Tx* __restrict__ src,
int size_nx,
......@@ -275,7 +269,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
......@@ -287,7 +280,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst,
const T* __restrict__ src,
int num) {
......@@ -319,7 +312,7 @@ __device__ __forceinline__ void ReadData(T* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadData(T* dst,
const T* __restrict__ src,
int num,
......@@ -361,7 +354,6 @@ __device__ __forceinline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The type of dst; ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
......@@ -376,7 +368,6 @@ __device__ __forceinline__ void ReadData(T* dst,
template <typename T,
int NX,
int NY,
int BlockSize,
typename ArgsT,
int Index,
bool IsBoundary = false>
......@@ -419,7 +410,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
......@@ -437,7 +427,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst,
const T* __restrict__ src,
......@@ -479,7 +469,6 @@ __device__ __forceinline__ void ReadDataBc(
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
......@@ -507,7 +496,6 @@ template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
int Rank,
typename IndexCal,
typename Functor,
......@@ -572,7 +560,6 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
* T: The type of data.
* NX: The number of data continuously written by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -584,7 +571,7 @@ __device__ __forceinline__ void ReadDataReduce(Ty* dst,
* src: The register pointer, the size is NX * NY.
* size: The current block needs to load size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst,
T* __restrict__ src,
int num) {
......@@ -613,7 +600,7 @@ __device__ __forceinline__ void WriteData(T* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(T* dst,
T* __restrict__ src,
int num,
......@@ -652,7 +639,6 @@ __device__ __forceinline__ void WriteData(T* dst,
* Ty: The type of data that stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -669,12 +655,7 @@ __device__ __forceinline__ void WriteData(T* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void WriteData(Ty* dst,
const Tx* __restrict__ src,
int size_nx,
......@@ -766,7 +747,6 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
......@@ -782,7 +762,7 @@ __device__ __forceinline__ void Init(T* dst, T* init_data, int num) {
* coordinate mapping relationship between output data and input data.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __forceinline__ void ReadDataBc(
T* dst,
const T* __restrict__ src,
......@@ -820,14 +800,13 @@ __device__ __forceinline__ void ReadDataBc(
* T: Data type of register.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For GPU,
* threadIdx.x is used as the thread index. Currently only GPU was supported.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template <typename T, int NX, int NY, int BlockSize>
template <typename T, int NX, int NY>
__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
int thread_offset = block_offset + threadIdx.x * NX;
#pragma unroll
......
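The data movers documented above keep IsBoundary as their final template parameter; passing true makes each thread check the remaining element count before touching global memory. A minimal tail-tile sketch (src, dst, numel and block_offset stand for the kernel's own pointers and offsets):

constexpr int VecSize = 4;
float buf[VecSize];
int num = numel - block_offset;  // elements left for this block
// bounds-checked load and store of the partial tile
kps::ReadData<float, VecSize, 1, true>(&buf[0], src + block_offset, num);
kps::WriteData<float, VecSize, 1, true>(dst + block_offset, &buf[0], num);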
......@@ -337,7 +337,6 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* Ty: The type of data that needs to be stored in registers.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -354,12 +353,7 @@ __device__ __forceinline__ void WriteData(T _global_ptr_* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadData(Ty* dst,
const Tx _global_ptr_* src,
int size_nx,
......@@ -472,7 +466,6 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* T: The type of data.
* NX: Each thread load NX data from global memory continuously.
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
......@@ -484,7 +477,7 @@ __device__ __forceinline__ void Init(ArgsT* dst, T init_data, int read_lens) {
* src: The data pointer of the current block.
* size: The current block needs to load size data continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
const T _global_ptr_* src,
int num) {
......@@ -502,7 +495,7 @@ __device__ __inline__ void ReadData(T* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ __inline__ void ReadData(T* dst,
const T _global_ptr_* src,
int num,
......@@ -531,7 +524,6 @@ __device__ __inline__ void ReadData(T* dst,
* NY: Each thread need to load NY rows, only NY = 1 was supported.
* ArgsT: The type of dst; ArgsT can be std::tuple<T> or std::tuple<Args>
* Index: The index of data stored in dst.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Whether to make an out-of-bounds judgment on access to memory.
* When the number of data processed by this block is less than
......@@ -546,7 +538,6 @@ __device__ __inline__ void ReadData(T* dst,
template <typename T,
int NX,
int NY,
int BlockSize,
typename ArgsT,
int Index,
bool IsBoundary>
......@@ -582,7 +573,6 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* T: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -599,7 +589,7 @@ __device__ __forceinline__ void ReadData(ArgsT* dst,
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
const T _global_ptr_* src,
uint32_t block_offset,
......@@ -634,7 +624,6 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: The type of data.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* Rank: The shape size of out. eg in[1, 35], out[32, 35] then shape size is 2.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
......@@ -662,7 +651,6 @@ template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
int Rank,
typename IndexCal,
typename Functor,
......@@ -733,7 +721,6 @@ __device__ __forceinline__ void ReadDataReduce(
* T: The type of data.
* NX: The number of data continuously written by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -746,7 +733,7 @@ __device__ __forceinline__ void ReadDataReduce(
* size: The current block needs to load size elements continuously.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst,
const T* src,
int num,
......@@ -766,7 +753,7 @@ __device__ void WriteData(T _global_ptr_* dst,
}
}
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary>
template <typename T, int NX, int NY, bool IsBoundary>
__device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
int thread_offset = core_id() * NX;
mfence_local();
......@@ -793,7 +780,6 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* Ty: The type of data stored in the global memory.
* NX: The number of data columns loaded by each thread.
* NY: The number of data rows loaded by each thread.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -810,12 +796,7 @@ __device__ void WriteData(T _global_ptr_* dst, const T* src, int num) {
* stride_nx: Each read one element stride stride_nx elements in the last dim.
* stride_ny: Each read one element stride stride_ny elements in the first dim.
*/
template <typename Tx,
typename Ty,
int NX,
int NY,
int BlockSize,
bool IsBoundary = false>
template <typename Tx, typename Ty, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void WriteData(Ty _global_ptr_* dst,
const Tx* src,
int size_nx,
......@@ -1190,7 +1171,6 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* T: The type of data stored in the global memory.
* NX: The number of data continuously loaded by each thread.
* NY: The number of data rows loaded by each thread, only NY = 1 was supported.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
* IsBoundary: Indicates whether to perform block access storage out-of-bounds
* judgment. When the number of data processed by the block is less than
......@@ -1206,7 +1186,7 @@ __device__ __inline__ void ReadDataBcCanNotCmp(
* read_lens: The number of data continuously loaded by each thread.
* total_num_output: Total number of original output.
*/
template <typename T, int NX, int NY, int BlockSize, bool IsBoundary = false>
template <typename T, int NX, int NY, bool IsBoundary = false>
__device__ __inline__ void ReadDataBc(T* dst,
const T _global_ptr_* src,
uint32_t block_offset,
......@@ -1238,14 +1218,13 @@ __device__ __inline__ void ReadDataBc(T* dst,
* T: Data type of register.
* NX: Number of data to initialize.
* NY: Number of data to initialize, NY only can be 1.
* BlockSize: Identifies the current device thread index method. For xpu,
* core_id() is used as the index.
*
* @param:
* dst: The register pointer of the thread, the size is NX.
* init_data: The register pointer of init data, the size is NX.
*/
template <typename T, int NX, int NY, int BlockSize>
template <typename T, int NX, int NY>
__device__ __forceinline__ void InitWithDataIndex(T* dst, int block_offset) {
int thread_offset = block_offset + core_id() * NX;
#pragma unroll
......