提交 10c29185 编写于 作者: T TFbunny

Fix the speed bottleneck in the SrandInit and Shuffle kernels of GPU RandomChoiceWithMask

上级 3eef4a4e
...@@ -155,7 +155,7 @@ __global__ void Sort(const int ceil_power2, T *rank_buff) { ...@@ -155,7 +155,7 @@ __global__ void Sort(const int ceil_power2, T *rank_buff) {
// SrandInit: calls curand_init once for every index i in [0, ceil_power2),
// writing the resulting generator state to globalState[i].
//
// NOTE: this region is a side-by-side diff rendering; each line below holds the
// pre-change text followed by the post-change text of the same source line.
//
// Parameters:
//   ceil_power2 - exclusive upper bound of the indices that get initialized
//   globalState - array of generator states; element i is written for each i
//   seedc       - seed value passed unchanged to every curand_init call
//
// Each thread starts at blockIdx.x * blockDim.x + threadIdx.x and advances by
// blockDim.x * gridDim.x until it reaches ceil_power2.
//
// The only difference between the two versions is the second curand_init
// argument: `i` before the change, `threadIdx.x` after it.
// NOTE(review): after the change, all indices processed by threads that share
// the same threadIdx.x are initialized with identical (seed, second argument,
// offset) values; confirm against the cuRAND documentation that the resulting
// duplicate generator states are acceptable for the consumers of globalState.
__global__ void SrandInit(const int ceil_power2, curandState *globalState, const int seedc) { __global__ void SrandInit(const int ceil_power2, curandState *globalState, const int seedc) {
for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < ceil_power2; i += blockDim.x * gridDim.x) { for (size_t i = blockIdx.x * blockDim.x + threadIdx.x; i < ceil_power2; i += blockDim.x * gridDim.x) {
curand_init(seedc, i, 0, &globalState[i]); curand_init(seedc, threadIdx.x, 0, &globalState[i]);
} }
} }
...@@ -163,7 +163,7 @@ template <typename T> ...@@ -163,7 +163,7 @@ template <typename T>
__global__ void Shuffle(const int ceil_power2, curandState *globalState, T *rank_buff) { __global__ void Shuffle(const int ceil_power2, curandState *globalState, T *rank_buff) {
int limit = ceil_power2 + 1; int limit = ceil_power2 + 1;
int value; int value;
for (size_t i = 2; i <= ceil_power2; i <<= 1) { size_t i = ceil_power2;
for (size_t j = (i >> 1); j > 0; j >>= 1) { for (size_t j = (i >> 1); j > 0; j >>= 1) {
for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) { for (size_t tid = threadIdx.x; tid < ceil_power2; tid += blockDim.x) {
size_t tid_comp = tid ^ j; size_t tid_comp = tid ^ j;
...@@ -178,7 +178,6 @@ __global__ void Shuffle(const int ceil_power2, curandState *globalState, T *rank ...@@ -178,7 +178,6 @@ __global__ void Shuffle(const int ceil_power2, curandState *globalState, T *rank
} }
__syncthreads(); __syncthreads();
} }
}
} }
template <typename T, typename S> template <typename T, typename S>
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册