未验证 提交 49e4ee27 编写于 作者: Z zlsh80826 提交者: GitHub

[Paddle-TRT] slice kernel optimization (#24783)

* copy offsets_info into shared memory in parallel across the block's threads test=develop

* test=develop
上级 1a7fbb73
@@ -35,10 +35,8 @@ __global__ void SliceKernel(int num, int dims, const T *input,
   const int idx = blockIdx.x * blockDim.x + threadIdx.x;
   extern __shared__ int shared_data[];
 
-  if (threadIdx.x == 0) {
-    for (int i = 0; i < dims * 3; i++) {
-      shared_data[i] = offsets_info[i];
-    }
-  }
+  for (int i = threadIdx.x; i < dims * 3; i += blockDim.x) {
+    shared_data[i] = offsets_info[i];
+  }
   __syncthreads();
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册