未验证 提交 7e463c84 编写于 作者: Y Yiqun Liu 提交者: GitHub

Optimize the concat and split cuda implementation for cases when the number of...

Optimize the concat and split cuda implementation for cases when the number of inputs/outputs is less than 5. (#17979)

test=develop
上级 101f74cb
...@@ -77,6 +77,33 @@ __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1, ...@@ -77,6 +77,33 @@ __global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
output_data); output_data);
} }
template <typename T>
__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
const T* input_addr2, const int fixed_in_col,
const int out_rows, const int out_cols,
T* output_data) {
const T* inputs_data[3];
inputs_data[0] = input_addr0;
inputs_data[1] = input_addr1;
inputs_data[2] = input_addr2;
ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
output_data);
}
template <typename T>
__global__ void ConcatKernel(const T* input_addr0, const T* input_addr1,
const T* input_addr2, const T* input_addr3,
const int fixed_in_col, const int out_rows,
const int out_cols, T* output_data) {
const T* inputs_data[4];
inputs_data[0] = input_addr0;
inputs_data[1] = input_addr1;
inputs_data[2] = input_addr2;
inputs_data[3] = input_addr3;
ConcatKernelDetail<T>(inputs_data, fixed_in_col, out_rows, out_cols,
output_data);
}
template <typename T> template <typename T>
__global__ void ConcatKernel(const T** inputs_data, const int in_num, __global__ void ConcatKernel(const T** inputs_data, const int in_num,
const int fixed_in_col, const int out_rows, const int fixed_in_col, const int out_rows,
...@@ -147,6 +174,31 @@ __global__ void SplitKernel(const T* input_data, const int in_row, ...@@ -147,6 +174,31 @@ __global__ void SplitKernel(const T* input_data, const int in_row,
SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data); SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
} }
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
const int in_col, const int fixed_out_col,
T* outputs_addr0, T* outputs_addr1,
T* outputs_addr2) {
T* outputs_data[3];
outputs_data[0] = outputs_addr0;
outputs_data[1] = outputs_addr1;
outputs_data[2] = outputs_addr2;
SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
}
template <typename T>
__global__ void SplitKernel(const T* input_data, const int in_row,
const int in_col, const int fixed_out_col,
T* outputs_addr0, T* outputs_addr1,
T* outputs_addr2, T* outputs_addr3) {
T* outputs_data[4];
outputs_data[0] = outputs_addr0;
outputs_data[1] = outputs_addr1;
outputs_data[2] = outputs_addr2;
outputs_data[3] = outputs_addr3;
SplitKernelDetail<T>(input_data, in_row, in_col, fixed_out_col, outputs_data);
}
static inline void GetBlockDims(const platform::CUDADeviceContext& context, static inline void GetBlockDims(const platform::CUDADeviceContext& context,
int num_rows, int num_cols, dim3* block_dims, int num_rows, int num_cols, dim3* block_dims,
dim3* grid_dims) { dim3* grid_dims) {
...@@ -210,7 +262,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -210,7 +262,7 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
memory::allocation::AllocationPtr tmp_dev_ins_data; memory::allocation::AllocationPtr tmp_dev_ins_data;
const T** dev_ins_data = nullptr; const T** dev_ins_data = nullptr;
if (!has_same_shape || (in_num != 2)) { if (!has_same_shape || in_num < 2 || in_num > 4) {
tmp_dev_ins_data = tmp_dev_ins_data =
platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
inputs_data.size() * sizeof(T*)); inputs_data.size() * sizeof(T*));
...@@ -226,6 +278,14 @@ class ConcatFunctor<platform::CUDADeviceContext, T> { ...@@ -226,6 +278,14 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>( ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0], inputs_data[1], in_col, out_row, out_col, inputs_data[0], inputs_data[1], in_col, out_row, out_col,
output->data<T>()); output->data<T>());
} else if (in_num == 3) {
ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0], inputs_data[1], inputs_data[2], in_col, out_row,
out_col, output->data<T>());
} else if (in_num == 4) {
ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
inputs_data[0], inputs_data[1], inputs_data[2], inputs_data[3],
in_col, out_row, out_col, output->data<T>());
} else { } else {
ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>( ConcatKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>()); dev_ins_data, in_num, in_col, out_row, out_col, output->data<T>());
...@@ -294,7 +354,7 @@ class SplitFunctor<platform::CUDADeviceContext, T> { ...@@ -294,7 +354,7 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
memory::allocation::AllocationPtr tmp_dev_outs_data; memory::allocation::AllocationPtr tmp_dev_outs_data;
T** dev_out_gpu_data = nullptr; T** dev_out_gpu_data = nullptr;
if (!has_same_shape || (o_num != 2)) { if (!has_same_shape || o_num < 2 || o_num > 4) {
tmp_dev_outs_data = tmp_dev_outs_data =
platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate( platform::DeviceTemporaryAllocator::Instance().Get(context).Allocate(
outputs_data.size() * sizeof(T*)); outputs_data.size() * sizeof(T*));
...@@ -310,6 +370,14 @@ class SplitFunctor<platform::CUDADeviceContext, T> { ...@@ -310,6 +370,14 @@ class SplitFunctor<platform::CUDADeviceContext, T> {
SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>( SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out0_col, outputs_data[0], input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
outputs_data[1]); outputs_data[1]);
} else if (o_num == 3) {
SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
outputs_data[1], outputs_data[2]);
} else if (o_num == 4) {
SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out0_col, outputs_data[0],
outputs_data[1], outputs_data[2], outputs_data[3]);
} else { } else {
SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>( SplitKernel<<<grid_dims, block_dims, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data); input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册