提交 ad1ad738 编写于 作者: Q qiaolongfei

Add GPU support for concat

上级 9c128fe6
...@@ -98,7 +98,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> { ...@@ -98,7 +98,7 @@ class ConcatGradFunctor<platform::CPUDeviceContext, T> {
int col_idx = 0; int col_idx = 0;
for (int j = 0; j < num; ++j) { for (int j = 0; j < num; ++j) {
int col_len = output_cols[j]; int col_len = output_cols[j];
auto* out_tensor = (*outputs)[j]; auto* out_tensor = outputs->at(j);
if (out_tensor != nullptr) { if (out_tensor != nullptr) {
T* dst_ptr = out_tensor->data<T>() + k * col_len; T* dst_ptr = out_tensor->data<T>() + k * col_len;
memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx, memory::Copy(cpu_place, dst_ptr, cpu_place, src_ptr + col_idx,
......
...@@ -102,10 +102,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -102,10 +102,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
int local_col = tid_x - curr_offset; int local_col = tid_x - curr_offset;
int segment_width = curr_col_offset - curr_offset; int segment_width = curr_col_offset - curr_offset;
T* output_ptr = outputs_data[curr_segment]; T* output_ptr = outputs_data[curr_segment];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; if (output_ptr != nullptr) {
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
output_ptr[tid_y * segment_width + local_col] = for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
input_data[tid_y * in_col + tid_x]; output_ptr[tid_y * segment_width + local_col] =
input_data[tid_y * in_col + tid_x];
}
} }
} }
...@@ -118,10 +120,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row, ...@@ -118,10 +120,12 @@ __global__ void KernelConcatGrad(const T* input_data, const int in_row,
int split = tid_x / fixed_out_col; int split = tid_x / fixed_out_col;
int in_offset = tid_x - split * fixed_out_col; int in_offset = tid_x - split * fixed_out_col;
T* output_ptr = outputs_data[split]; T* output_ptr = outputs_data[split];
int tid_y = blockIdx.y * blockDim.y + threadIdx.y; if (output_ptr != nullptr) {
for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
output_ptr[tid_y * fixed_out_col + in_offset] = for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y)
input_data[tid_y * in_col + tid_x]; output_ptr[tid_y * fixed_out_col + in_offset] =
input_data[tid_y * in_col + tid_x];
}
} }
} }
...@@ -203,17 +207,18 @@ template <typename T> ...@@ -203,17 +207,18 @@ template <typename T>
class ConcatGradFunctor<platform::CUDADeviceContext, T> { class ConcatGradFunctor<platform::CUDADeviceContext, T> {
public: public:
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, const int axis, const framework::Tensor& input,
std::vector<framework::Tensor>* outputs) { const std::vector<const framework::Tensor*>& ref_inputs,
const int axis, std::vector<framework::Tensor*>* outputs) {
// TODO(zcd): Add input data validity checking // TODO(zcd): Add input data validity checking
int o_num = outputs->size(); int o_num = outputs->size();
int out_row = 1; int out_row = 1;
auto dim_0 = outputs->at(0).dims(); auto dim_0 = ref_inputs[0]->dims();
for (int i = 0; i < axis; ++i) { for (int i = 0; i < axis; ++i) {
out_row *= dim_0[i]; out_row *= dim_0[i];
} }
int out_col = outputs->at(0).numel() / out_row; int out0_col = ref_inputs[0]->numel() / out_row;
int in_col = 0, in_row = out_row; int in_col = 0, in_row = out_row;
bool sameShape = true; bool sameShape = true;
...@@ -223,13 +228,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -223,13 +228,17 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
outputs_cols[0] = 0; outputs_cols[0] = 0;
for (int i = 0; i < o_num; ++i) { for (int i = 0; i < o_num; ++i) {
int t_col = outputs->at(i).numel() / out_row; int t_col = outputs->at(i)->numel() / out_row;
if (sameShape) { if (sameShape) {
if (t_col != out_col) sameShape = false; if (t_col != out0_col) sameShape = false;
} }
in_col += t_col; in_col += t_col;
outputs_cols[i + 1] = in_col; outputs_cols[i + 1] = in_col;
outputs_ptr[i] = outputs->at(i).data<T>(); if (outputs->at(i) != nullptr) {
outputs_ptr[i] = outputs->at(i)->data<T>();
} else {
outputs_ptr[i] = nullptr;
}
} }
T** dev_out_gpu_data = T** dev_out_gpu_data =
...@@ -255,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> { ...@@ -255,7 +264,7 @@ class ConcatGradFunctor<platform::CUDADeviceContext, T> {
if (sameShape) { if (sameShape) {
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
input.data<T>(), in_row, in_col, out_col, dev_out_gpu_data); input.data<T>(), in_row, in_col, out0_col, dev_out_gpu_data);
} else { } else {
const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace()); const int* dev_outs_col_data = outputs_cols.CUDAData(context.GetPlace());
KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>( KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册