diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h
index a65b1987cb547d7d02c454df1c6758e74037a1b6..6ac70eacaf9b5b1c9205f84ab3c7047e4b1bffc9 100644
--- a/paddle/fluid/operators/concat_op.h
+++ b/paddle/fluid/operators/concat_op.h
@@ -33,6 +33,7 @@ class ConcatKernel : public framework::OpKernel<T> {
     auto place = ctx.GetPlace();
     out->mutable_data<T>(place);
 
+    // TODO(zcd): Sometimes direct copies will be faster
     std::vector<framework::Tensor> inputs(ins.size());
     for (size_t j = 0; j < ins.size(); ++j) {
       inputs[j] = *ins[j];
@@ -51,6 +52,7 @@ class ConcatGradKernel : public framework::OpKernel<T> {
     auto outs = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
     int64_t axis = static_cast<int64_t>(ctx.Attr<int>("axis"));
 
+    // TODO(zcd): Sometimes direct copies will be faster
     std::vector<framework::Tensor> outputs(outs.size());
     for (size_t j = 0; j < outs.size(); ++j) {
       outs[j]->mutable_data<T>(ctx.GetPlace());
diff --git a/paddle/fluid/operators/math/concat.cc b/paddle/fluid/operators/math/concat.cc
index 5c5c6489d601d63aa975c8811e3320c4c03922c2..b542143419e05e9baf29e9a2322447f32ddd9829 100644
--- a/paddle/fluid/operators/math/concat.cc
+++ b/paddle/fluid/operators/math/concat.cc
@@ -19,7 +19,8 @@ namespace operators {
 namespace math {
 
 /*
- * All tensors' dimension should be the same.
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
  */
 template <typename T>
 class ConcatFunctor<platform::CPUDeviceContext, T> {
@@ -27,12 +28,9 @@ class ConcatFunctor<platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext& context,
                   const std::vector<framework::Tensor>& input, const int axis,
                   framework::Tensor* output) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = input.size();
-    std::vector<framework::DDim> origin_dim(num);
 
-    // get the matrix size
     int rows = 1;
     auto dim_0 = input[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -40,7 +38,6 @@
     }
     int out_rows = rows, out_cols = 0;
 
-    // get input's cols
     std::vector<int64_t> input_cols(input.size());
     for (int i = 0; i < num; ++i) {
       int t_cols = input[i].numel() / rows;
@@ -64,18 +61,19 @@
   }
 };
 
+/*
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
+ */
 template <typename T>
 class ConcatGradFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context,
                   const framework::Tensor& input, const int axis,
                   std::vector<framework::Tensor>& outputs) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = outputs.size();
-    std::vector<framework::DDim> origin_dim(num);
 
-    // get the matrix size
     int input_rows = 1;
     auto dim_0 = outputs[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -83,7 +81,6 @@
     }
     int input_cols = 0;
 
-    // get outputs' cols
     std::vector<int64_t> output_cols(outputs.size());
     for (int i = 0; i < num; ++i) {
       int t_cols = outputs[i].numel() / input_rows;
diff --git a/paddle/fluid/operators/math/concat.cu b/paddle/fluid/operators/math/concat.cu
index 8af7233426c89df3e2147b56592b856de803a8a6..5f64856a1a5b816afaeaed0e0d63b74ed5d6aa85 100644
--- a/paddle/fluid/operators/math/concat.cu
+++ b/paddle/fluid/operators/math/concat.cu
@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
+#include "paddle/fluid/framework/mixed_vector.h"
 #include "paddle/fluid/operators/math/concat.h"
 #include "paddle/fluid/platform/cuda_helper.h"
 
@@ -19,16 +20,6 @@ namespace paddle {
 namespace operators {
 namespace math {
 
-// TODO(zcd): This can be replaced by tensor,
-// if that, maybe we should add int8 to VarType::Type.
-// Or replaced by tensorArray.
-static constexpr int MaxSize = 8;
-template <typename T>
-struct CUDADeviceArray {
-  T data[MaxSize];
-  int size;
-};
-
 template <typename T>
 __device__ T upper_bound(const T* first, T count, T val) {
   const T* orig = first;
@@ -49,25 +40,24 @@
 }
 
 template <typename T>
-__global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
-                             const CUDADeviceArray<int> input_cols,
+__global__ void KernelConcat(T** inputs, const int* input_cols, int col_size,
                              const int output_rows, const int output_cols,
                              T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound(input_cols.data, input_cols.size, tid_x) - 1;
+  int segment = upper_bound(input_cols, col_size, tid_x) - 1;
 
-  int curr_offset = input_cols.data[segment];
+  int curr_offset = input_cols[segment];
   int curr_segment = segment;
   for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
     T curr_col_offset;
-    while ((curr_col_offset = input_cols.data[curr_segment + 1]) <= tid_x) {
+    while ((curr_col_offset = input_cols[curr_segment + 1]) <= tid_x) {
       curr_offset = curr_col_offset;
       ++curr_segment;
     }
     int local_col = tid_x - curr_offset;
     int segment_width = curr_col_offset - curr_offset;
-    const T* input_ptr = inputs.data[curr_segment];
+    T* input_ptr = inputs[curr_segment];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
     for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
       output[tid_y * output_cols + tid_x] =
@@ -76,41 +66,41 @@ __global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
 }
 
 template <typename T>
-__global__ void KernelConcat(const CUDADeviceArray<const T*> inputs,
-                             const int input_col, const int output_rows,
-                             const int output_cols, T* output) {
+__global__ void KernelConcat(T** inputs, const int input_col,
+                             const int output_rows, const int output_cols,
+                             T* output) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   float inv_input_col = 1.0 / input_col;
   for (; tid_x < output_cols; tid_x += blockDim.x * gridDim.x) {
     int split = tid_x * inv_input_col;
     int in_offset = tid_x - split * input_col;
-    const T* input_ptr = inputs.data[split];
+    T* input_ptr = inputs[split];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
-    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y)
+    for (; tid_y < output_rows; tid_y += blockDim.y * gridDim.y) {
       output[tid_y * output_cols + tid_x] =
           input_ptr[tid_y * input_col + in_offset];
+    }
   }
 }
 
 template <typename T>
 __global__ void KernelConcatGrad(const T* input, const int input_row,
-                                 const int input_col,
-                                 CUDADeviceArray<int> output_cols,
-                                 CUDADeviceArray<T*> outputs) {
+                                 const int input_col, const int* output_cols,
+                                 int col_size, T** outputs) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
-  int segment = upper_bound(output_cols.data, output_cols.size, tid_x) - 1;
-  int curr_offset = output_cols.data[segment];
+  int segment = upper_bound(output_cols, col_size, tid_x) - 1;
+  int curr_offset = output_cols[segment];
   int curr_segment = segment;
   for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
     T curr_col_offset;
-    while ((curr_col_offset = output_cols.data[curr_segment + 1]) <= tid_x) {
+    while ((curr_col_offset = output_cols[curr_segment + 1]) <= tid_x) {
      curr_offset = curr_col_offset;
      ++curr_segment;
    }
    int local_col = tid_x - curr_offset;
    int segment_width = curr_col_offset - curr_offset;
-    T* output_ptr = outputs.data[curr_segment];
+    T* output_ptr = outputs[curr_segment];
    int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
    for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
      output_ptr[tid_y * segment_width + local_col] =
@@ -121,13 +111,13 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
 template <typename T>
 __global__ void KernelConcatGrad(const T* input, const int input_row,
                                  const int input_col, const int output_cols,
-                                 CUDADeviceArray<T*> outputs) {
+                                 T** outputs) {
   int tid_x = blockIdx.x * blockDim.x + threadIdx.x;
   float inv_input_col = 1.0 / input_col;
   for (; tid_x < input_col; tid_x += blockDim.x * gridDim.x) {
     int split = tid_x * inv_input_col;
     int in_offset = tid_x - split * input_col;
-    T* output_ptr = outputs.data[split];
+    T* output_ptr = outputs[split];
     int tid_y = blockIdx.y * blockDim.y + threadIdx.y;
     for (; tid_y < input_row; tid_y += blockDim.y * gridDim.y)
       output_ptr[tid_y * output_cols + in_offset] =
@@ -136,7 +126,8 @@ __global__ void KernelConcatGrad(const T* input, const int input_row,
 }
 
 /*
- * All tensors' dimension should be the same.
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
  */
 template <typename T>
 class ConcatFunctor<platform::CUDADeviceContext, T> {
@@ -144,12 +135,8 @@ class ConcatFunctor<platform::CUDADeviceContext, T> {
   void operator()(const platform::CUDADeviceContext& context,
                   const std::vector<framework::Tensor>& input, const int axis,
                   framework::Tensor* output) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = input.size();
-    PADDLE_ENFORCE_LT(num, MaxSize, "input number should be less than %d",
-                      MaxSize);
-    // get the matrix size
     int rows = 1;
     auto dim_0 = input[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -157,25 +144,27 @@
     }
     int cols = input[0].numel() / rows;
     int out_rows = rows, out_cols = 0;
-    bool sameShape = true;
 
-    CUDADeviceArray<const T*> inputs_data;
-    CUDADeviceArray<int> inputs_cols;
-    inputs_data.size = num;
-    inputs_cols.size = num + 1;
-    inputs_cols.data[0] = 0;
-    // reshape to matrix
-    // check input shape is valid
+    paddle::framework::Vector<int16_t> inputs_data(num * sizeof(T*) / 2);
+    paddle::framework::Vector<int> inputs_cols(num + 1);
+    inputs_cols[0] = 0;
+    T** inputs_ptr = reinterpret_cast<T**>(inputs_data.data());
+
+    bool sameShape = true;
     for (int i = 0; i < num; ++i) {
       int t_cols = input[i].numel() / rows;
       if (sameShape) {
         if (t_cols != cols) sameShape = false;
       }
       out_cols += t_cols;
-      inputs_cols.data[i + 1] = out_cols;
-      inputs_data.data[i] = input[i].data<T>();
+      inputs_cols[i + 1] = out_cols;
+      inputs_ptr[i] = const_cast<T*>(input[i].data<T>());
     }
 
+    T** ins_gpu =
+        reinterpret_cast<T**>(inputs_data.CUDAMutableData(context.GetPlace()));
+    const int* ins_col_gpu = inputs_cols.CUDAData(context.GetPlace());
+
     // computation
     // set the thread block and grid according to CurrentDeviceId
     const int kThreadsPerBlock = 1024;
@@ -198,27 +187,27 @@
 
     if (sameShape) {
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          inputs_data, cols, out_rows, out_cols, output->data<T>());
+          ins_gpu, cols, out_rows, out_cols, output->data<T>());
     } else {
       KernelConcat<<<grid_size, block_size, 0, context.stream()>>>(
-          inputs_data, inputs_cols, out_rows, out_cols, output->data<T>());
+          ins_gpu, ins_col_gpu, static_cast<int>(inputs_cols.size()), out_rows,
+          out_cols, output->data<T>());
     }
   }
 };
 
+/*
+ * All tensors must have the same rank, and the size of every dimension
+ * must match, except along the concatenation axis.
+ */
 template <typename T>
 class ConcatGradFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context,
                   const framework::Tensor& input, const int axis,
                   std::vector<framework::Tensor>& outputs) {
-    // assume the the max size of input is less than 8 and see the performance
-    // save origin dim
+    // TODO(zcd): Add input data validity checking
     int num = outputs.size();
-    PADDLE_ENFORCE_LT(num, MaxSize, "input number should be less than %d",
-                      MaxSize);
-
-    // get the matrix size
     int input_row = 1;
     auto dim_0 = outputs[0].dims();
     for (int i = 0; i < axis; ++i) {
@@ -229,11 +218,10 @@
     int input_col = 0;
     bool sameShape = true;
 
-    CUDADeviceArray<T*> outputs_data;
-    CUDADeviceArray<int> outputs_cols;
-    outputs_data.size = num;
-    outputs_cols.size = num + 1;
-    outputs_cols.data[0] = 0;
+    paddle::framework::Vector<int16_t> outputs_data(num * sizeof(T*) / 2);
+    paddle::framework::Vector<int> outputs_cols(num + 1);
+    outputs_cols[0] = 0;
+    T** outputs_ptr = reinterpret_cast<T**>(outputs_data.data());
 
     for (int i = 0; i < num; ++i) {
       int t_col = outputs[i].numel() / input_row;
@@ -241,12 +229,16 @@
         if (t_col != output_col_0) sameShape = false;
       }
       input_col += t_col;
-      outputs_cols.data[i + 1] = input_col;
-      outputs_data.data[i] = outputs[i].data<T>();
+      outputs_cols[i + 1] = input_col;
+      outputs_ptr[i] = outputs[i].data<T>();
     }
 
+    T** outs_gpu =
+        reinterpret_cast<T**>(outputs_data.CUDAMutableData(context.GetPlace()));
+    const int* outs_col_gpu = outputs_cols.CUDAData(context.GetPlace());
+
     // computation
-    const int kThreadsPerBlock = 256;
+    const int kThreadsPerBlock = 1024;
     int block_cols = std::min(input_col, kThreadsPerBlock);
     int block_rows = std::max(kThreadsPerBlock / block_cols, 1);
     dim3 block_size = dim3(block_cols, block_rows, 1);
@@ -257,10 +249,11 @@
 
     if (sameShape) {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, output_col_0, outputs_data);
+          input.data<T>(), input_row, input_col, output_col_0, outs_gpu);
     } else {
       KernelConcatGrad<<<grid_size, block_size, 0, context.stream()>>>(
-          input.data<T>(), input_row, input_col, outputs_cols, outputs_data);
+          input.data<T>(), input_row, input_col, outs_col_gpu,
+          static_cast<int>(outputs_cols.size()), outs_gpu);
     }
   }
 };
diff --git a/paddle/fluid/operators/math/concat.h b/paddle/fluid/operators/math/concat.h
index bc878318883d197d17823d2e6862251f1b02e6b3..22147d79e4b1eeee76f7445dd963bf5062049a34 100644
--- a/paddle/fluid/operators/math/concat.h
+++ b/paddle/fluid/operators/math/concat.h
@@ -20,7 +20,16 @@ namespace operators {
 namespace math {
 
 /*
+ * \brief Concatenates the input tensors along the given axis.
+ * TODO(zcd): maybe it needs to be more detailed.
+ * Examples:
+ *     Input[0] = [[1,2],[3,4]]
+ *     Input[1] = [[5,6]]
+ *     axis = 0
  *
+ *     Output = [[1,2],
+ *               [3,4],
+ *               [5,6]]
  */
 template <typename DeviceContext, typename T>
 class ConcatFunctor {
@@ -30,6 +39,18 @@ class ConcatFunctor {
                   framework::Tensor* output);
 };
 
+/*
+ * \brief Splits the input tensor along the given axis into the output tensors.
+ * TODO(zcd): maybe it needs to be more detailed.
+ * Examples:
+ *     Input = [[1,2],
+ *              [3,4],
+ *              [5,6]]
+ *     axis = 0
+ *
+ *     Output[0] = [[1,2],[3,4]]
+ *     Output[1] = [[5,6]]
+ */
 template <typename DeviceContext, typename T>
 class ConcatGradFunctor {
  public:
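Reviewer note (not part of the patch): below is a minimal sketch of how the `ConcatFunctor` documented in `concat.h` is driven on the CPU side, mirroring what `ConcatKernel` in `concat_op.h` does. The function name, tensor names, and shapes (`ConcatSketch`, `t_a`, `t_b`, `out`) are illustrative assumptions only, not part of this change.

```cpp
// Illustrative sketch only: exercises math::ConcatFunctor the same way
// ConcatKernel does. Tensor names and shapes are hypothetical.
#include <vector>

#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/concat.h"
#include "paddle/fluid/platform/device_context.h"

namespace f = paddle::framework;
namespace m = paddle::operators::math;

void ConcatSketch() {
  paddle::platform::CPUDeviceContext cpu_ctx;
  auto place = cpu_ctx.GetPlace();

  // Two inputs that agree on every dimension except the concat axis (axis 0).
  f::Tensor t_a, t_b, out;
  t_a.mutable_data<float>(f::make_ddim({2, 2}), place);
  t_b.mutable_data<float>(f::make_ddim({1, 2}), place);
  // ... fill t_a and t_b with data here ...

  // The output's axis-0 size is the sum of the inputs' axis-0 sizes.
  out.mutable_data<float>(f::make_ddim({3, 2}), place);

  std::vector<f::Tensor> inputs = {t_a, t_b};
  m::ConcatFunctor<paddle::platform::CPUDeviceContext, float> concat;
  concat(cpu_ctx, inputs, /*axis=*/0, &out);
}
```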
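A further note on the CUDA changes: the least obvious expression is `num * sizeof(T*) / 2`, which sizes the `framework::Vector` that carries the table of device pointers. Assuming the element type is `int16_t` (which the `/ 2` divisor suggests), each `T*` occupies `sizeof(T*) / sizeof(int16_t)` elements, and the buffer is reinterpreted as `T**` on both the host and the device side. Below is a standalone toy program illustrating just that packing rule, using `std::vector` instead of `framework::Vector` purely for illustration.

```cpp
// Toy illustration of packing an array of pointers into int16_t storage,
// mirroring the cast the patch performs on framework::Vector. Not Paddle code,
// and it relies on the same reinterpret_cast aliasing assumption as the patch.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  float a = 1.0f, b = 2.0f;
  const int num = 2;
  // Same sizing rule as `num * sizeof(T*) / 2` with sizeof(int16_t) == 2:
  // each pointer spans sizeof(float*) / sizeof(int16_t) elements.
  std::vector<int16_t> storage(num * sizeof(float*) / sizeof(int16_t));
  float** ptrs = reinterpret_cast<float**>(storage.data());
  ptrs[0] = &a;
  ptrs[1] = &b;
  std::printf("%f %f\n", *ptrs[0], *ptrs[1]);  // prints 1.000000 2.000000
  return 0;
}
```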