From 34605d26410a89e0e3b90a9236e8f9e4149834ec Mon Sep 17 00:00:00 2001 From: dzhwinter Date: Mon, 26 Feb 2018 20:07:59 +0800 Subject: [PATCH] accelerate the cuda concat op, avoid many times copy (#8585) * "try enhance concat op" * "enhance the concat operator" --- paddle/fluid/operators/concat_op.h | 47 ++++++++++++++++++++++++++---- 1 file changed, 41 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/concat_op.h b/paddle/fluid/operators/concat_op.h index eb0e43ad2d..208a4481c6 100644 --- a/paddle/fluid/operators/concat_op.h +++ b/paddle/fluid/operators/concat_op.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once +#include #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/strided_memcpy.h" @@ -34,12 +35,46 @@ class ConcatKernel : public framework::OpKernel { auto out_stride = framework::stride_numel(out->dims()); size_t output_offset = 0; - for (auto* in : ins) { - auto in_stride = framework::stride_numel(in->dims()); - StridedNumelCopyWithAxis(ctx.device_context(), axis, - out->data() + output_offset, out_stride, - in->data(), in_stride, in_stride[axis]); - output_offset += in_stride[axis]; + + // If axis >= 1, copying to out directly requires many separate + // cuda memcpy calls. Copy the inputs to cpu, do the strided copy there, + // then copy the result to the gpu output. 
+ + if (platform::is_gpu_place(place) && axis >= 1) { + platform::CPUPlace copy_place; + auto& cpu_ctx = *platform::DeviceContextPool::Instance().Get(copy_place); + framework::Tensor cpu_out; + cpu_out.Resize(out->dims()); + cpu_out.mutable_data(copy_place); + auto& dev_ctx = ctx.device_context(); + std::vector> cpu_ins; + for (auto* in : ins) { + std::unique_ptr cpu_in(new framework::Tensor); + framework::TensorCopy(*in, copy_place, dev_ctx, cpu_in.get()); + cpu_ins.emplace_back(std::move(cpu_in)); + } + // TODO(dzhwinter): overlap copy and compute stream + // https://devblogs.nvidia.com/how-overlap-data-transfers-cuda-cc/ + dev_ctx.Wait(); + + for (auto& in : cpu_ins) { + auto& cpu_in = *in.get(); + auto in_stride = framework::stride_numel(cpu_in.dims()); + + StridedNumelCopyWithAxis( + cpu_ctx, axis, cpu_out.data() + output_offset, out_stride, + cpu_in.data(), in_stride, in_stride[axis]); + output_offset += in_stride[axis]; + } + framework::TensorCopy(cpu_out, place, dev_ctx, out); + } else { + for (auto* in : ins) { + auto in_stride = framework::stride_numel(in->dims()); + StridedNumelCopyWithAxis(ctx.device_context(), axis, + out->data() + output_offset, out_stride, + in->data(), in_stride, in_stride[axis]); + output_offset += in_stride[axis]; + } } } }; -- GitLab