From 68b5e5bf85ade89bffeec09a4e959dd11da8af67 Mon Sep 17 00:00:00 2001
From: wanghaoshuang
Date: Thu, 21 Sep 2017 10:12:07 +0800
Subject: [PATCH] Use StridedMemcpy instead of CUDA kernel

---
 paddle/operators/crop_op.cc |  50 +--------------
 paddle/operators/crop_op.cu | 121 +-----------------------------------
 paddle/operators/crop_op.h  |  53 +++++++++++++---
 3 files changed, 45 insertions(+), 179 deletions(-)

diff --git a/paddle/operators/crop_op.cc b/paddle/operators/crop_op.cc
index ee4bc9cda..d38c7ba35 100644
--- a/paddle/operators/crop_op.cc
+++ b/paddle/operators/crop_op.cc
@@ -128,59 +128,11 @@ class CropOpGrad : public framework::OperatorWithKernel {
   }
 };
 
-int64_t transIndex(std::vector<int64_t> out_shape, std::vector<int64_t> x_shape,
-                   std::vector<std::pair<int, int>> crop_rules, size_t index) {
-  int64_t dim_size = out_shape.size();
-  std::vector<int64_t> pos(dim_size);
-
-  for (int64_t i = out_shape.size() - 1; i >= 0; --i) {
-    pos[i] = (index % out_shape[i]) + crop_rules[i].first;
-    index = index / out_shape[i];
-  }
-
-  size_t result = pos[0];
-  for (size_t i = 1; i < x_shape.size(); ++i) {
-    result = result * x_shape[i] + pos[i];
-  }
-  return result;
-}
-
-template <typename T>
-class CropCPUKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext &context) const override {
-    auto *x = context.Input<LoDTensor>("X");
-    auto *out = context.Output<LoDTensor>("Out");
-    auto x_data = x->data<T>();
-    T *out_data = out->mutable_data<T>(context.GetPlace());
-    auto x_dims = x->dims();
-    auto out_dims = out->dims();
-    int64_t out_count = out->numel();
-    std::vector<int64_t> x_shape = framework::vectorize(x_dims);
-    std::vector<int64_t> out_shape = framework::vectorize(out_dims);
-
-    auto offsets = context.Attr<std::vector<int>>("offsets");
-    PADDLE_ENFORCE_EQ(
-        x_dims.size(), offsets.size(),
-        "Offsets size should be equal to dimension size of input tensor.");
-
-    std::vector<std::pair<int, int>> crop_rules(x_dims.size());
-    for (size_t i = 0; i < crop_rules.size(); ++i) {
-      crop_rules[i].first = offsets[i];
-      crop_rules[i].second = x_dims[i] - out_dims[i] - offsets[i];
-    }
-
-    for (int64_t i = 0; i < out_count; ++i) {
-      out_data[i] = x_data[transIndex(out_shape, x_shape, crop_rules, i)];
-    }
-  }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
 namespace ops = paddle::operators;
 REGISTER_OP(crop, ops::CropOp, ops::CropOpMaker, crop_grad, ops::CropOpGrad);
-REGISTER_OP_CPU_KERNEL(crop, ops::CropCPUKernel<float>);
+REGISTER_OP_CPU_KERNEL(crop, ops::CropKernel<float>);
 REGISTER_OP_CPU_KERNEL(crop_grad,
                        ops::CropGradKernel<paddle::platform::CPUPlace, float>);
diff --git a/paddle/operators/crop_op.cu b/paddle/operators/crop_op.cu
index 05782145b..f8ee18a1d 100644
--- a/paddle/operators/crop_op.cu
+++ b/paddle/operators/crop_op.cu
@@ -13,128 +13,9 @@ limitations under the License.
 */
 
 #define EIGEN_USE_GPU
-#include <stdio.h>
 #include "paddle/operators/crop_op.h"
 
-namespace paddle {
-namespace operators {
-
-using framework::LoDTensor;
-using framework::Tensor;
-
-template <typename T, int D>
-__global__ void CropKernel(const int N, const int64_t* out_shape,
-                           const int64_t* x_shape, const int* crop_rules,
-                           const T* x_data, T* out_data) {
-  int64_t pos[D];
-  int tmp;
-  int64_t x_index;
-  for (int out_index = blockIdx.x * blockDim.x + threadIdx.x; out_index < N;
-       out_index += blockDim.x * gridDim.x) {
-    tmp = out_index;
-    for (int64_t i = D - 1; i >= 0; --i) {
-      pos[i] = (tmp % out_shape[i]) + crop_rules[i * 2];
-      tmp = tmp / out_shape[i];
-    }
-
-    x_index = pos[0];
-    for (size_t i = 1; i < D; ++i) {
-      x_index = x_index * x_shape[i] + pos[i];
-    }
-    out_data[out_index] = x_data[x_index];
-  }
-}
-
-template <typename T, int D>
-void CropCUDAFunctoin(const framework::ExecutionContext& context) {
-  PADDLE_ENFORCE(platform::is_gpu_place(context.GetPlace()),
-                 "It must use GPUPlace.");
-  auto* x = context.Input<LoDTensor>("X");
-  auto* out = context.Output<LoDTensor>("Out");
-  auto x_data = x->data<T>();
-  T* out_data = out->mutable_data<T>(paddle::platform::GPUPlace());
-  auto x_dims = x->dims();
-  auto out_dims = out->dims();
-  int64_t out_count = out->numel();
-  Tensor x_shape;
-  Tensor out_shape;
-  int64_t* x_shape_data =
-      x_shape.mutable_data<int64_t>({D}, paddle::platform::CPUPlace());
-  int64_t* out_shape_data =
-      out_shape.mutable_data<int64_t>({D}, paddle::platform::CPUPlace());
-  for (int i = 0; i < D; ++i) {
-    x_shape_data[i] = x_dims[i];
-    out_shape_data[i] = out_dims[i];
-  }
-  Tensor x_shape_gpu;
-  Tensor out_shape_gpu;
-  x_shape_gpu.CopyFrom<int64_t>(x_shape, paddle::platform::GPUPlace());
-  out_shape_gpu.CopyFrom<int64_t>(out_shape, paddle::platform::GPUPlace());
-  auto offsets = context.op().Attr<std::vector<int>>("offsets");
-  PADDLE_ENFORCE_EQ(
-      D, offsets.size(),
-      "Offsets size should be equal to dimension size of input tensor.");
-
-  Tensor crop_rules;
-  int* crop_rules_data =
-      crop_rules.mutable_data<int>({D * 2}, paddle::platform::CPUPlace());
-  for (size_t i = 0; i < D; ++i) {
-    crop_rules_data[i * 2] = offsets[i];
-    crop_rules_data[i * 2 + 1] = x_dims[i] - out_dims[i] - offsets[i];
-  }
-
-  Tensor crop_rules_gpu;
-  crop_rules_gpu.CopyFrom<int>(crop_rules, paddle::platform::GPUPlace());
-
-  int n = out_dims[0];
-  int d = out_dims[1];
-  int block = 512;
-  int grid = (n * d + block - 1) / block;
-
-  CropKernel<
-      T,
-      D><<<grid, block, 0,
-           reinterpret_cast<const platform::CUDADeviceContext&>(
-               context.device_context())
-               .stream()>>>(
-      out_count, out_shape_gpu.data<int64_t>(), x_shape_gpu.data<int64_t>(),
-      crop_rules_gpu.data<int>(), x_data, out_data);
-}
-
-template <typename T>
-class CropOpCUDAKernel : public framework::OpKernel {
- public:
-  void Compute(const framework::ExecutionContext& context) const override {
-    size_t rank = context.Input<LoDTensor>("X")->dims().size();
-    switch (rank) {
-      case 1:
-        CropCUDAFunctoin<T, 1>(context);
-        break;
-      case 2:
-        CropCUDAFunctoin<T, 2>(context);
-        break;
-      case 3:
-        CropCUDAFunctoin<T, 3>(context);
-        break;
-      case 4:
-        CropCUDAFunctoin<T, 4>(context);
-        break;
-      case 5:
-        CropCUDAFunctoin<T, 5>(context);
-        break;
-      case 6:
-        CropCUDAFunctoin<T, 6>(context);
-        break;
-      default:
-        PADDLE_THROW(
-            "CropOp only support tensors with no more than 6 dimensions.");
-    }
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(crop, ops::CropOpCUDAKernel<float>);
+REGISTER_OP_GPU_KERNEL(crop, ops::CropKernel<float>);
 REGISTER_OP_GPU_KERNEL(crop_grad,
                        ops::CropGradKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/crop_op.h b/paddle/operators/crop_op.h
index 09d42f4b7..d4c523cf3 100644
--- a/paddle/operators/crop_op.h
+++ b/paddle/operators/crop_op.h
@@ -16,6 +16,7 @@
 
 #include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
+#include "paddle/operators/strided_memcpy.h"
 
 namespace paddle {
 namespace operators {  // Internal
@@ -24,26 +25,58 @@ template <typename T, size_t D, int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using framework::LoDTensor;
+using framework::Tensor;
+using framework::DDim;
+
+// TODO(wanghaoshuang): move this function to another place
+DDim stride(const DDim& ddim) {
+  std::vector<int64_t> strides(ddim.size());
+  strides[ddim.size() - 1] = 1;
+  for (int i = ddim.size() - 2; i >= 0; --i) {
+    strides[i] = strides[i + 1] * ddim[i + 1];
+  }
+  return framework::make_ddim(strides);
+}
+
+template <typename T>
+class CropKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* x = context.Input<Tensor>("X");
+    auto* out = context.Output<Tensor>("Out");
+    const T* x_data = x->data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto x_stride = stride(x->dims());
+    auto out_stride = stride(out->dims());
+    auto offsets = context.Attr<std::vector<int>>("offsets");
+    PADDLE_ENFORCE_EQ(
+        x->dims().size(), offsets.size(),
+        "Offsets size should be equal to dimension size of input tensor.");
+    int64_t offset = 0;
+    for (int i = 0; i < offsets.size(); ++i) {
+      offset += (x_stride[i] * offsets[i]);
+    }
+    StridedMemcpy<T>(context.device_context(), x_data + offset, x_stride,
+                     out->dims(), out_stride, out_data);
+  }
+};
 
 template <typename Place, typename T, size_t D>
 void CropGradFunction(const framework::ExecutionContext& context) {
-  auto* d_out = context.Input<LoDTensor>(framework::GradVarName("Out"));
-  auto* d_x = context.Output<LoDTensor>(framework::GradVarName("X"));
+  auto* d_x = context.Output<Tensor>(framework::GradVarName("X"));
   if (d_x != nullptr) {
+    auto* d_out = context.Input<Tensor>(framework::GradVarName("Out"));
     d_x->mutable_data<T>(context.GetPlace());
     auto d_x_dims = d_x->dims();
     auto d_out_dims = d_out->dims();
-    auto offsets = context.op().Attr<std::vector<int>>("offsets");
+    auto offsets = context.Attr<std::vector<int>>("offsets");
     Eigen::array<std::pair<int, int>, D> paddings;
-    for (int i = 0; i < d_out_dims.size(); ++i) {
+    for (int i = 0; i < D; ++i) {
       paddings[i].first = offsets[i];
      paddings[i].second = d_x_dims[i] - d_out_dims[i] - offsets[i];
     }
     auto d_x_tensor = EigenTensor<T, D>::From(*d_x);
     auto d_out_tensor = EigenTensor<T, D>::From(*d_out);
-    auto place = context.GetEigenDevice<Place>();
-    d_x_tensor.device(place) = d_out_tensor.pad(paddings, 0);
+    d_x_tensor.device(context.GetEigenDevice<Place>()) =
+        d_out_tensor.pad(paddings, 0);
   }
 }
 
@@ -52,7 +85,7 @@ class CropGradKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
     size_t rank =
-        context.Input<LoDTensor>(framework::GradVarName("Out"))->dims().size();
+        context.Input<Tensor>(framework::GradVarName("Out"))->dims().size();
     switch (rank) {
       case 1:
        CropGradFunction<Place, T, 1>(context);
-- 
GitLab
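
A minimal standalone sketch of the idea behind this change, not part of the patch itself: the new CropKernel converts the per-dimension crop offsets into a single flat element offset using row-major strides and then delegates the copy to StridedMemcpy. The helper names below (RowMajorStride, StridedCopy) are illustrative stand-ins with no Paddle dependencies, and the element-wise recursive copy only models what a real strided memcpy routine does.

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Row-major strides: stride[i] is the product of dims[i+1..n-1]; the innermost
// stride is 1. This mirrors what the stride() helper added to crop_op.h computes.
std::vector<int64_t> RowMajorStride(const std::vector<int64_t>& dims) {
  assert(!dims.empty());
  std::vector<int64_t> strides(dims.size());
  strides[dims.size() - 1] = 1;
  for (int i = static_cast<int>(dims.size()) - 2; i >= 0; --i) {
    strides[i] = strides[i + 1] * dims[i + 1];
  }
  return strides;
}

// Naive strided copy: walk the destination shape dimension by dimension,
// reading the source through its (larger) strides. The innermost dimension is
// contiguous in both tensors, which is what lets a real strided memcpy copy
// whole rows at a time instead of single elements.
void StridedCopy(const float* src, const std::vector<int64_t>& src_stride,
                 float* dst, const std::vector<int64_t>& dst_stride,
                 const std::vector<int64_t>& dst_dims, size_t axis = 0) {
  if (axis == dst_dims.size() - 1) {
    for (int64_t i = 0; i < dst_dims[axis]; ++i) dst[i] = src[i];
    return;
  }
  for (int64_t i = 0; i < dst_dims[axis]; ++i) {
    StridedCopy(src + i * src_stride[axis], src_stride,
                dst + i * dst_stride[axis], dst_stride, dst_dims, axis + 1);
  }
}

int main() {
  // Crop a 2x2 window out of a 3x4 row-major matrix, starting at offsets {1, 1}.
  std::vector<int64_t> x_dims = {3, 4}, out_dims = {2, 2}, offsets = {1, 1};
  std::vector<float> x(3 * 4);
  for (size_t i = 0; i < x.size(); ++i) x[i] = static_cast<float>(i);
  std::vector<float> out(2 * 2);

  std::vector<int64_t> x_stride = RowMajorStride(x_dims);      // {4, 1}
  std::vector<int64_t> out_stride = RowMajorStride(out_dims);  // {2, 1}

  // Flat offset of the first cropped element, as computed in CropKernel:
  // offset = sum_i x_stride[i] * offsets[i] = 4 * 1 + 1 * 1 = 5.
  int64_t offset = 0;
  for (size_t i = 0; i < offsets.size(); ++i) offset += x_stride[i] * offsets[i];

  StridedCopy(x.data() + offset, x_stride, out.data(), out_stride, out_dims);

  // The crop window is {{5, 6}, {9, 10}}.
  assert(out[0] == 5 && out[1] == 6 && out[2] == 9 && out[3] == 10);
  for (float v : out) std::cout << v << ' ';
  std::cout << '\n';
  return 0;
}

Compiled as C++11, this prints "5 6 9 10", i.e. the 2x2 window taken at offsets {1, 1} from the 3x4 row-major input, which is the same region the patched CropKernel would extract.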