From e4b1ec1b13f3d4ab3904fdbbaf50a1b1cabb7dcb Mon Sep 17 00:00:00 2001 From: ashburnlee <1578034415@qq.com> Date: Mon, 28 Sep 2020 01:22:25 +0000 Subject: [PATCH] Add cuda support for unique op. --- paddle/fluid/operators/unique_op.cu | 127 ++++++++++++++++++++-------- 1 file changed, 92 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/operators/unique_op.cu b/paddle/fluid/operators/unique_op.cu index 522e2f759b7..51ed6e0b515 100644 --- a/paddle/fluid/operators/unique_op.cu +++ b/paddle/fluid/operators/unique_op.cu @@ -18,10 +18,13 @@ limitations under the License. */ #include #include #include -#include "paddle/fluid/operators/unique_op.h" // TransComute +#include +#include "paddle/fluid/framework/tensor_util.h" // TensorToVector() +#include "paddle/fluid/operators/unique_op.h" // TransCompute() namespace paddle { namespace operators { + using Tensor = framework::Tensor; // Binary function 'less than' @@ -89,6 +92,70 @@ struct BinaryNotEqual { } }; +// index_select() function for Tensor +template +void IndexSelect(const framework::ExecutionContext& context, + const Tensor& input, const Tensor& index, Tensor* output, + int dim) { + auto input_dim = input.dims(); + auto input_dim_size = input_dim.size(); + auto output_dim = output->dims(); + + auto slice_size = 1; + for (auto i = dim + 1; i < input_dim_size; i++) { + slice_size *= input_dim[i]; + } + + auto input_width = slice_size * input_dim[dim]; + auto output_width = slice_size * output_dim[dim]; + + auto outer_nums = 1; + for (auto i = 0; i < dim; i++) { + outer_nums *= input_dim[i]; + } + + auto index_size = index.dims()[0]; + + std::vector input_vec; + std::vector index_vec; + TensorToVector(input, context.device_context(), &input_vec); + TensorToVector(index, context.device_context(), &index_vec); + std::vector out_vec(output->numel()); + + for (int i = 0; i < index_size; i++) { + PADDLE_ENFORCE_GE( + index_vec[i], 0, + platform::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " 
+ "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], input_dim[dim], + platform::errors::InvalidArgument( + "Variable value (index) of OP(index_select) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + input_dim[dim], index_vec[i])); + } + + for (auto i = 0; i < outer_nums; i++) { + auto input_start_offset = i * input_width; + auto output_start_offset = i * output_width; + + for (auto j = 0; j < index_size; j++) { + int32_t index_value = index_vec[j]; + for (auto k = 0; k < slice_size; k++) { + out_vec[output_start_offset + j * slice_size + k] = + input_vec[input_start_offset + index_value * slice_size + k]; + } + } + } + output->mutable_data(context.GetPlace()); + framework::TensorFromVector(out_vec, context.device_context(), output); + output->Resize(output_dim); +} + /// The core logic of computing Unique template static void ComputeUniqueFlatten(const framework::ExecutionContext& context, @@ -178,10 +245,11 @@ static void ComputeUniqueFlatten(const framework::ExecutionContext& context, template static void ComputeUniqueDims(const framework::ExecutionContext& context, framework::Tensor* sorted_indices, - InT* sorted_indices_data, framework::Tensor* out, - bool return_index, bool return_inverse, - bool return_counts, equal_T equal, - not_equal_T not_equal, int64_t row) { + int32_t* sorted_indices_data, + framework::Tensor* out, bool return_index, + bool return_inverse, bool return_counts, + equal_T equal, not_equal_T not_equal, + int64_t row) { // 1. 
inverse indices: 'inverse' Tensor* inverse = context.Output("Index"); inverse->Resize(framework::make_ddim({row})); /// in.shape[0] @@ -212,18 +280,15 @@ static void ComputeUniqueDims(const framework::ExecutionContext& context, sorted_indices_data; thrust::device_ptr range_data_ptr_dev(range_data_ptr); range_data_ptr_dev[num_out] = row; + sorted_indices->Resize(framework::make_ddim({num_out})); // 3. counts: 'counts' Tensor* counts = context.Output("Counts"); - counts->Resize(framework::make_ddim({row})); + counts->Resize(framework::make_ddim({num_out})); auto count_data = counts->mutable_data(context.GetPlace()); thrust::fill(thrust::device, count_data, count_data + row, 0); thrust::adjacent_difference(thrust::device, range_data_ptr + 1, range_data_ptr + row + 1, count_data); - - /** - * TODO(ashburnlee) implement index_select() to get 'out' and reshape back - */ } // Calculate unique when 'dim' is not set @@ -244,7 +309,7 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, framework::Tensor* out, bool return_index, bool return_inverse, bool return_counts, int axis) { - // Transpose & reshape + // 1. Transpose & reshape // Transpose tensor: eg. axis=1, [dim0, dim1, dim2] -> [dim1, dim0, dim2] std::vector permute(in.dims().size()); std::iota(permute.begin(), permute.end(), 0); @@ -258,60 +323,53 @@ static void UniqueDimsCUDATensor(const framework::ExecutionContext& context, in_trans.Resize(in_trans_dims); in_trans.mutable_data(context.GetPlace()); auto& dev_ctx = context.cuda_device_context(); - TransCompute(in.dims().size(), // 维度个数 - dev_ctx, // 设备 - in, // 原始tensor - &in_trans, // Reshape 后的tensor 被修改 - permute); // axis 的索引 + TransCompute(in.dims().size(), // num of dims + dev_ctx, // device + in, // original Tensor + &in_trans, // Tensor after reshape + permute); // index of axis // Reshape tensor: eg. 
[dim1, dim0, dim2] -> [dim1, dim0*dim2] framework::DDim in_trans_flat_dims = framework::flatten_to_2d(in_trans_dims, 1); in_trans.Resize(in_trans_flat_dims); - // in_trans 2D - // in_trans(unsorted) as 'in' + // now 'in_trans' is 2D int64_t col = in_trans.dims()[1]; int64_t row = in_trans.dims()[0]; - const InT* in_trans_data = in_trans.data(); + const InT* in_trans_data = in_trans.data(); // read only - // Tensor in_trans_hat; - // framework::TensorCopy(in_trans, context.GetPlace(), &in_trans_hat); - auto in_trans_data = in_trans.mutable_data(context.GetPlace()); Tensor* sorted_indices = context.Output("Indices"); sorted_indices->Resize(framework::make_ddim({row})); auto sorted_indices_data = sorted_indices->mutable_data(context.GetPlace()); + // 2. Calculate 'sorted_indices', 'inverse', 'counts' // Init index and sort thrust::sequence(thrust::device, sorted_indices_data, sorted_indices_data + row); thrust::sort(thrust::device, sorted_indices_data, sorted_indices_data + row, LessThan(col, in_trans_data)); - ComputeUniqueDims(context, sorted_indices, sorted_indices_data, out, return_index, return_inverse, return_counts, BinaryEqual(col, in_trans_data), BinaryNotEqual(col, in_trans_data), row); - /** - * NOTE: If index_select() is implemented and called in ComputeUniqueDims(), - * the code below can be deleted. - */ - - // Reshape 'out' back - std::vector in_trans_unbind = Unbind(in_trans_hat); - math::ConcatFunctor concat_functor; + // 3. 
Select indices and reshape back to get 'out' framework::Tensor out_trans; std::vector out_trans_dims_vec = in_trans_dims_vec; - out_trans_dims_vec[0] = in_trans_unbind.size(); + out_trans_dims_vec[0] = sorted_indices->numel(); out_trans.Resize(framework::make_ddim(out_trans_dims_vec)); out_trans.mutable_data(context.GetPlace()); + + IndexSelect(context, in_trans, *sorted_indices, &out_trans, 0); + std::swap(out_trans_dims_vec[0], out_trans_dims_vec[axis]); out->Resize(framework::make_ddim(out_trans_dims_vec)); out->mutable_data(context.GetPlace()); - - concat_functor(dev_ctx, in_trans_unbind, 0, &out_trans); + std::vector out_trans_unbind = Unbind(out_trans); + math::ConcatFunctor concat_functor; + concat_functor(dev_ctx, out_trans_unbind, 0, &out_trans); TransCompute(out_trans.dims().size(), dev_ctx, out_trans, out, permute); } @@ -353,7 +411,6 @@ class UniqueKernel return_inverse, return_counts); } else { int axis = axis_vec[0]; - // 已指明 DeviceContext 为 CUDADeviceContext, 写法正确 UniqueDimsCUDATensor( context, *x, out, return_index, return_inverse, return_counts, axis); } -- GitLab