From 286eca2d9ebc6096bb3d756d0cc9f902ecab7b79 Mon Sep 17 00:00:00 2001 From: wawltor Date: Tue, 25 Aug 2020 22:31:16 +0800 Subject: [PATCH] update the code for the topk v2 add the top v2 for the paddlepaddle api 2.0 --- paddle/fluid/operators/top_k_function_cuda.h | 515 ++++++++++++++++++ paddle/fluid/operators/top_k_op.cu | 462 +--------------- paddle/fluid/operators/top_k_v2_op.cc | 176 ++++++ paddle/fluid/operators/top_k_v2_op.cu | 272 +++++++++ paddle/fluid/operators/top_k_v2_op.h | 321 +++++++++++ .../fluid/tests/unittests/test_top_k_v2_op.py | 244 +++++++++ python/paddle/tensor/search.py | 98 +++- 7 files changed, 1628 insertions(+), 460 deletions(-) create mode 100644 paddle/fluid/operators/top_k_function_cuda.h create mode 100644 paddle/fluid/operators/top_k_v2_op.cc create mode 100644 paddle/fluid/operators/top_k_v2_op.cu create mode 100644 paddle/fluid/operators/top_k_v2_op.h create mode 100644 python/paddle/fluid/tests/unittests/test_top_k_v2_op.py diff --git a/paddle/fluid/operators/top_k_function_cuda.h b/paddle/fluid/operators/top_k_function_cuda.h new file mode 100644 index 00000000000..57891699fd2 --- /dev/null +++ b/paddle/fluid/operators/top_k_function_cuda.h @@ -0,0 +1,515 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/platform/cuda_device_function.h" +#include "paddle/fluid/platform/float16.h" + +// set cub base traits in order to handle float16 +namespace cub { +template <> +struct NumericTraits + : BaseTraits {}; +} // namespace cub + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +struct SegmentOffsetIter { + EIGEN_DEVICE_FUNC + explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { + return idx * num_cols_; + } + + int num_cols_; +}; + +// Iter using into a column +struct ColumnIndexIter { + explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {} + + EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()( + const Eigen::array& ix) const { + return ix[0] % num_cols_; + } + + int num_cols_; +}; + +inline static int GetDesiredBlockDim(int dim) { + if (dim > 128) { + return 256; + } else if (dim > 64) { + return 128; + } else if (dim > 32) { + return 64; + } else { + return 32; + } +} + +template +__global__ void InitIndex(T* indices, T num_rows, T num_cols) { + int col_id = threadIdx.x; + int row_id = blockIdx.x; + + for (int64_t j = row_id; j < num_rows; j += gridDim.x) { + for (int64_t i = col_id; i < num_cols; i += blockDim.x) { + indices[j * num_cols + i] = i; + } + } +} + +template +struct Pair { + __device__ __forceinline__ Pair() {} + __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} + + __device__ __forceinline__ void set(T value, int64_t id) { + v = value; + id = id; + } + + __device__ __forceinline__ void operator=(const Pair& in) { + v = in.v; + id = in.id; + } + + __device__ __forceinline__ bool operator<(const T value) const { + return (v < value); + } + + __device__ __forceinline__ bool operator>(const T value) const { + return (v > value); + } + __device__ __forceinline__ bool operator<(const Pair& in) const { + return (v < in.v) || ((v == in.v) && (id > in.id)); + } + + __device__ __forceinline__ bool operator>(const Pair& in) const { + return (v > in.v) || ((v == in.v) && (id < in.id)); + } + + T v; + int64_t id; +}; + +template +__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, + int beam_size, const bool& largest) { + for (int k = beam_size - 2; k >= 0; k--) { + if (largest) { + if (topk[k] < p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } else { + if (topk[k] > p) { + topk[k + 1] = topk[k]; + } else { + topk[k + 1] = p; + return; + } + } + } + topk[0] = p; +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, int beam_size, + const bool& largest) { + while (idx < dim) { + if (largest) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size, largest); + } + } else { + if (topk[beam_size - 1] > src[idx]) { + Pair tmp(src[idx], idx); + AddTo(topk, tmp, beam_size, largest); + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, + int dim, const Pair& max, + int beam_size, const bool& largest) { + while (idx < dim) { + if (largest) { + if (topk[beam_size - 1] < src[idx]) { + Pair tmp(src[idx], idx); + if (tmp < max) { + AddTo(topk, tmp, beam_size, largest); + } + } + } else { + if (topk[beam_size - 1] > src[idx]) { + Pair tmp(src[idx], idx); + if (tmp > max) { + AddTo(topk, tmp, beam_size, 
largest); + } + } + } + idx += BlockSize; + } +} + +template +__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, + int beam_size, const T* src, + bool* firstStep, bool* is_empty, + Pair* max, int dim, + const int tid, bool largest) { + if (*beam > 0) { + int length = (*beam) < beam_size ? *beam : beam_size; + if (*firstStep) { + *firstStep = false; + GetTopK(topk, src, tid, dim, length, largest); + } else { + for (int k = 0; k < MaxLength; k++) { + if (k < MaxLength - (*beam)) { + topk[k] = topk[k + *beam]; + } else { + topk[k].set(-static_cast(INFINITY), -1); + } + } + if (!(*is_empty)) { + GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, + length, largest); + } + } + + *max = topk[MaxLength - 1]; + if ((*max).v == -static_cast(1)) *is_empty = true; + *beam = 0; + } +} + +template +__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, + Pair topk[], T** topVal, + int64_t** topIds, int* beam, int* k, + const int tid, const int warp, + const bool& largest) { + while (true) { + __syncthreads(); + if (tid < BlockSize / 2) { + if (largest) { + if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } else { + if (sh_topk[tid] > sh_topk[tid + BlockSize / 2]) { + maxid[tid] = tid + BlockSize / 2; + } else { + maxid[tid] = tid; + } + } + } + __syncthreads(); + for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { + if (tid < stride) { + if (largest) { + if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } else { + if (sh_topk[maxid[tid]] > sh_topk[maxid[tid + stride]]) { + maxid[tid] = maxid[tid + stride]; + } + } + } + __syncthreads(); + } + __syncthreads(); + + if (tid == 0) { + **topVal = sh_topk[maxid[0]].v; + **topIds = sh_topk[maxid[0]].id; + (*topVal)++; + (*topIds)++; + } + if (tid == maxid[0]) (*beam)++; + if (--(*k) == 0) break; + __syncthreads(); + + if (tid == maxid[0]) { + if (*beam < MaxLength) { + sh_topk[tid] = topk[*beam]; + } + } + // NOTE(zcd): temporary solution + unsigned mask = 0u; + CREATE_SHFL_MASK(mask, true); + + if (maxid[0] / 32 == warp) { + if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) == + MaxLength) + break; + } + } +} + +/** + * Each block compute one sample. + * In a block: + * 1. every thread get top MaxLength value; + * 2. merge to sh_topk, block reduce and get max value; + * 3. go to the second setp, until one thread's topk value is null; + * 4. go to the first setp, until get the topk value. 
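+ * Note: the `largest` flag only flips the comparison direction, so with
+ * largest = false the same procedure yields the k smallest values.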
+ */ + +template +__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, + const T* src, int lds, int dim, int k, + int grid_dim, int num, bool largest = true) { + __shared__ Pair sh_topk[BlockSize]; + const int tid = threadIdx.x; + const int warp = threadIdx.x / 32; + + const int bid = blockIdx.x; + for (int i = bid; i < num; i += grid_dim) { + int top_num = k; + __shared__ int maxid[BlockSize / 2]; + T* out = output + i * output_stride; + int64_t* inds = indices + i * k; + Pair topk[MaxLength]; + int beam = MaxLength; + Pair max; + bool is_empty = false; + bool firststep = true; + + for (int j = 0; j < MaxLength; j++) { + if (largest) { + topk[j].set(-static_cast(INFINITY), -1); + } else { + topk[j].set(static_cast(INFINITY), -1); + } + } + while (top_num) { + ThreadGetTopK(topk, &beam, k, src + i * lds, + &firststep, &is_empty, &max, dim, + tid, largest); + + sh_topk[tid] = topk[0]; + BlockReduce(sh_topk, maxid, topk, &out, &inds, + &beam, &top_num, tid, warp, largest); + } + } +} + +template +__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, + size_t rows, size_t cols, size_t k) { + for (size_t i = 0; i < rows; ++i) { + for (size_t j = 0; j < cols; ++j) { + x_grad[i * cols + j] = 0; + } + for (size_t j = 0; j < k; ++j) { + size_t idx = indices[i * k + j]; + x_grad[i * cols + idx] = out_grad[i * k + j]; + } + } +} + +// the grad assign with the axis +template +__global__ void AssignGradWithAxis(const T* grad_out, const int64_t* indices, + T* grad_in, int pre, int post, + int raw_height, int k) { + // raw_height is the length of topk axis + for (int i = blockIdx.x; i < pre; i += gridDim.x) { + const int& base_index = i * post * k; + const int& base_grad = i * post * raw_height; + for (int j = threadIdx.x; j < raw_height * post; j += blockDim.x) { + grad_in[base_grad + j] = static_cast(0); + } + for (int j = threadIdx.x; j < k * post; j += blockDim.x) { + const int64_t idx_ij = indices[base_index + j]; + const int64_t in_ij = base_grad + (idx_ij * post) + (j % post); + grad_in[in_ij] = grad_out[idx_ij]; + } + } +} +// use the radix sort for the topk +template +bool SortTopk(const platform::CUDADeviceContext& ctx, + const framework::Tensor* input_tensor, const int64_t num_cols, + const int64_t num_rows, const int k, + framework::Tensor* out_tensor, framework::Tensor* indices_tensor, + bool largest = true) { + auto cu_stream = ctx.stream(); + + Tensor input_indices; + const std::vector dims = {num_rows, num_cols}; + auto dim = framework::make_ddim(dims); + input_indices.Resize(dim); + // input_indices.Resize(num_rows*num_cols); + input_indices.mutable_data(ctx.GetPlace()); + size_t temp_storage_bytes = -1; + + auto ComputeBlockSize = [](int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; + }; + int block_size = ComputeBlockSize(num_cols); + + unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; + // actually, int num_rows < max_grid_size + unsigned int grid_size = num_rows < maxGridDimX + ? 
static_cast(num_rows) + : maxGridDimX; + // Init a index array + InitIndex<<>>( + input_indices.data(), num_rows, num_cols); + + // create iter for counting input + cub::CountingInputIterator counting_iter(0); + // segment_offset is used for move to next row + cub::TransformInputIterator> + segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); + + T* sorted_values_ptr; + int64_t* sorted_indices_ptr; + + Tensor temp_values; + Tensor temp_indices; + + const T* input = input_tensor->data(); + T* values = out_tensor->data(); + int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); + + if (k == num_cols) { + // Doing a full sort. + sorted_values_ptr = values; + sorted_indices_ptr = indices; + } else { + temp_values.Resize(dim); + temp_indices.Resize(dim); + sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); + sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); + } + + // Get temp storage buffer size, maybe can allocate a fixed buffer to save + // time. + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) + << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + nullptr, temp_storage_bytes, input, sorted_values_ptr, + input_indices.data(), sorted_indices_ptr, num_cols * num_rows, + num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, + cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to calculate " + "temp_storage_bytes, status: " + << cudaGetErrorString(err); + return false; + } + } + Tensor temp_storage; + temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); + + if (largest) { + auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairsDescending to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } else { + auto err = cub::DeviceSegmentedRadixSort::SortPairs( + temp_storage.data(), temp_storage_bytes, input, + sorted_values_ptr, input_indices.data(), sorted_indices_ptr, + num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, + 0, sizeof(T) * 8, cu_stream); + if (err != cudaSuccess) { + LOG(ERROR) << "TopKOP failed as could not launch " + "cub::DeviceSegmentedRadixSort::SortPairs to " + "sort input, " + "temp_storage_bytes: " + << temp_storage_bytes + << ", status: " << cudaGetErrorString(err); + return false; + } + } + auto& dev = *ctx.eigen_device(); + if (k < num_cols) { + // copy sliced data to output. 
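+    // The segmented radix sort above fully sorted every row into temp_values /
+    // temp_indices, so when k < num_cols only the first k columns of each row
+    // are sliced out into the real output tensors.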
+ const Eigen::DSizes slice_indices{0, 0}; + const Eigen::DSizes slice_sizes{num_rows, k}; + auto e_indices = EigenMatrix::From(*indices_tensor, dim); + auto e_tmp_indices = EigenMatrix::From(temp_indices); + + std::vector odims = {static_cast(num_rows), static_cast(k)}; + auto dim = framework::make_ddim(odims); + auto e_values = EigenMatrix::From(*out_tensor, dim); + auto e_tmp_values = EigenMatrix::From(temp_values); + + e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); + e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); + } + return true; +} +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/top_k_op.cu b/paddle/fluid/operators/top_k_op.cu index 82ecc2887ba..d8b2e926160 100644 --- a/paddle/fluid/operators/top_k_op.cu +++ b/paddle/fluid/operators/top_k_op.cu @@ -12,474 +12,21 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once #include +#include #include "cub/cub.cuh" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" #include "paddle/fluid/operators/top_k_op.h" -#include "paddle/fluid/platform/cuda_device_function.h" #include "paddle/fluid/platform/float16.h" // set cub base traits in order to handle float16 -namespace cub { -template <> -struct NumericTraits - : BaseTraits {}; -} // namespace cub namespace paddle { namespace operators { using Tensor = framework::Tensor; -template -struct Pair { - __device__ __forceinline__ Pair() {} - __device__ __forceinline__ Pair(T value, int64_t id) : v(value), id(id) {} - - __device__ __forceinline__ void set(T value, int64_t id) { - v = value; - id = id; - } - - __device__ __forceinline__ void operator=(const Pair& in) { - v = in.v; - id = in.id; - } - - __device__ __forceinline__ bool operator<(const T value) const { - return (v < value); - } - - __device__ __forceinline__ bool operator<(const Pair& in) const { - return (v < in.v) || ((v == in.v) && (id > in.id)); - } - - __device__ __forceinline__ bool operator>(const Pair& in) const { - return (v > in.v) || ((v == in.v) && (id < in.id)); - } - - T v; - int64_t id; -}; - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p, - int beam_size) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void AddTo(Pair topk[], const Pair& p) { - for (int k = beam_size - 2; k >= 0; k--) { - if (topk[k] < p) { - topk[k + 1] = topk[k]; - } else { - topk[k + 1] = p; - return; - } - } - topk[0] = p; -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* src, int idx, - int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < src[idx]) { - Pair tmp(src[idx], idx); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], 
col[idx]); - AddTo(topk, tmp, beam_size); - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void GetTopK(Pair topk[], const T* val, int* col, - int idx, int dim, const Pair& max, - int beam_size) { - while (idx < dim) { - if (topk[beam_size - 1] < val[idx]) { - Pair tmp(val[idx], col[idx]); - if (tmp < max) { - AddTo(topk, tmp, beam_size); - } - } - idx += BlockSize; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* src, - bool* firstStep, bool* is_empty, - Pair* max, int dim, - const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, src, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - (*beam)) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, src, tid, dim, *max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -static_cast(1)) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void ThreadGetTopK(Pair topk[], int* beam, - int beam_size, const T* val, - int* col, bool* firstStep, - bool* is_empty, Pair* max, - int dim, const int tid) { - if (*beam > 0) { - int length = (*beam) < beam_size ? *beam : beam_size; - if (*firstStep) { - *firstStep = false; - GetTopK(topk, val, col, tid, dim, length); - } else { - for (int k = 0; k < MaxLength; k++) { - if (k < MaxLength - *beam) { - topk[k] = topk[k + *beam]; - } else { - topk[k].set(-static_cast(INFINITY), -1); - } - } - if (!(*is_empty)) { - GetTopK(topk + MaxLength - *beam, val, col, tid, dim, max, - length); - } - } - - *max = topk[MaxLength - 1]; - if ((*max).v == -1) *is_empty = true; - *beam = 0; - } -} - -template -__device__ __forceinline__ void BlockReduce(Pair* sh_topk, int* maxid, - Pair topk[], T** topVal, - int64_t** topIds, int* beam, int* k, - const int tid, const int warp) { - while (true) { - __syncthreads(); - if (tid < BlockSize / 2) { - if (sh_topk[tid] < sh_topk[tid + BlockSize / 2]) { - maxid[tid] = tid + BlockSize / 2; - } else { - maxid[tid] = tid; - } - } - __syncthreads(); - for (int stride = BlockSize / 4; stride > 0; stride = stride / 2) { - if (tid < stride) { - if (sh_topk[maxid[tid]] < sh_topk[maxid[tid + stride]]) { - maxid[tid] = maxid[tid + stride]; - } - } - __syncthreads(); - } - __syncthreads(); - - if (tid == 0) { - **topVal = sh_topk[maxid[0]].v; - **topIds = sh_topk[maxid[0]].id; - (*topVal)++; - (*topIds)++; - } - if (tid == maxid[0]) (*beam)++; - if (--(*k) == 0) break; - __syncthreads(); - - if (tid == maxid[0]) { - if (*beam < MaxLength) { - sh_topk[tid] = topk[*beam]; - } - } - // NOTE(zcd): temporary solution - unsigned mask = 0u; - CREATE_SHFL_MASK(mask, true); - - if (maxid[0] / 32 == warp) { - if (platform::CudaShuffleSync(mask, *beam, (maxid[0]) % 32, 32) == - MaxLength) - break; - } - } -} - -/** - * Each block compute one sample. - * In a block: - * 1. every thread get top MaxLength value; - * 2. merge to sh_topk, block reduce and get max value; - * 3. go to the second setp, until one thread's topk value is null; - * 4. go to the first setp, until get the topk value. 
- */ - -template -__global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices, - const T* src, int lds, int dim, int k, - int grid_dim, int num) { - __shared__ Pair sh_topk[BlockSize]; - const int tid = threadIdx.x; - const int warp = threadIdx.x / 32; - - const int bid = blockIdx.x; - for (int i = bid; i < num; i += grid_dim) { - int top_num = k; - __shared__ int maxid[BlockSize / 2]; - T* out = output + i * output_stride; - int64_t* inds = indices + i * k; - Pair topk[MaxLength]; - int beam = MaxLength; - Pair max; - bool is_empty = false; - bool firststep = true; - - for (int j = 0; j < MaxLength; j++) { - topk[j].set(-static_cast(INFINITY), -1); - } - while (top_num) { - ThreadGetTopK( - topk, &beam, k, src + i * lds, &firststep, &is_empty, &max, dim, tid); - - sh_topk[tid] = topk[0]; - BlockReduce(sh_topk, maxid, topk, &out, &inds, - &beam, &top_num, tid, warp); - } - } -} - -template -__global__ void AssignGrad(T* x_grad, const int64_t* indices, const T* out_grad, - size_t rows, size_t cols, size_t k) { - for (size_t i = 0; i < rows; ++i) { - for (size_t j = 0; j < cols; ++j) { - x_grad[i * cols + j] = 0; - } - for (size_t j = 0; j < k; ++j) { - size_t idx = indices[i * k + j]; - x_grad[i * cols + idx] = out_grad[i * k + j]; - } - } -} - -inline static int GetDesiredBlockDim(int dim) { - if (dim > 128) { - return 256; - } else if (dim > 64) { - return 128; - } else if (dim > 32) { - return 64; - } else { - return 32; - } -} - -// Iter for move to next row -struct SegmentOffsetIter { - EIGEN_DEVICE_FUNC - explicit SegmentOffsetIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()(int idx) const { - return idx * num_cols_; - } - - int num_cols_; -}; - -// Iter using into a column -struct ColumnIndexIter { - explicit ColumnIndexIter(int num_cols) : num_cols_(num_cols) {} - - EIGEN_DEVICE_FUNC EIGEN_STRONG_INLINE int operator()( - const Eigen::array& ix) const { - return ix[0] % num_cols_; - } - - int num_cols_; -}; - -__global__ void InitIndex(int64_t* indices, int64_t num_rows, - int64_t num_cols) { - int col_id = threadIdx.x; - int row_id = blockIdx.x; - - for (int64_t j = row_id; j < num_rows; j += gridDim.x) { - for (int64_t i = col_id; i < num_cols; i += blockDim.x) { - indices[j * num_cols + i] = i; - } - } -} - -template -bool SortTopk(const platform::CUDADeviceContext& ctx, - const framework::Tensor* input_tensor, const int64_t num_cols, - const int64_t num_rows, const int k, - framework::Tensor* out_tensor, - framework::Tensor* indices_tensor) { - auto cu_stream = ctx.stream(); - - Tensor input_indices; - const std::vector dims = {num_rows, num_cols}; - auto dim = framework::make_ddim(dims); - input_indices.Resize(dim); - // input_indices.Resize(num_rows*num_cols); - input_indices.mutable_data(ctx.GetPlace()); - size_t temp_storage_bytes = -1; - - auto ComputeBlockSize = [](int col) { - if (col > 512) - return 1024; - else if (col > 256 && col <= 512) - return 512; - else if (col > 128 && col <= 256) - return 256; - else if (col > 64 && col <= 128) - return 128; - else - return 64; - }; - - int block_size = ComputeBlockSize(num_cols); - - unsigned int maxGridDimX = ctx.GetCUDAMaxGridDimSize().x; - // actually, int num_rows < max_grid_size - unsigned int grid_size = num_rows < maxGridDimX - ? 
static_cast(num_rows) - : maxGridDimX; - // Init a index array - InitIndex<<>>( - input_indices.data(), num_rows, num_cols); - - // create iter for counting input - cub::CountingInputIterator counting_iter(0); - // segment_offset is used for move to next row - cub::TransformInputIterator> - segment_offsets_t(counting_iter, SegmentOffsetIter(num_cols)); - - T* sorted_values_ptr; - int64_t* sorted_indices_ptr; - - Tensor temp_values; - Tensor temp_indices; - - const T* input = input_tensor->data(); - T* values = out_tensor->data(); - int64_t* indices = indices_tensor->mutable_data(ctx.GetPlace()); - - if (k == num_cols) { - // Doing a full sort. - sorted_values_ptr = values; - sorted_indices_ptr = indices; - } else { - temp_values.Resize(dim); - temp_indices.Resize(dim); - sorted_values_ptr = temp_values.mutable_data(ctx.GetPlace()); - sorted_indices_ptr = temp_indices.mutable_data(ctx.GetPlace()); - } - - // Get temp storage buffer size, maybe can allocate a fixed buffer to save - // time. - auto err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, input, sorted_values_ptr, - input_indices.data(), sorted_indices_ptr, num_cols * num_rows, - num_rows, segment_offsets_t, segment_offsets_t + 1, 0, sizeof(T) * 8, - cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to calculate " - "temp_storage_bytes, status: " - << cudaGetErrorString(err); - return false; - } - Tensor temp_storage; - temp_storage.mutable_data(ctx.GetPlace(), temp_storage_bytes); - - err = cub::DeviceSegmentedRadixSort::SortPairsDescending( - temp_storage.data(), temp_storage_bytes, input, - sorted_values_ptr, input_indices.data(), sorted_indices_ptr, - num_cols * num_rows, num_rows, segment_offsets_t, segment_offsets_t + 1, - 0, sizeof(T) * 8, cu_stream); - if (err != cudaSuccess) { - LOG(ERROR) - << "TopKOP failed as could not launch " - "cub::DeviceSegmentedRadixSort::SortPairsDescending to sort input, " - "temp_storage_bytes: " - << temp_storage_bytes << ", status: " << cudaGetErrorString(err); - return false; - } - auto& dev = *ctx.eigen_device(); - if (k < num_cols) { - // copy sliced data to output. - const Eigen::DSizes slice_indices{0, 0}; - const Eigen::DSizes slice_sizes{num_rows, k}; - auto e_indices = EigenMatrix::From(*indices_tensor, dim); - auto e_tmp_indices = EigenMatrix::From(temp_indices); - - std::vector odims = {static_cast(num_rows), static_cast(k)}; - auto dim = framework::make_ddim(odims); - auto e_values = EigenMatrix::From(*out_tensor, dim); - auto e_tmp_values = EigenMatrix::From(temp_values); - - e_indices.device(dev) = e_tmp_indices.slice(slice_indices, slice_sizes); - e_values.device(dev) = e_tmp_values.slice(slice_indices, slice_sizes); - } - return true; -} - #define FIXED_BLOCK_DIM_BASE(dim, ...) 
\ case (dim): { \ constexpr auto kBlockDim = (dim); \ @@ -523,7 +70,6 @@ class TopkOpCUDAKernel : public framework::OpKernel { framework::slice_ddim(inputdims, 0, inputdims.size() - 1)); const int64_t input_width = inputdims[inputdims.size() - 1]; const auto& dev_ctx = ctx.cuda_device_context(); - if ((input_width <= 1024 || k >= 128 || k == input_width)) { if (SortTopk(dev_ctx, input, input_width, input_height, k, output, indices)) { @@ -576,7 +122,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { framework::product(framework::slice_ddim(xdims, 0, xdims.size() - 1)); const size_t col = xdims[xdims.size() - 1]; const auto& dev_ctx = context.cuda_device_context(); - const int kMaxHeight = 2048; int gridx = row < kMaxHeight ? row : kMaxHeight; switch (GetDesiredBlockDim(col)) { @@ -595,7 +140,6 @@ class TopkOpGradCUDAKernel : public framework::OpKernel { } // namespace operators } // namespace paddle - REGISTER_OP_CUDA_KERNEL( top_k, paddle::operators::TopkOpCUDAKernel + +namespace paddle { +namespace operators { + +class TopkV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of TopkOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Indices"), + "Output(Indices) of TopkOp should not be null."); + + auto input_dims = ctx->GetInputDim("X"); + const int& dim_size = input_dims.size(); + const int k = static_cast(ctx->Attrs().Get("k")); + int axis = static_cast(ctx->Attrs().Get("axis")); + PADDLE_ENFORCE_EQ((axis < dim_size) && (axis >= (-1 * dim_size)), true, + "the axis of topk" + "must be [-%d, %d), but you set axis is %d", + dim_size, dim_size, axis); + + if (axis < 0) axis += dim_size; + + PADDLE_ENFORCE_GE( + k, 1, "the attribute of k in the topk must >= 1, but received %d .", k); + PADDLE_ENFORCE_GE(input_dims.size(), 1, + "input of topk must have >= 1d shape"); + + if (ctx->IsRuntime()) { + PADDLE_ENFORCE_GE( + input_dims[axis], k, + "input of topk op must have >= %d columns in axis of %d", k, axis); + } + + framework::DDim dims = input_dims; + + dims[axis] = k; + ctx->SetOutputDim("Out", dims); + ctx->SetOutputDim("Indices", dims); + ctx->ShareLoD("X", "Out"); + ctx->ShareLoD("X", "Indices"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context(), + layout_, library_); + } +}; + +class TopkV2OpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "(Tensor) The input of Topk op"); + AddInput("K", + "(Tensor) Number of top elements to look for along " + "the last dimension (along each row for matrices).") + .AsDispensable(); + AddOutput("Out", "(Tensor) The output tensor of Topk op"); + AddOutput("Indices", "(Tensor) The indices of Topk elements of input"); + AddComment(R"DOC( +Top K operator + +If the input is a vector (1d tensor), this operator finds the k largest +entries in the vector and outputs their values and indices as vectors. +Thus values[j] is the j-th largest entry in input, and its index is indices[j]. 
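+For example, with the default largest=true, the input [1, 3, 2] and k = 2
+produce the values [3, 2] and the indices [1, 2].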
+ +For matrices, this operator computes the top k entries in each row. )DOC"); + AddAttr("k", + "(int, default 1) Number of top elements to look for along " + "the tensor).") + .SetDefault(1); + AddAttr("axis", + "the axis to sort and get the k indices, value." + "if not set, will get k value in last axis.") + .SetDefault(-1); + AddAttr("largest", + "control flag whether to return largest or smallest") + .SetDefault(true); + AddAttr("sorted", + "control flag whether to return elements in sorted order") + .SetDefault(true); + } +}; + +class TopkV2OpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("X"), true, + platform::errors::InvalidArgument("Input(X) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Indices"), true, + platform::errors::InvalidArgument("Input(Indices) should be not null")); + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Out")), true, + platform::errors::InvalidArgument( + "Grad Input(Out) should be not null")); + PADDLE_ENFORCE_EQ( + ctx->HasOutput(framework::GradVarName("X")), true, + platform::errors::InvalidArgument("Grad Output(X) should be not null")); + + auto x_dims = ctx->GetInputDim("X"); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +template +class TopkV2GradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("top_k_v2_grad"); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput("X", this->Input("X")); + op->SetInput("Indices", this->Output("Indices")); + op->SetOutput(framework::GradVarName("X"), this->InputGrad("X")); + op->SetAttrMap(this->Attrs()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(top_k_v2, ops::TopkV2Op, ops::TopkV2OpMaker, + ops::TopkV2GradOpMaker, + ops::TopkV2GradOpMaker); + +REGISTER_OPERATOR(top_k_v2_grad, ops::TopkV2OpGrad); + +REGISTER_OP_CPU_KERNEL(top_k_v2, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel, + ops::TopkV2Kernel) + +REGISTER_OP_CPU_KERNEL( + top_k_v2_grad, ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel, + ops::TopkV2GradKernel) diff --git a/paddle/fluid/operators/top_k_v2_op.cu b/paddle/fluid/operators/top_k_v2_op.cu new file mode 100644 index 00000000000..51545032920 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.cu @@ -0,0 +1,272 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/p_norm_op.h" +#include "paddle/fluid/operators/top_k_function_cuda.h" +#include "paddle/fluid/operators/top_k_v2_op.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +#define FIXED_BLOCK_DIM_BASE(dim, ...) \ + case (dim): { \ + constexpr auto kBlockDim = (dim); \ + __VA_ARGS__; \ + } break + +#define FIXED_BLOCK_DIM(...) \ + FIXED_BLOCK_DIM_BASE(256, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(128, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(64, ##__VA_ARGS__); \ + FIXED_BLOCK_DIM_BASE(32, ##__VA_ARGS__) + +template +class TopkV2OpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_ENFORCE(platform::is_gpu_place(ctx.GetPlace()), + "It must use CUDAPlace."); + auto* input = ctx.Input("X"); + auto* output = ctx.Output("Out"); + auto* indices = ctx.Output("Indices"); + + // get the attributes + int k = static_cast(ctx.Attr("k")); + int axis = static_cast(ctx.Attr("axis")); + const bool& sorted = static_cast(ctx.Attr("sorted")); + const bool& largest = static_cast(ctx.Attr("largest")); + + // get the input dims + const auto& in_dims = input->dims(); + // calcluate the real axis + if (axis < 0) axis += in_dims.size(); + + auto* k_t = ctx.Input("K"); + if (k_t) { + Tensor k_host; + framework::TensorCopySync(*k_t, platform::CPUPlace(), &k_host); + k = k_host.data()[0]; + framework::DDim output_dims = output->dims(); + output_dims[axis] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + + const auto& out_dims = output->dims(); + + const T* input_data = input->data(); + T* output_data = output->mutable_data(ctx.GetPlace()); + int64_t* indices_data = indices->mutable_data(ctx.GetPlace()); + + if (axis == in_dims.size() - 1) { + // if get the topK from the last axis + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + const auto& dev_ctx = ctx.cuda_device_context(); + + if (k > input_width) k = input_width; + + if ((input_width <= 1024 || k >= 128 || k == input_width)) { + if (SortTopk(dev_ctx, input, input_width, input_height, k, output, + indices, largest)) { + // Successed, return. + return; + } else { + LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " + "default topk kernel."; + } + } + + // NOTE: pass lds and dim same to input width. + // NOTE: old matrix implementation of stride is different to eigen. + const int kMaxHeight = 2048; + int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; + switch (GetDesiredBlockDim(input_width)) { + FIXED_BLOCK_DIM( + KeMatrixTopK<<>>( + output_data, k, indices_data, input_data, input_width, + input_width, static_cast(k), gridx, input_height, + largest)); + default: + PADDLE_THROW(platform::errors::Fatal( + "the input data shape has error in the topk cuda kernel.")); + } + } else { + // if get topK not from the last axis, will tranpose the tensor and get + // TopK + + // first step, prepare the trans args for the tranpose + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(output->dims()); + for (int i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + trans_out_dims[i] = out_dims[trans[i]]; + } + // second step, tranpose the input + Tensor trans_input; + trans_input.mutable_data(trans_dims, ctx.GetPlace()); + int ndims = trans.size(); + const auto& dev_ctx = ctx.cuda_device_context(); + TransCompute(ndims, dev_ctx, *input, + &trans_input, trans); + // third step, calcluate the topk + // allocate the tmp cuda memory for the tmp result + Tensor trans_ind; + trans_ind.mutable_data(trans_out_dims, ctx.GetPlace()); + Tensor trans_out; + trans_out.mutable_data(trans_out_dims, ctx.GetPlace()); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + + if (k > input_width) k = input_width; + + if ((input_width <= 1024 || k >= 128 || k == input_width)) { + if (SortTopk(dev_ctx, &trans_input, input_width, input_height, k, + &trans_out, &trans_ind, largest)) { + // last step, tranpose back the indices and output + TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + TransCompute( + ndims, dev_ctx, trans_out, output, trans); + return; + } else { + LOG(INFO) << "TopKOP: Some errors happened when use cub sorting, use " + "default topk kernel."; + } + } + + const int kMaxHeight = 2048; + int gridx = input_height < kMaxHeight ? 
input_height : kMaxHeight; + switch (GetDesiredBlockDim(input_width)) { + FIXED_BLOCK_DIM( + KeMatrixTopK<<>>( + trans_out.data(), k, trans_ind.data(), + trans_input.data(), input_width, input_width, + static_cast(k), gridx, input_height, largest)); + default: + PADDLE_THROW(platform::errors::Fatal( + "the input data shape has error in the topk cuda kernel.")); + } + + // last step, tranpose back the indices and output + TransCompute( + ndims, dev_ctx, trans_ind, indices, trans); + TransCompute(ndims, dev_ctx, trans_out, + output, trans); + } + } +}; + +#undef FIXED_BLOCK_DIM_BASE +#undef FIXED_BLOCK_DIM +template +class TopkV2OpGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + PADDLE_ENFORCE_EQ( + platform::is_gpu_place(context.GetPlace()), true, + platform::errors::InvalidArgument("It must use CUDAPlace.")); + auto* x = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = context.Output(framework::GradVarName("X")); + int axis = context.Attr("axis"); + + const auto& in_dims = x->dims(); + const auto& out_dims = indices->dims(); + + // get the real the axis and the k + if (axis < 0) axis += in_dims.size(); + const int& k = out_dims[axis]; + const int& raw_height = in_dims[axis]; + + // allocate the cuda memory for the x_grad + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + const T* out_grad_data = out_grad->data(); + const int64_t* indices_data = indices->data(); + + int pre, n, post; + GetDims(in_dims, axis, &pre, &n, &post); + + // calcluate the block and grid num + auto& dev_ctx = context.cuda_device_context(); + auto ComputeBlockSize = [](int col) { + if (col > 512) + return 1024; + else if (col > 256 && col <= 512) + return 512; + else if (col > 128 && col <= 256) + return 256; + else if (col > 64 && col <= 128) + return 128; + else + return 64; + }; + int block_size = ComputeBlockSize(post * k); + int max_threads = dev_ctx.GetMaxPhysicalThreadCount(); + const int max_blocks = std::max(((max_threads - 1) / block_size + 1), 1); + int grid_size = std::min(max_blocks, pre); + + // lanuch the cuda kernel to assign the grad + AssignGradWithAxis<<>>( + out_grad_data, indices_data, x_grad_data, pre, post, n, k); + } +}; + +} // namespace operators +} // namespace paddle + +REGISTER_OP_CUDA_KERNEL( + top_k_v2, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel, + paddle::operators::TopkV2OpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL( + top_k_v2_grad, paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, float>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, double>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, int>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, int64_t>, + paddle::operators::TopkV2OpGradCUDAKernel< + paddle::platform::CUDADeviceContext, paddle::platform::float16>); diff --git a/paddle/fluid/operators/top_k_v2_op.h b/paddle/fluid/operators/top_k_v2_op.h new file mode 100644 index 00000000000..a77285d1236 --- /dev/null +++ b/paddle/fluid/operators/top_k_v2_op.h @@ -0,0 +1,321 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +/* + The reason why we need the topk v2 is because the compatibility. We redefine + the NaN is maximum value + in the process of comparing. If do not add the topk v2, will affect the + inference result of model that traing + by the older version paddlepaddle. +*/ + +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/top_k_op.h" +#include "paddle/fluid/operators/transpose_op.h" + +namespace paddle { +namespace operators { + +template +static void FullTopK(Type input_height, Type input_width, int input_dim, + const framework::Tensor* input, T* t_out, Type* t_indices, + const int& k, const bool& largest, const bool& sorted) { + // when the k is small, will the partial sort + bool partial_sort_flag = (k * 64) < input_width; + +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + // Eigen::DSizes flat2dims(input_height, input_width); + for (Type i = 0; i < input_height; ++i) { + std::vector> col_vec; + col_vec.reserve(input_width); + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(j), j)); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + for (Type j = 0; j < input_width; ++j) { + col_vec.emplace_back(std::pair(e_input(i, j), j)); + } + } + if (partial_sort_flag) { + std::partial_sort( + col_vec.begin(), col_vec.begin() + k, col_vec.end(), + [&largest](const std::pair& l, const std::pair& r) { + if (largest) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + } else { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + } + }); + } else { + // use the nth-element to get the K-larger or K-small element + if (largest) { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + // the nth-element will get the unorder elements, sort the element + if (sorted) { + std::sort(col_vec.begin(), col_vec.begin() + k - 1, + [&largest](const std::pair& l, + const std::pair& r) { + return (std::isnan(static_cast(l.first)) && + !std::isnan(static_cast(r.first))) || + (l.first > r.first); + }); + } + } else { + std::nth_element( + col_vec.begin(), col_vec.begin() + k - 1, col_vec.end(), + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + (l.first < r.first); + }); + // the nth-element will get the unorder elements, sort the element + if (sorted) { + std::sort( + col_vec.begin(), col_vec.begin() + k - 1, + [](const std::pair& l, const std::pair& r) { + return (!std::isnan(static_cast(l.first)) && + std::isnan(static_cast(r.first))) || + 
(l.first < r.first); + }); + } + } + } + for (Type j = 0; j < k; ++j) { + t_out[i * k + j] = col_vec[j].first; + t_indices[i * k + j] = col_vec[j].second; + } + } +} + +template +static void FullTopKAssign(const Type& input_height, const Type& input_width, + const int& input_dim, const framework::Tensor* input, + const framework::Tensor* indices, T* output_data, + const int& k) { +#ifdef PADDLE_WITH_MKLML +#pragma omp parallel for +#endif + for (Type i = 0; i < input_height; ++i) { + if (input_dim == 1) { + auto e_input = EigenVector::Flatten(*input); + auto e_indices = EigenVector::Flatten(*indices); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(j)] = e_input(j); + } + } else { + auto e_input = EigenMatrix::Reshape(*input, input_dim - 1); + auto e_indices = EigenMatrix::Reshape(*indices, input_dim - 1); + for (Type j = 0; j < k; ++j) { + output_data[i * input_width + e_indices(i, j)] = e_input(i, j); + } + } + } +} + +template +class TopkV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + // Get the top k elements of each row of input tensor + auto* input = context.Input("X"); + auto* output = context.Output("Out"); + auto* indices = context.Output("Indices"); + const auto& in_dims = input->dims(); + int k = static_cast(context.Attr("k")); + const auto& sorted = static_cast(context.Attr("sorted")); + const auto& largest = static_cast(context.Attr("largest")); + + // axis < 0, cacluate the real axis + int axis = static_cast(context.Attr("axis")); + if (axis < 0) axis += in_dims.size(); + + // if K tensor is not null, will the use K tesnor as k + auto* k_t = context.Input("K"); + if (k_t) { + k = k_t->data()[0]; + framework::DDim output_dims = output->dims(); + // accroding to axis to set K value in the dim + output_dims[axis] = k; + output->Resize(output_dims); + indices->Resize(output_dims); + } + + T* output_data = output->mutable_data(context.GetPlace()); + int64_t* indices_data = indices->mutable_data(context.GetPlace()); + const auto& out_dims = output->dims(); + if (axis + 1 == in_dims.size()) { + const int64_t& input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t& input_width = in_dims[in_dims.size() - 1]; + FullTopK(input_height, input_width, in_dims.size(), input, + output_data, indices_data, k, largest, sorted); + } else { + // if the topk dims is not last dim, will tranpose and do topk + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.push_back(in_dims.size() - 1); + for (int i = axis + 1; i < in_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + + // get the trans input_dims, out_dims + framework::DDim trans_dims(in_dims); + framework::DDim trans_out_dims(output->dims()); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = in_dims[trans[i]]; + } + for (size_t i = 0; i < trans.size(); i++) { + trans_out_dims[i] = out_dims[trans[i]]; + } + + Tensor trans_inp; + trans_inp.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // transpose the input value + TransCompute(ndims, dev_context, *input, + &trans_inp, trans); + + const int64_t input_height = framework::product( + framework::slice_ddim(trans_dims, 0, trans_dims.size() - 1)); + const int64_t input_width = trans_dims[trans_dims.size() - 1]; + + // Allocate the temp tensor to the save the topk indices, 
values + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_out_dims, context.GetPlace()); + Tensor tmp_indices; + auto* t_ind = + tmp_indices.mutable_data(trans_out_dims, context.GetPlace()); + + // get the TopK value + FullTopK(input_height, input_width, in_dims.size(), + &trans_inp, t_out, t_ind, k, largest, sorted); + // transpose back + TransCompute( + ndims, dev_context, tmp_indices, indices, trans); + TransCompute(ndims, dev_context, tmp_out, + output, trans); + } + } +}; + +template +class TopkV2GradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out_grad = context.Input(framework::GradVarName("Out")); + auto* indices = context.Input("Indices"); + auto* x_grad = context.Output(framework::GradVarName("X")); + int axis = static_cast(context.Attr("axis")); + + const auto& in_dims = x->dims(); + const auto& out_dims = indices->dims(); + + // axis < 0, get the real axis + axis = (axis < 0) ? (in_dims.size() + axis) : axis; + const size_t& k = out_dims[axis]; + + T* x_grad_data = x_grad->mutable_data(context.GetPlace()); + if (axis + 1 == in_dims.size()) { + // allocate the memory for the input_grad + + // assign the out_grad to input_grad directly + const int64_t input_height = framework::product( + framework::slice_ddim(in_dims, 0, in_dims.size() - 1)); + const int64_t input_width = in_dims[in_dims.size() - 1]; + + // init the output grad with 0, because some input elements has no grad + memset(x_grad_data, 0, x_grad->numel() * sizeof(T)); + // Assign the output_grad to input_grad + FullTopKAssign(input_height, input_width, in_dims.size(), out_grad, + indices, x_grad_data, k); + } else { + // can not assign grad to input_grad, must do the transpose + std::vector trans; + for (int i = 0; i < axis; i++) { + trans.emplace_back(i); + } + trans.emplace_back(out_dims.size() - 1); + for (int i = axis + 1; i < out_dims.size() - 1; i++) { + trans.emplace_back(i); + } + trans.emplace_back(axis); + framework::DDim trans_dims(out_dims); + framework::DDim trans_in_dims(in_dims); + for (size_t i = 0; i < trans.size(); i++) { + trans_dims[i] = out_dims[trans[i]]; + trans_in_dims[i] = in_dims[trans[i]]; + } + // transpose the out_grad, indices + Tensor trans_dO; + trans_dO.mutable_data(trans_dims, context.GetPlace()); + Tensor trans_ind; + trans_ind.mutable_data(trans_dims, context.GetPlace()); + int ndims = trans.size(); + auto& dev_context = + context.template device_context(); + + // Do transpose + TransCompute(ndims, dev_context, *out_grad, + &trans_dO, trans); + TransCompute( + ndims, dev_context, *indices, &trans_ind, trans); + const int64_t input_height = framework::product( + framework::slice_ddim(trans_in_dims, 0, trans_in_dims.size() - 1)); + const int64_t input_width = trans_in_dims[trans_in_dims.size() - 1]; + + // Assign the out_grad to tranpose input_grad + Tensor tmp_out; + T* t_out = tmp_out.mutable_data(trans_in_dims, context.GetPlace()); + memset(t_out, 0, x_grad->numel() * sizeof(T)); + + FullTopKAssign(input_height, input_width, in_dims.size(), + &trans_dO, &trans_ind, t_out, k); + + // Transpose back + TransCompute(ndims, dev_context, tmp_out, + x_grad, trans); + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py new file mode 100644 index 00000000000..54e7765c0fb --- /dev/null +++ 
b/python/paddle/fluid/tests/unittests/test_top_k_v2_op.py @@ -0,0 +1,244 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import paddle +import paddle.fluid.core as core + + +def numpy_topk(x, k=1, axis=-1, largest=True): + if axis < 0: + axis = len(x.shape) + axis + if largest: + indices = np.argsort(-x, axis=axis) + else: + indices = np.argsort(x, axis=axis) + if largest: + value = -np.sort(-x, axis=axis) + else: + value = np.sort(x, axis=axis) + indices = indices.take(indices=range(0, k), axis=axis) + value = value.take(indices=range(0, k), axis=axis) + return value, indices + + +class TestTopkOp(OpTest): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float64 + self.input_data = np.random.rand(10, 20) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + def test_check_output(self): + paddle.enable_static() + self.check_output() + + def test_check_grad(self): + paddle.enable_static() + self.check_grad(set(['X']), 'Out') + + +class TestTopOp1(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 0 + self.largest = True + + +class TestTopOp2(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 0 + self.largest = False + + +class TestTopOp3(TestTopkOp): + def init_args(self): + self.k = 4 + self.axis = 0 + self.largest = False + + +class TestTopOp4(TestTopkOp): + def init_args(self): + self.k = 4 + self.axis = 0 + self.largest = False + + +class TestTopkOp5(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float64 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopkOp6(TestTopkOp): + def init_args(self): + self.k = 3 + self.axis = 1 + self.largest = True + + def setUp(self): + self.op_type = "top_k_v2" + self.dtype = np.float64 + self.input_data = np.random.rand(10, 10, 5) + self.init_args() + self.inputs = {'X': self.input_data} + self.attrs = {'k': self.k, 'axis': self.axis, 'largest': self.largest} + output, indices = numpy_topk( + self.input_data, axis=self.axis, k=self.k, largest=self.largest) + self.outputs = {'Out': output, 'Indices': indices} + + +class TestTopKAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + self.input_data = np.random.rand(6, 7, 8) + 
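+        # 1030 columns with a small k exercises the partial-sort branch of the
+        # CPU kernel and the non-cub fallback path on the GPU (width > 1024)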
self.large_input_data = np.random.rand(2, 1030) + + def run_dygraph(self, place): + paddle.disable_static(place) + input_tensor = paddle.to_tensor(self.input_data) + large_input_tensor = paddle.to_tensor(self.large_input_data) + # test case for basic test case 1 + paddle_result = paddle.topk(input_tensor, k=2) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 2 with axis + paddle_result = paddle.topk(input_tensor, k=2, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 3 with tensor K + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=k_tensor, axis=1) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 4 with tensor largest + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 5 with axis -1 + k_tensor = paddle.to_tensor(np.array([2])) + paddle_result = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + numpy_result = numpy_topk(self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 6 for the partial sort + paddle_result = paddle.topk(large_input_tensor, k=1, axis=-1) + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[0].numpy(), numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1].numpy(), numpy_result[1])) + # test case for basic test case 7 for the unsorted + paddle_result = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + sort_paddle = numpy_topk( + np.array(paddle_result[0].numpy()), axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def run_static(self, place): + paddle.enable_static() + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_tensor = paddle.static.data( + name="x", shape=[6, 7, 8], dtype="float64") + large_input_tensor = paddle.static.data( + name="large_x", shape=[2, 1030], dtype="float64") + k_tensor = paddle.static.data(name="k", shape=[1], dtype="int32") + result1 = paddle.topk(input_tensor, k=2) + result2 = paddle.topk(input_tensor, k=2, axis=-1) + result3 = paddle.topk(input_tensor, k=k_tensor, axis=1) + result4 = paddle.topk(input_tensor, k=2, axis=1, largest=False) + result5 = paddle.topk(input_tensor, k=2, axis=-1, largest=False) + result6 = paddle.topk(large_input_tensor, k=1, axis=-1) + result7 = paddle.topk(input_tensor, k=2, axis=1, sorted=False) + exe = paddle.static.Executor(place) + input_data = np.random.rand(10, 20).astype("float64") + large_input_data = 
np.random.rand(2, 100).astype("float64") + paddle_result = exe.run( + feed={ + "x": self.input_data, + "large_x": self.large_input_data, + "k": np.array([2]).astype("int32") + }, + fetch_list=[ + result1[0], result1[1], result2[0], result2[1], result3[0], + result3[1], result4[0], result4[1], result5[0], result5[1], + result6[0], result6[1], result7[0], result7[1] + ]) + numpy_result = numpy_topk(self.input_data, k=2) + self.assertTrue(np.allclose(paddle_result[0], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[1], numpy_result[1])) + numpy_result = numpy_topk(self.input_data, k=2, axis=-1) + self.assertTrue(np.allclose(paddle_result[2], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[3], numpy_result[1])) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(paddle_result[4], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[5], numpy_result[1])) + numpy_result = numpy_topk( + self.input_data, k=2, axis=1, largest=False) + self.assertTrue(np.allclose(paddle_result[6], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[7], numpy_result[1])) + numpy_result = numpy_topk( + self.input_data, k=2, axis=-1, largest=False) + self.assertTrue(np.allclose(paddle_result[8], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[9], numpy_result[1])) + numpy_result = numpy_topk(self.large_input_data, k=1, axis=-1) + self.assertTrue(np.allclose(paddle_result[10], numpy_result[0])) + self.assertTrue(np.allclose(paddle_result[11], numpy_result[1])) + sort_paddle = numpy_topk(paddle_result[12], axis=1, k=2) + numpy_result = numpy_topk(self.input_data, k=2, axis=1) + self.assertTrue(np.allclose(sort_paddle[0], numpy_result[0])) + + def test_cases(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + for place in places: + self.run_dygraph(place) + self.run_static(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 91ad3bfa9cc..eede022e05b 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -21,7 +21,6 @@ from ..fluid import core, layers from ..fluid.layers import argmin #DEFINE_ALIAS from ..fluid.layers import has_inf #DEFINE_ALIAS from ..fluid.layers import has_nan #DEFINE_ALIAS -from ..fluid.layers import topk #DEFINE_ALIAS __all__ = [ 'argmax', @@ -756,3 +755,100 @@ def masked_select(x, mask, name=None): type='masked_select', inputs={'X': x, 'Mask': mask}, outputs={'Y': out}) return out + + +def topk(x, k, axis=None, largest=True, sorted=True, name=None): + """ + This OP is used to find the values and indices of the k largest or smallest elements along the given axis. + If the input is a 1-D Tensor, finds the k largest or smallest values and indices. + If the input is a Tensor with higher rank, this operator computes the top k values and indices along the :attr:`axis`. + + Args: + x(Tensor): An input N-D Tensor with type float32, float64, int32, int64. + k(int, Tensor): The number of top elements to look for along the axis. + axis(int, optional): Axis to compute indices along. The effective range + is [-R, R), where R is x.ndim. When axis < 0, it works the same way + as axis + R. Default is -1. + largest(bool, optional): a flag indicating whether to return the k largest + elements; if set to False, the k smallest elements are returned. + Default is True.
+ sorted(bool, optional): controls whether to return the elements in sorted order. Default is True. On the GPU, the values are always returned in sorted order. + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. + + Returns: + tuple(Tensor), the values and indices. The value data type is the same as the input `x`. The indices data type is int64. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + + paddle.disable_static() + + data_1 = np.array([1, 4, 5, 7]) + tensor_1 = paddle.to_tensor(data_1) + value_1, indices_1 = paddle.topk(tensor_1, k=1) + print(value_1.numpy()) + # [7] + print(indices_1.numpy()) + # [3] + data_2 = np.array([[1, 4, 5, 7], [2, 6, 2, 5]]) + tensor_2 = paddle.to_tensor(data_2) + value_2, indices_2 = paddle.topk(tensor_2, k=1) + print(value_2.numpy()) + # [[7] + # [6]] + print(indices_2.numpy()) + # [[3] + # [1]] + value_3, indices_3 = paddle.topk(tensor_2, k=1, axis=-1) + print(value_3.numpy()) + # [[7] + # [6]] + print(indices_3.numpy()) + # [[3] + # [1]] + value_4, indices_4 = paddle.topk(tensor_2, k=1, axis=0) + print(value_4.numpy()) + # [[2 6 5 7]] + print(indices_4.numpy()) + # [[1 1 0 0]] + + """ + if in_dygraph_mode(): + k = k.numpy().item(0) if isinstance(k, Variable) else k + if axis is None: + out, indices = core.ops.top_k_v2(x, 'k', + int(k), 'largest', largest, + 'sorted', sorted) + else: + out, indices = core.ops.top_k_v2(x, 'k', + int(k), 'axis', axis, 'largest', + largest, 'sorted', sorted) + return out, indices + + helper = LayerHelper("top_k_v2", **locals()) + inputs = {"X": [x]} + attrs = {} + if isinstance(k, Variable): + inputs['K'] = [k] + else: + attrs = {'k': k} + attrs['largest'] = largest + attrs['sorted'] = sorted + if axis is not None: + attrs['axis'] = axis + + values = helper.create_variable_for_type_inference(dtype=x.dtype) + indices = helper.create_variable_for_type_inference(dtype="int64") + + helper.append_op( + type="top_k_v2", + inputs=inputs, + outputs={"Out": [values], + "Indices": [indices]}, + attrs=attrs) + indices.stop_gradient = True + return values, indices -- GitLab
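
Note (illustration only, not part of the patch): the TopkV2GradKernel in top_k_v2_op.h zero-fills the input gradient and then scatters the top-k output gradient back through the saved indices (FullTopKAssign), transposing first when the reduction axis is not the last one. A minimal NumPy sketch of that backward rule follows; the helper name topk_grad_reference is hypothetical and chosen only for this illustration.

import numpy as np

def topk_grad_reference(out_grad, indices, x_shape, axis=-1):
    # x_grad is zero everywhere except at the positions selected in the
    # forward pass; those positions receive the corresponding out_grad.
    if axis < 0:
        axis += len(x_shape)
    x_grad = np.zeros(x_shape, dtype=out_grad.dtype)
    # np.put_along_axis plays the role of FullTopKAssign: it scatters
    # out_grad into x_grad at the saved top-k indices along `axis`,
    # so no explicit transpose is needed in this NumPy version.
    np.put_along_axis(x_grad, indices, out_grad, axis=axis)
    return x_grad

# usage: a plain NumPy top-2 forward pass, then the backward rule
x = np.random.rand(4, 6)
values = -np.sort(-x, axis=1)[:, :2]
idx = np.argsort(-x, axis=1)[:, :2]
x_grad = topk_grad_reference(np.ones_like(values), idx, x.shape, axis=1)
assert (x_grad.sum(axis=1) == 2).all()  # exactly k=2 entries per row receive gradient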