Unverified commit 2b879a69, authored by 六个骨头, committed by GitHub

[Hackathon No.21] Add the paddle.incubate.sparse.transpose sparse API to Paddle (#45849)

Parent 4b9dae01
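This commit wires the new op end to end: op and backward YAML entries, InferMeta declarations, CPU and GPU kernels for the COO and CSR formats, C++ dev-API tests, and the Python wrapper. As a quick orientation, a minimal dygraph usage sketch of the added API (assuming a Paddle build that contains this commit):

import paddle

# Build a small 2-D sparse COO tensor and swap its two dimensions.
dense_x = paddle.to_tensor([[0., 2.], [1., 0.]])
sparse_x = dense_x.to_sparse_coo(2)
out = paddle.incubate.sparse.transpose(sparse_x, [1, 0])
print(out.to_dense())  # [[0., 1.], [2., 0.]]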
@@ -385,6 +385,17 @@
kernel :
func : coo_to_dense { sparse_coo -> dense }
- backward_op : transpose_grad
forward : transpose(Tensor x, int[] perm) -> Tensor(out)
args : (Tensor out_grad, int[] perm)
output : Tensor(x_grad)
infer_meta :
func : TransposeGradInferMeta
param : [out_grad, perm]
kernel :
func : transpose_coo_grad {sparse_coo -> sparse_coo},
transpose_csr_grad {sparse_csr -> sparse_csr}
- backward_op : values_grad
forward : values_coo(Tensor x) -> Tensor(out)
args : (Tensor x, Tensor out_grad)
......
@@ -457,3 +457,15 @@
mv_csr{sparse_csr, dense -> dense}
layout : x
backward: mv_grad
- op : transpose
args : (Tensor x, int[] perm)
output : Tensor(out)
infer_meta :
func : TransposeInferMeta
param: [ x, perm ]
kernel :
func : transpose_coo{sparse_coo -> sparse_coo},
transpose_csr{sparse_csr -> sparse_csr}
layout : x
backward : transpose_grad
@@ -274,7 +274,7 @@ class SparseCooTensor : public TensorBase,
[0, 0, 0, 0]]
dims_ = (4, 4)
non_zero_elements_ = [[0, 1, 0, 0], [0, 0, 4, 0]]
- non_zero_indices_ = [0, 2],
+ non_zero_indices_ = [[0, 2], [1, 2]]
*/
};
......
@@ -209,7 +209,7 @@ class SparseCsrTensor : public TensorBase,
[0, 0, 4, 0],
[0, 5, 0, 6]]
dims_ = (4, 4)
- non_zero_elements_ = [1, 2, 3, 4, 5 ,6]
+ non_zero_elements_ = [1, 2, 3, 4, 5, 6]
non_zero_crows_ = [0, 1, 3, 4, 6]
non_zero_cols_ = [1, 0, 3, 2, 1, 3]
*/
@@ -228,7 +228,7 @@ class SparseCsrTensor : public TensorBase,
[0, 0, 4, 0],
[0, 5, 0, 0]]]
dims_ = (2, 4, 4)
- non_zero_elements_ = [1, 2, 3, 4, 5 ,6, 1, 2, 3, 4, 5]
+ non_zero_elements_ = [1, 2, 3, 4, 5, 6, 1, 2, 3, 4, 5]
non_zero_crows_ = [0, 1, 3, 4, 6, 0, 1, 2, 4, 5]
non_zero_cols_ = [1, 0, 3, 2, 1, 3, 1, 0, 3, 2, 1]
*/
......
@@ -24,5 +24,12 @@ void IndicesInferMeta(const MetaTensor& x, MetaTensor* out);
void ValuesInferMeta(const MetaTensor& x, MetaTensor* out);
void TransposeInferMeta(const MetaTensor& x,
const std::vector<int>& axis,
MetaTensor* out);
void TransposeGradInferMeta(const MetaTensor& x,
const std::vector<int>& axis,
MetaTensor* out);
} // namespace sparse
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sparse/unary_grad_kernel.h"
#include "paddle/phi/kernels/sparse/unary_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h"
namespace phi {
namespace sparse {
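// The gradient of y = transpose(x, perm) is itself a transpose with the
// inverse permutation: grad_perm[perm[i]] = i, so applying perm and then
// grad_perm restores the original dimension order.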
std::vector<int> get_cpu_grad_perm(std::vector<int> perm) {
std::vector<int> grad_perm(perm.size());
for (unsigned int i = 0; i < perm.size(); ++i) {
grad_perm[perm[i]] = i;
}
return grad_perm;
}
template <typename T, typename Context>
void TransposeCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& dout,
const std::vector<int>& perm,
SparseCooTensor* dx) {
std::vector<int> grad_perm = get_cpu_grad_perm(perm);
TransposeCooKernel<T, Context>(dev_ctx, dout, grad_perm, dx);
}
template <typename T, typename Context>
void TransposeCsrGradKernel(const Context& dev_ctx,
const SparseCsrTensor& dout,
const std::vector<int>& perm,
SparseCsrTensor* dx) {
std::vector<int> grad_perm = get_cpu_grad_perm(perm);
TransposeCsrKernel<T, Context>(dev_ctx, dout, grad_perm, dx);
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(transpose_coo_grad,
CPU,
ALL_LAYOUT,
phi::sparse::TransposeCooGradKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
PD_REGISTER_KERNEL(transpose_csr_grad,
CPU,
ALL_LAYOUT,
phi::sparse::TransposeCsrGradKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sparse/unary_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/eigen/eigen_function.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
namespace phi {
namespace sparse {
template <typename T, typename Context>
void TransposeCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& perm,
SparseCooTensor* out) {
// create out sparse tensor
int64_t x_nnz = x.nnz();
DDim out_dims = x.dims().transpose(perm);
DenseTensor out_indices = EmptyLike<int64_t, Context>(dev_ctx, x.indices());
DenseTensor out_values(x.values());
out->SetMember(out_indices, out_values, out_dims, x.coalesced());
// compute the transposed indices: the indices matrix is n_dim x nnz
// (row-major), so output row i is input row perm[i]
const DenseTensor& x_indices = x.indices();
const auto* x_indices_data = x_indices.data<int64_t>();
auto* out_indices_data = out_indices.data<int64_t>();
for (unsigned int i = 0; i < perm.size(); ++i) {
for (int64_t j = 0; j < x_nnz; ++j) {
out_indices_data[j + i * x_nnz] = x_indices_data[j + perm[i] * x_nnz];
}
}
}
template <typename T, typename Context>
void TransposeCsrKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const std::vector<int>& perm,
SparseCsrTensor* out) {
unsigned int n_dim = perm.size();
const DenseTensor& x_crows = x.crows();
const DenseTensor& x_cols = x.cols();
const DenseTensor& x_values = x.values();
DenseTensor out_crows, out_cols, out_values;
// return a copy of x
if (perm[0] == 0 && perm[1] == 1 && (n_dim == 2 || perm[2] == 2)) {
out_crows = x_crows;
out_cols = x_cols;
out_values = x_values;
out->SetMember(out_crows, out_cols, out_values, x.dims());
return;
}
// create out sparse tensor
DDim out_dims = x.dims().transpose(perm);
if (n_dim == 2) {
out_crows = Empty<int64_t, Context>(dev_ctx, {out_dims[0] + 1});
} else {
out_crows =
Empty<int64_t, Context>(dev_ctx, {out_dims[0] * (out_dims[1] + 1)});
}
out_cols = EmptyLike<int64_t, Context>(dev_ctx, x.cols());
out_values = EmptyLike<T, Context>(dev_ctx, x.values());
out->SetMember(out_crows, out_cols, out_values, out_dims);
// transpose by two stages
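// Composing two transposes with permutations p then q is one transpose with
// perm[i] = p[q[i]], so the three remaining 3-D permutations are built from
// the primitives {1, 0, 2} and {0, 2, 1} that are handled directly.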
if (perm[0] == 1 && perm[1] == 2) { // perm == {1, 2, 0}
SparseCsrTensor temp;
TransposeCsrKernel<T, Context>(dev_ctx, x, {1, 0, 2}, &temp);
TransposeCsrKernel<T, Context>(dev_ctx, temp, {0, 2, 1}, out);
return;
} else if (perm[0] == 2 && perm[1] == 0) { // perm == {2, 0, 1}
SparseCsrTensor temp;
TransposeCsrKernel<T, Context>(dev_ctx, x, {0, 2, 1}, &temp);
TransposeCsrKernel<T, Context>(dev_ctx, temp, {1, 0, 2}, out);
return;
} else if (perm[0] == 2 && perm[1] == 1) { // perm == {2, 1, 0}
SparseCsrTensor temp;
TransposeCsrKernel<T, Context>(dev_ctx, x, {1, 0, 2}, &temp);
TransposeCsrKernel<T, Context>(dev_ctx, temp, {2, 0, 1}, out);
return;
}
int64_t* out_crows_data = out_crows.data<int64_t>();
int64_t* out_cols_data = out_cols.data<int64_t>();
T* out_values_data = out_values.data<T>();
const int64_t* x_crows_data = x_crows.data<int64_t>();
const int64_t* x_cols_data = x_cols.data<int64_t>();
const T* x_values_data = x_values.data<T>();
int64_t x_nnz = x.nnz();
if (n_dim == 2) { // perm == {1, 0}
// compute out_crows_data by x_cols_data
for (int i = 0; i < out_dims[0]; ++i) {
out_crows_data[i] = 0;
}
for (int i = 0; i < x_nnz; ++i) {
int j = x_cols_data[i];
out_crows_data[j + 1]++;
}
out_crows_data[out_dims[0]] = x_nnz;
for (int i = 1; i < out_dims[0]; ++i) {
out_crows_data[i] += out_crows_data[i - 1];
}
// compute out_cols_data and out_values_data by out_crows_data and x
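// cols_offset counts, keyed by the start offset of each output row in
// out_crows, how many entries of that row have already been written; this
// keeps the scatter stable with respect to the input order.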
std::unordered_map<int64_t, int> cols_offset;
for (int i = 0; i < x.dims()[0]; ++i) {
int64_t start = x_crows_data[i];
int64_t end = x_crows_data[i + 1];
for (int64_t j = start; j < end; ++j) {
int64_t x_cols_j = x_cols_data[j];
int64_t jjj = out_crows_data[x_cols_j];
if (cols_offset.count(jjj)) {
cols_offset[jjj]++;
} else {
cols_offset[jjj] = 0;
}
int64_t jjj_offset = jjj + cols_offset[jjj];
out_cols_data[jjj_offset] = i;
out_values_data[jjj_offset] = x_values_data[j];
}
}
} else { // n_dim == 3
int out_n_rows = out_dims[1];
int x_n_rows = x.dims()[1];
for (int k = 0; k < out_dims[0]; ++k) {
if (perm[0] == 0) { // perm == {0, 2, 1}
// compute out_crows_data by x_cols_data
for (int i = 0; i < out_n_rows; ++i) {
out_crows_data[i] = 0;
}
for (int i = 0; i < x_crows_data[x_n_rows]; ++i) {
int j = x_cols_data[i];
out_crows_data[j + 1]++;
}
out_crows_data[out_n_rows] = x_crows_data[x_n_rows];
for (int i = 1; i < out_n_rows; ++i) {
out_crows_data[i] += out_crows_data[i - 1];
}
// compute out_cols_data and out_values_data by out_crows_data and x
std::unordered_map<int64_t, int> cols_offset;
for (int i = 0; i < x_n_rows; ++i) {
int64_t start = x_crows_data[i];
int64_t end = x_crows_data[i + 1];
for (int64_t j = start; j < end; ++j) {
int64_t x_cols_j = x_cols_data[j];
int64_t jjj = out_crows_data[x_cols_j];
if (cols_offset.count(jjj)) {
cols_offset[jjj]++;
} else {
cols_offset[jjj] = 0;
}
int64_t jjj_offset = jjj + cols_offset[jjj];
out_cols_data[jjj_offset] = i;
out_values_data[jjj_offset] = x_values_data[j];
}
}
// x offset
x_cols_data += x_crows_data[x_n_rows];
x_values_data += x_crows_data[x_n_rows];
x_crows_data += x_n_rows + 1;
} else if (perm[0] == 1 && perm[1] == 0) { // perm == {1, 0, 2}
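// out batch k, row i is gathered from x batch i, row k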
for (int i = 0; i < out_n_rows; ++i) {
out_crows_data[i] = 0;
}
int x_cols_offset = 0;
int out_cols_index = 0;
for (int i = 0; i < x.dims()[0]; ++i) {
int x_crows_index = i * (x_n_rows + 1);
int start = x_crows_data[x_crows_index + k];
int end = x_crows_data[x_crows_index + 1 + k];
out_crows_data[i + 1] = end - start;
for (int j = start; j < end; ++j) {
out_cols_data[out_cols_index] = x_cols_data[x_cols_offset + j];
out_values_data[out_cols_index] = x_values_data[x_cols_offset + j];
out_cols_index++;
}
x_cols_offset += x_crows_data[x_crows_index + x_n_rows];
}
for (int i = 1; i <= out_n_rows; ++i) {
out_crows_data[i] += out_crows_data[i - 1];
}
}
// out offset
out_cols_data += out_crows_data[out_n_rows];
out_values_data += out_crows_data[out_n_rows];
out_crows_data += out_n_rows + 1;
}
}
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(transpose_coo,
CPU,
ALL_LAYOUT,
phi::sparse::TransposeCooKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
PD_REGISTER_KERNEL(transpose_csr,
CPU,
ALL_LAYOUT,
phi::sparse::TransposeCsrKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sparse/unary_grad_kernel.h"
#include "paddle/phi/kernels/sparse/unary_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sparse/impl/unary_grad_kernel_impl.h"
namespace phi {
namespace sparse {
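// The gradient of y = transpose(x, perm) is itself a transpose with the
// inverse permutation: grad_perm[perm[i]] = i, so applying perm and then
// grad_perm restores the original dimension order.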
std::vector<int> get_gpu_grad_perm(std::vector<int> perm) {
std::vector<int> grad_perm(perm.size());
for (unsigned int i = 0; i < perm.size(); ++i) {
grad_perm[perm[i]] = i;
}
return grad_perm;
}
template <typename T, typename Context>
void TransposeCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& dout,
const std::vector<int>& perm,
SparseCooTensor* dx) {
std::vector<int> grad_perm = get_gpu_grad_perm(perm);
TransposeCooKernel<T, Context>(dev_ctx, dout, grad_perm, dx);
}
template <typename T, typename Context>
void TransposeCsrGradKernel(const Context& dev_ctx,
const SparseCsrTensor& dout,
const std::vector<int>& perm,
SparseCsrTensor* dx) {
std::vector<int> grad_perm = get_gpu_grad_perm(perm);
TransposeCsrKernel<T, Context>(dev_ctx, dout, grad_perm, dx);
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(transpose_coo_grad,
GPU,
ALL_LAYOUT,
phi::sparse::TransposeCooGradKernel,
phi::dtype::float16,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
PD_REGISTER_KERNEL(transpose_csr_grad,
GPU,
ALL_LAYOUT,
phi::sparse::TransposeCsrGradKernel,
phi::dtype::float16,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/sparse/unary_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
namespace phi {
namespace sparse {
__global__ void TransposeCooCudaKernel(const int64_t *x_indices_data,
const int *perm,
const std::size_t n_dim,
const int64_t x_nnz,
int64_t *out_indices_data) {
CUDA_KERNEL_LOOP_TYPE(index, x_nnz * n_dim, int64_t) {
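// index runs over the n_dim x nnz output indices matrix: i selects the
// dimension (row) and j the nonzero (column).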
int64_t i = index / x_nnz;
int64_t j = index % x_nnz;
out_indices_data[index] = x_indices_data[j + perm[i] * x_nnz];
}
}
template <typename T>
__global__ void TransposeCsr2DCudaKernel(const int64_t *x_crows_data,
const int64_t *x_cols_data,
const T *x_values_data,
const int *perm,
const int64_t *x_dims,
const int64_t *out_dims,
const int64_t x_nnz,
int64_t *out_crows_data,
int64_t *out_cols_data,
T *out_values_data) {
int64_t __index__ =
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
// compute out_crows_data by x_cols_data
for (int64_t i = __index__; i <= out_dims[0]; i += blockDim.x * gridDim.x) {
out_crows_data[i] = 0;
}
__syncthreads();
if (__index__ == 0) {
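// Column counts are accumulated at index j + 2 so that, after the prefix
// sum below, out_crows_data[c + 1] holds the start offset of output row c;
// the scatter then advances out_crows_data[c + 1] in place until it equals
// the start of row c + 1, which is exactly its final crows value.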
for (int64_t i = 0; i < x_nnz; ++i) {
int j = x_cols_data[i];
out_crows_data[j + 2]++;
}
for (int64_t i = 0; i < out_dims[0]; i += 1) {
out_crows_data[i + 1] += out_crows_data[i];
}
// compute out_cols_data and out_values_data by out_crows_data and x
for (int i = 0; i < x_dims[0]; ++i) {
int64_t start = x_crows_data[i];
int64_t end = x_crows_data[i + 1];
for (int64_t j = start; j < end; ++j) {
int64_t x_cols_j = x_cols_data[j] + 1;
int64_t jjj = out_crows_data[x_cols_j];
out_cols_data[jjj] = i;
out_values_data[jjj] = x_values_data[j];
out_crows_data[x_cols_j]++;
}
}
}
}
template <typename T>
__global__ void TransposeCsr3DCudaKernel(const int64_t *x_crows_data,
const int64_t *x_cols_data,
const T *x_values_data,
const int *perm,
const int64_t *x_dims,
const int64_t *out_dims,
const std::size_t n_dim,
const int64_t x_nnz,
int64_t *out_crows_data,
int64_t *out_cols_data,
T *out_values_data) {
int64_t __index__ =
static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
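// The 3-D case is processed serially by thread 0: the per-batch crows
// segments depend on offsets accumulated from earlier batches.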
if (__index__ == 0) {
int out_n_rows = out_dims[1];
int x_n_rows = x_dims[1];
for (int k = 0; k < out_dims[0]; ++k) {
if (perm[0] == 0) { // perm == {0, 2, 1}
// compute out_crows_data by x_cols_data
for (int i = 0; i <= out_n_rows; ++i) {
out_crows_data[i] = 0;
}
for (int i = 0; i < x_crows_data[x_n_rows]; ++i) {
int j = x_cols_data[i];
out_crows_data[j + 2]++;
}
for (int i = 0; i < out_n_rows; ++i) {
out_crows_data[i + 1] += out_crows_data[i];
}
// compute out_cols_data and out_values_data by out_crows_data and x
for (int i = 0; i < x_n_rows; ++i) {
int64_t start = x_crows_data[i];
int64_t end = x_crows_data[i + 1];
for (int64_t j = start; j < end; ++j) {
int64_t x_cols_j = x_cols_data[j] + 1;
int64_t jjj = out_crows_data[x_cols_j];
out_cols_data[jjj] = i;
out_values_data[jjj] = x_values_data[j];
out_crows_data[x_cols_j]++;
}
}
// x offset
x_cols_data += x_crows_data[x_n_rows];
x_values_data += x_crows_data[x_n_rows];
x_crows_data += x_n_rows + 1;
} else if (perm[0] == 1 && perm[1] == 0) { // perm == {1, 0, 2}
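// out batch k, row i is gathered from x batch i, row k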
for (int i = 0; i < out_n_rows; ++i) {
out_crows_data[i] = 0;
}
int x_cols_offset = 0;
int out_cols_index = 0;
for (int i = 0; i < x_dims[0]; ++i) {
int x_crows_index = i * (x_n_rows + 1);
int start = x_crows_data[x_crows_index + k];
int end = x_crows_data[x_crows_index + 1 + k];
out_crows_data[i + 1] = end - start;
for (int j = start; j < end; ++j) {
out_cols_data[out_cols_index] = x_cols_data[x_cols_offset + j];
out_values_data[out_cols_index] = x_values_data[x_cols_offset + j];
out_cols_index++;
}
x_cols_offset += x_crows_data[x_crows_index + x_n_rows];
}
for (int i = 1; i <= out_n_rows; ++i) {
out_crows_data[i] += out_crows_data[i - 1];
}
}
// out offset
out_cols_data += out_crows_data[out_n_rows];
out_values_data += out_crows_data[out_n_rows];
out_crows_data += out_n_rows + 1;
}
}
}
template <typename T, typename Context>
void TransposeCooKernel(const Context &dev_ctx,
const SparseCooTensor &x,
const std::vector<int> &perm,
SparseCooTensor *out) {
// create out sparse tensor
int64_t x_nnz = x.nnz();
std::size_t n_dim = perm.size();
DDim out_dims = x.dims().transpose(perm);
DenseTensor out_indices = EmptyLike<int64_t, Context>(dev_ctx, x.indices());
DenseTensor out_values(x.values());
out->SetMember(out_indices, out_values, out_dims, x.coalesced());
// compute the transposed indices: the indices matrix is n_dim x nnz
// (row-major), so output row i is input row perm[i]
const DenseTensor &x_indices = x.indices();
const auto *x_indices_data = x_indices.data<int64_t>();
auto *out_indices_data = out_indices.data<int64_t>();
int *d_perm;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
hipMemcpy(
d_perm, perm.data(), sizeof(int) * perm.size(), hipMemcpyHostToDevice);
#else
cudaMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
cudaMemcpy(
d_perm, perm.data(), sizeof(int) * perm.size(), cudaMemcpyHostToDevice);
#endif
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, x_nnz * n_dim, 1);
TransposeCooCudaKernel<<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(
x_indices_data, d_perm, n_dim, x_nnz, out_indices_data);
}
template <typename T, typename Context>
void TransposeCsrKernel(const Context &dev_ctx,
const SparseCsrTensor &x,
const std::vector<int> &perm,
SparseCsrTensor *out) {
std::size_t n_dim = perm.size();
const DenseTensor &x_crows = x.crows();
const DenseTensor &x_cols = x.cols();
const DenseTensor &x_values = x.non_zero_elements();
DenseTensor out_crows, out_cols, out_values;
// return a copy of x
if (perm[0] == 0 && perm[1] == 1 && (n_dim == 2 || perm[2] == 2)) {
out_crows = x_crows;
out_cols = x_cols;
out_values = x_values;
out->SetMember(out_crows, out_cols, out_values, x.dims());
return;
}
// create out sparse tensor
DDim out_dims = x.dims().transpose(perm);
if (n_dim == 2) {
out_crows = Empty<int64_t, Context>(dev_ctx, {out_dims[0] + 1});
} else {
out_crows =
Empty<int64_t, Context>(dev_ctx, {out_dims[0] * (out_dims[1] + 1)});
}
out_cols = EmptyLike<int64_t, Context>(dev_ctx, x.cols());
out_values = EmptyLike<T, Context>(dev_ctx, x.values());
out->SetMember(out_crows, out_cols, out_values, out_dims);
// transpose by two stages
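// Composing two transposes with permutations p then q is one transpose with
// perm[i] = p[q[i]], so the three remaining 3-D permutations are built from
// the primitives {1, 0, 2} and {0, 2, 1} that are handled directly.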
if (perm[0] == 1 && perm[1] == 2) { // perm == {1, 2, 0}
SparseCsrTensor temp;
TransposeCsrKernel<T, Context>(dev_ctx, x, {1, 0, 2}, &temp);
TransposeCsrKernel<T, Context>(dev_ctx, temp, {0, 2, 1}, out);
return;
} else if (perm[0] == 2 && perm[1] == 0) { // perm == {2, 0, 1}
SparseCsrTensor temp;
TransposeCsrKernel<T, Context>(dev_ctx, x, {0, 2, 1}, &temp);
TransposeCsrKernel<T, Context>(dev_ctx, temp, {1, 0, 2}, out);
return;
} else if (perm[0] == 2 && perm[1] == 1) { // perm == {2, 1, 0}
SparseCsrTensor temp;
TransposeCsrKernel<T, Context>(dev_ctx, x, {1, 0, 2}, &temp);
TransposeCsrKernel<T, Context>(dev_ctx, temp, {2, 0, 1}, out);
return;
}
int64_t *out_crows_data = out_crows.data<int64_t>();
int64_t *out_cols_data = out_cols.data<int64_t>();
T *out_values_data = out_values.data<T>();
const int64_t *x_crows_data = x_crows.data<int64_t>();
const int64_t *x_cols_data = x_cols.data<int64_t>();
const T *x_values_data = x_values.data<T>();
int *d_perm;
int64_t *d_x_dims, *d_out_dims;
#ifdef PADDLE_WITH_HIP
hipMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
hipMemcpy(
d_perm, perm.data(), sizeof(int) * perm.size(), hipMemcpyHostToDevice);
hipMalloc(reinterpret_cast<void **>(&d_x_dims),
sizeof(int64_t) * x.dims().size());
hipMemcpy(d_x_dims,
x.dims().Get(),
sizeof(int64_t) * x.dims().size(),
hipMemcpyHostToDevice);
hipMalloc(reinterpret_cast<void **>(&d_out_dims),
sizeof(int64_t) * out_dims.size());
hipMemcpy(d_out_dims,
out_dims.Get(),
sizeof(int64_t) * out_dims.size(),
hipMemcpyHostToDevice);
#else
cudaMalloc(reinterpret_cast<void **>(&d_perm), sizeof(int) * perm.size());
cudaMemcpy(
d_perm, perm.data(), sizeof(int) * perm.size(), cudaMemcpyHostToDevice);
cudaMalloc(reinterpret_cast<void **>(&d_x_dims),
sizeof(int64_t) * x.dims().size());
cudaMemcpy(d_x_dims,
x.dims().Get(),
sizeof(int64_t) * x.dims().size(),
cudaMemcpyHostToDevice);
cudaMalloc(reinterpret_cast<void **>(&d_out_dims),
sizeof(int64_t) * out_dims.size());
cudaMemcpy(d_out_dims,
out_dims.Get(),
sizeof(int64_t) * out_dims.size(),
cudaMemcpyHostToDevice);
#endif
int64_t x_nnz = x.nnz();
auto config =
phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, out_dims[0], 1);
if (perm.size() == 2) {
TransposeCsr2DCudaKernel<T><<<config.block_per_grid.x,
config.thread_per_block.x,
0,
dev_ctx.stream()>>>(x_crows_data,
x_cols_data,
x_values_data,
d_perm,
d_x_dims,
d_out_dims,
x_nnz,
out_crows_data,
out_cols_data,
out_values_data);
} else {
TransposeCsr3DCudaKernel<T><<<1, 1, 0, dev_ctx.stream()>>>(x_crows_data,
x_cols_data,
x_values_data,
d_perm,
d_x_dims,
d_out_dims,
perm.size(),
x_nnz,
out_crows_data,
out_cols_data,
out_values_data);
}
}
} // namespace sparse
} // namespace phi
PD_REGISTER_KERNEL(transpose_coo,
GPU,
ALL_LAYOUT,
phi::sparse::TransposeCooKernel,
phi::dtype::float16,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
PD_REGISTER_KERNEL(transpose_csr,
GPU,
ALL_LAYOUT,
phi::sparse::TransposeCsrKernel,
phi::dtype::float16,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool) {}
@@ -77,5 +77,17 @@ void CastCsrGradKernel(const Context& dev_ctx,
DataType value_dtype,
SparseCsrTensor* dx);
template <typename T, typename Context>
void TransposeCooGradKernel(const Context& dev_ctx,
const SparseCooTensor& dout,
const std::vector<int>& perm,
SparseCooTensor* dx);
template <typename T, typename Context>
void TransposeCsrGradKernel(const Context& dev_ctx,
const SparseCsrTensor& dout,
const std::vector<int>& perm,
SparseCsrTensor* dx);
} // namespace sparse
} // namespace phi
@@ -99,6 +99,48 @@ void CastCsrKernel(const Context& dev_ctx,
DataType value_dtype,
SparseCsrTensor* out);
template <typename T, typename Context>
void TransposeCooKernel(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& perm,
SparseCooTensor* out);
template <typename T, typename Context>
void TransposeCsrKernel(const Context& dev_ctx,
const SparseCsrTensor& x,
const std::vector<int>& perm,
SparseCsrTensor* out);
template <typename T, typename Context>
SparseCooTensor TransposeCoo(const Context& dev_ctx,
const SparseCooTensor& x,
const std::vector<int>& perm) {
PADDLE_ENFORCE_EQ(x.sparse_dim(),
perm.size(),
phi::errors::InvalidArgument(
"size of perm must be equal than the x.sparse_dim()"));
SparseCooTensor coo;
TransposeCooKernel<T, Context>(dev_ctx, x, perm, &coo);
return coo;
}
template <typename T, typename Context>
SparseCsrTensor TransposeCsr(const Context& dev_ctx,
const SparseCsrTensor& x,
const std::vector<int>& perm) {
PADDLE_ENFORCE_LE(
2,
perm.size(),
phi::errors::InvalidArgument("size of perm must be equal to 2 or 3"));
PADDLE_ENFORCE_GE(
3,
perm.size(),
phi::errors::InvalidArgument("size of perm must be equal to 2 or 3"));
SparseCsrTensor csr;
TransposeCsrKernel<T, Context>(dev_ctx, x, perm, &csr);
return csr;
}
template <typename T, typename Context>
SparseCooTensor ReluCoo(const Context& dev_ctx, const SparseCooTensor& x) {
SparseCooTensor coo;
......
@@ -74,6 +74,10 @@ cc_test(
test_sparse_elementwise_dev_api
SRCS test_sparse_elementwise_dev_api.cc
DEPS phi phi_api_utils)
cc_test(
test_sparse_transpose_dev_api
SRCS test_sparse_transpose_dev_api.cc
DEPS phi phi_api_utils)
cc_test(
test_math_function
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "paddle/fluid/memory/allocation/allocator_facade.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/sparse/empty_kernel.h"
#include "paddle/phi/kernels/sparse/sparse_utils_kernel.h"
#include "paddle/phi/kernels/sparse/unary_grad_kernel.h"
#include "paddle/phi/kernels/sparse/unary_kernel.h"
#include "paddle/phi/kernels/transpose_grad_kernel.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace phi {
namespace tests {
TEST(DEV_API, sparse_transpose_coo) {
std::vector<float> data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0};
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
DenseTensor dense_x = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({3, 2, 2}), DataLayout::NCHW));
memcpy(dense_x.data<float>(), data.data(), data.size() * sizeof(float));
auto sparse_coo = sparse::DenseToCoo<float>(dev_ctx_cpu, dense_x, 3);
auto sparse_out =
sparse::TransposeCoo<float>(dev_ctx_cpu, sparse_coo, {2, 1, 0});
DenseTensor dense_out = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({2, 2, 3}), DataLayout::NCHW));
TransposeKernel<float>(dev_ctx_cpu, dense_x, {2, 1, 0}, &dense_out);
// backward
DenseTensor dense_grad_x = phi::EmptyLike<float>(dev_ctx_cpu, dense_out);
TransposeGradKernel<float>(dev_ctx_cpu, dense_out, {2, 1, 0}, &dense_grad_x);
SparseCooTensor sparse_grad_x;
sparse::EmptyLikeCooKernel<float>(dev_ctx_cpu, sparse_coo, &sparse_grad_x);
SparseCooTensor sparse_out_grad(
sparse_coo.indices(), sparse_coo.values(), {2, 2, 3});
sparse::TransposeCooGradKernel<float>(
dev_ctx_cpu, sparse_out_grad, {2, 1, 0}, &sparse_grad_x);
}
TEST(DEV_API, sparse_transpose_csr_case1) {
std::vector<float> data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0};
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
DenseTensor dense_x = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({3, 2, 2}), DataLayout::NCHW));
memcpy(dense_x.data<float>(), data.data(), data.size() * sizeof(float));
auto sparse_csr = sparse::DenseToCsr<float>(dev_ctx_cpu, dense_x);
auto sparse_out =
sparse::TransposeCsr<float>(dev_ctx_cpu, sparse_csr, {2, 1, 0});
DenseTensor dense_out = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({2, 2, 3}), DataLayout::NCHW));
TransposeKernel<float>(dev_ctx_cpu, dense_x, {2, 1, 0}, &dense_out);
// backward
DenseTensor dense_grad_x = phi::EmptyLike<float>(dev_ctx_cpu, dense_out);
TransposeGradKernel<float>(dev_ctx_cpu, dense_out, {2, 1, 0}, &dense_grad_x);
SparseCsrTensor sparse_grad_x;
sparse::EmptyLikeCsrKernel<float>(dev_ctx_cpu, sparse_csr, &sparse_grad_x);
sparse::TransposeCsrGradKernel<float>(
dev_ctx_cpu, sparse_out, {2, 1, 0}, &sparse_grad_x);
}
TEST(DEV_API, sparse_transpose_csr_case2) {
std::vector<float> data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0};
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
DenseTensor dense_x = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({3, 2, 2}), DataLayout::NCHW));
memcpy(dense_x.data<float>(), data.data(), data.size() * sizeof(float));
auto sparse_csr = sparse::DenseToCsr<float>(dev_ctx_cpu, dense_x);
auto sparse_out =
sparse::TransposeCsr<float>(dev_ctx_cpu, sparse_csr, {1, 2, 0});
DenseTensor dense_out = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({2, 2, 3}), DataLayout::NCHW));
TransposeKernel<float>(dev_ctx_cpu, dense_x, {1, 2, 0}, &dense_out);
}
TEST(DEV_API, sparse_transpose_csr_case3) {
std::vector<float> data = {0, -1, 0, 2, 0, 0, -3, 0, 4, 5, 0, 0};
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.SetHostAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
DenseTensor dense_x = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({3, 4}), DataLayout::NCHW));
memcpy(dense_x.data<float>(), data.data(), data.size() * sizeof(float));
auto sparse_csr = sparse::DenseToCsr<float>(dev_ctx_cpu, dense_x);
auto sparse_out =
sparse::TransposeCsr<float>(dev_ctx_cpu, sparse_csr, {1, 0});
DenseTensor dense_out = phi::Empty(
dev_ctx_cpu,
DenseTensorMeta(
DataType::FLOAT32, phi::make_ddim({4, 3}), DataLayout::NCHW));
TransposeKernel<float>(dev_ctx_cpu, dense_x, {1, 0}, &dense_out);
}
} // namespace tests
} // namespace phi
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
import numpy as np
import unittest
from paddle.fluid.framework import _test_eager_guard
class TestTranspose(unittest.TestCase):
# x: sparse, out: sparse
def check_result(self, x_shape, dims, format):
with _test_eager_guard():
mask = paddle.randint(0, 2, x_shape).astype("float32")
origin_x = paddle.rand(x_shape, dtype='float32') * mask
dense_x = origin_x.detach()
dense_x.stop_gradient = False
dense_out = paddle.transpose(dense_x, dims)
if format == "coo":
sp_x = origin_x.detach().to_sparse_coo(len(x_shape))
else:
sp_x = origin_x.detach().to_sparse_csr()
sp_x.stop_gradient = False
sp_out = paddle.incubate.sparse.transpose(sp_x, dims)
np.testing.assert_allclose(sp_out.to_dense().numpy(),
dense_out.numpy(),
rtol=1e-05)
dense_out.backward()
sp_out.backward()
np.testing.assert_allclose(sp_x.grad.to_dense().numpy(),
(dense_x.grad * mask).numpy(),
rtol=1e-05)
def test_transpose_2d(self):
self.check_result([2, 5], [0, 1], 'coo')
self.check_result([2, 5], [0, 1], 'csr')
self.check_result([2, 5], [1, 0], 'coo')
self.check_result([2, 5], [1, 0], 'csr')
def test_transpose_3d(self):
self.check_result([6, 2, 3], [0, 1, 2], 'coo')
self.check_result([6, 2, 3], [0, 1, 2], 'csr')
self.check_result([6, 2, 3], [0, 2, 1], 'coo')
self.check_result([6, 2, 3], [0, 2, 1], 'csr')
self.check_result([6, 2, 3], [1, 0, 2], 'coo')
self.check_result([6, 2, 3], [1, 0, 2], 'csr')
self.check_result([6, 2, 3], [2, 0, 1], 'coo')
self.check_result([6, 2, 3], [2, 0, 1], 'csr')
self.check_result([6, 2, 3], [2, 1, 0], 'coo')
self.check_result([6, 2, 3], [2, 1, 0], 'csr')
self.check_result([6, 2, 3], [1, 2, 0], 'coo')
self.check_result([6, 2, 3], [1, 2, 0], 'csr')
def test_transpose_nd(self):
self.check_result([8, 3, 4, 4, 5, 3], [5, 3, 4, 1, 0, 2], 'coo')
# paddle.randint currently supports shapes of at most 9 dimensions.
self.check_result([2, 3, 4, 2, 3, 4, 2, 3, 4],
[2, 3, 4, 5, 6, 7, 8, 0, 1], 'coo')
if __name__ == "__main__":
unittest.main()
@@ -34,6 +34,7 @@ from .unary import coalesce
from .unary import deg2rad
from .unary import rad2deg
from .unary import expm1
from .unary import transpose
from .binary import mv
from .binary import matmul
@@ -75,6 +76,7 @@ __all__ = [
'addmm',
'add',
'subtract',
'transpose',
'multiply',
'divide',
'coalesce',
......
@@ -119,6 +119,37 @@ def asin(x, name=None):
return _C_ops.sparse_asin(x)
@dygraph_only
def transpose(x, perm, name=None):
"""
Permute the dimensions of ``x`` according to ``perm`` without changing its data. ``x`` must be a SparseCooTensor or a SparseCsrTensor.
.. math::
out = transpose(x, perm)
Parameters:
x (Tensor): The input Sparse Tensor with data type float32, float64.
perm (list|tuple): The permutation of dimensions to apply to ``x``; it must be a rearrangement of [0, ..., x.ndim - 1].
name (str, optional): Name for the operation (optional, default is None).
For more information, please refer to :ref:`api_guide_Name`.
Returns:
A transposed Sparse Tensor with the same data type as ``x``.
Examples:
.. code-block:: python
import paddle
dense_x = paddle.to_tensor([[-2., 0.], [1., 2.]])
sparse_x = dense_x.to_sparse_coo(2)
out = paddle.incubate.sparse.transpose(sparse_x, [1, 0])
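# the dense representation of out is [[-2., 1.], [0., 2.]]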
"""
return _C_ops.sparse_transpose(x, perm)
@dygraph_only
def atan(x, name=None):
"""
......