Unverified commit ba89a3d3 authored by xiongkun, committed by GitHub

[ Phi ] svd transfer (#44392)

* svd cpu forward

* svd gpu forward

* transfer the backward of svd

* remove cusolver in svd_grad

* svd kernel bug fix

* fix bugs

* fix bugs.

* fix bug
Parent 55e5ab82
......@@ -45,71 +45,6 @@ template <typename T,
typename IndexType = Eigen::DenseIndex>
using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
template <typename T>
void LapackSvd(
const T* X, T* U, T* VH, T* S, int rows, int cols, int full = false) {
char jobz = full ? 'A' : 'S';
int mx = std::max(rows, cols);
int mn = std::min(rows, cols);
T* a = const_cast<T*>(X);
int lda = rows;
int ldu = rows;
int ldvt = full ? cols : mn;
int lwork = full ? (4 * mn * mn + 6 * mn + mx) : (4 * mn * mn + 7 * mn);
std::vector<T> work(lwork);
std::vector<int> iwork(8 * mn);
int info;
phi::funcs::lapackSvd<T>(jobz,
rows,
cols,
a,
lda,
S,
U,
ldu,
VH,
ldvt,
work.data(),
lwork,
iwork.data(),
&info);
if (info < 0) {
PADDLE_THROW(platform::errors::InvalidArgument(
"The %d-th argument of lapackSvd has an illegal value.", -info));
}
if (info > 0) {
PADDLE_THROW(platform::errors::InvalidArgument(
"DBDSDC/SBDSDC did not converge; the updating process failed. The input "
"matrix may be invalid."));
}
}
template <typename T>
void BatchSvd(const T* X,
T* U,
T* VH,
T* S,
int rows,
int cols,
int batches,
int full = false) {
// NOTE: this function is row-major, because it calls LAPACK directly.
int stride = rows * cols;
int k = std::min(rows, cols);
int stride_u = full ? rows * rows : k * rows;
int stride_v = full ? cols * cols : k * cols;
for (int i = 0; i < batches; ++i) {
LapackSvd<T>(X + i * stride,
U + i * stride_u,
VH + i * stride_v,
S + i * k,
rows,
cols,
full);
}
return;
}
template <typename T>
struct PowFunctor {
PowFunctor(const T* input, T* output, int64_t numel, T exp)
......
......@@ -12,13 +12,12 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/svd_op.h"
#include <memory>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/core/ddim.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
......@@ -167,11 +166,3 @@ REGISTER_OPERATOR(svd,
ops::SvdGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(svd_grad, ops::SvdGradOp);
REGISTER_OP_CPU_KERNEL(svd,
ops::SvdCPUKernel<float>,
ops::SvdCPUKernel<double>);
REGISTER_OP_CPU_KERNEL(svd_grad,
ops::SvdGradKernel<phi::CPUContext, float>,
ops::SvdGradKernel<phi::CPUContext, double>);
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef PADDLE_WITH_HIP
// HIP not support cusolver
#include <thrust/device_vector.h>
#include <algorithm>
#include <vector>
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/operators/svd_op.h"
#include "paddle/fluid/platform/dynload/cusolver.h"
namespace paddle {
namespace operators {
template <typename T>
class SvdGPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
const Tensor* x = context.Input<Tensor>("X");
Tensor* U = context.Output<Tensor>("U");
Tensor* VH = context.Output<Tensor>("VH");
Tensor* S = context.Output<Tensor>("S");
const bool full_matrices = context.Attr<bool>("full_matrices");
auto& dims = x->dims();
int batch_count = 1;
for (int i = 0; i < dims.size() - 2; i++) {
batch_count *= dims[i];
}
int rank = dims.size();
int m = dims[rank - 2];
int n = dims[rank - 1];
auto* vh_data = VH->mutable_data<T>(context.GetPlace());
auto* s_data = S->mutable_data<T>(context.GetPlace());
auto* u_data = U->mutable_data<T>(context.GetPlace());
// NOTE:(@xiongkun03)
// cusolver assumes matrices are stored in column-major order, so we view A as
// n x m and compute the SVD of A^T to avoid an explicit transpose.
// X must be copied once, because gesvdj modifies the input matrix in place.
Tensor x_tmp;
paddle::framework::TensorCopy(*x, context.GetPlace(), &x_tmp);
auto info = memory::Alloc(dev_ctx, sizeof(int) * batch_count);
int* info_ptr = reinterpret_cast<int*>(info->ptr());
GesvdjBatched(dev_ctx,
batch_count,
n,
m,
std::min(m, n),
x_tmp.mutable_data<T>(context.GetPlace()),
vh_data,
u_data,
s_data,
info_ptr,
!full_matrices);
framework::DDim UT_dim = U->dims();
std::swap(UT_dim[rank - 1], UT_dim[rank - 2]);  // swap to get the dims of U^T
U->Resize(UT_dim);  // the buffer currently holds U^T
auto dito =
math::DeviceIndependenceTensorOperations<platform::CUDADeviceContext,
T>(context);
auto tmp_U = dito.Transpose(*U);
U->ShareDataWith(tmp_U);  // transpose back: U now holds the final U
}
void GesvdjBatched(const platform::CUDADeviceContext& dev_ctx,
int batchSize,
int m,
int n,
int k,
T* A,
T* U,
T* V,
T* S,
int* info,
int thin_UV = 1) const;
};
template <>
void SvdGPUKernel<float>::GesvdjBatched(
const platform::CUDADeviceContext& dev_ctx,
int batchSize,
int m,
int n,
int k,
float* A,
float* U,
float* V,
float* S,
int* info,
int thin_UV) const {
/* compute singular vectors */
const cusolverEigMode_t jobz =
CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */
gesvdjInfo_t gesvdj_params = NULL;
int lda = m;
int ldu = m;
int ldt = n;
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnSgesvdj_bufferSize(handle,
jobz,
thin_UV,
m,
n,
A,
lda,
S,
U,
ldu,
V,
ldt,
&lwork,
gesvdj_params));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(float));
float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
int stride_A = lda * n;
int stride_U = ldu * (thin_UV ? k : m);
int stride_V = ldt * (thin_UV ? k : n);
for (int i = 0; i < batchSize; ++i) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnSgesvdj(handle,
jobz,
thin_UV,
m,
n,
A + stride_A * i,
lda,
S + k * i,
U + stride_U * i,
ldu,
V + stride_V * i,
ldt,
workspace_ptr,
lwork,
info,
gesvdj_params));
// check the error info
int error_info;
memory::Copy(platform::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
}
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}
template <>
void SvdGPUKernel<double>::GesvdjBatched(
const platform::CUDADeviceContext& dev_ctx,
int batchSize,
int m,
int n,
int k,
double* A,
double* U,
double* V,
double* S,
int* info,
int thin_UV) const {
/* compute singular vectors */
const cusolverEigMode_t jobz =
CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */
gesvdjInfo_t gesvdj_params = NULL;
int lda = m;
int ldu = m;
int ldt = n;
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnDgesvdj_bufferSize(handle,
jobz,
thin_UV,
m,
n,
A,
lda,
S,
U,
ldu,
V,
ldt,
&lwork,
gesvdj_params));
auto workspace = memory::Alloc(dev_ctx, lwork * sizeof(double));
double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
int stride_A = lda * n;
int stride_U = ldu * (thin_UV ? k : m);
int stride_V = ldt * (thin_UV ? k : n);
for (int i = 0; i < batchSize; ++i) {
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnDgesvdj(handle,
jobz,
thin_UV,
m,
n,
A + stride_A * i,
lda,
S + k * i,
U + stride_U * i,
ldu,
V + stride_V * i,
ldt,
workspace_ptr,
lwork,
info,
gesvdj_params));
// check the error info
int error_info;
memory::Copy(platform::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
platform::errors::PreconditionNotMet(
"For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
}
PADDLE_ENFORCE_GPU_SUCCESS(
platform::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(svd,
ops::SvdGPUKernel<float>,
ops::SvdGPUKernel<double>);
REGISTER_OP_CUDA_KERNEL(
svd_grad,
ops::SvdGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::SvdGradKernel<paddle::platform::CUDADeviceContext, double>);
#endif // not PADDLE_WITH_HIP
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cstdarg>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using DDim = framework::DDim;
template <typename T>
class SvdCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* x = context.Input<Tensor>("X");
Tensor* U = context.Output<Tensor>("U");
Tensor* VH = context.Output<Tensor>("VH");
Tensor* S = context.Output<Tensor>("S");
int full = context.Attr<bool>("full_matrices");
/*Create Tensors and output, set the dim ...*/
auto numel = x->numel();
auto& orig_dev_ctx = context.template device_context<phi::CPUContext>();
auto& dev_ctx = static_cast<
const typename framework::ConvertToPhiContext<phi::CPUContext>::TYPE&>(
orig_dev_ctx);
Tensor trans_x = ::phi::TransposeLast2Dim<T>(dev_ctx, *x);
auto* x_data = trans_x.data<T>();
auto x_dims = x->dims();
int rows = x_dims[x_dims.size() - 2];
int cols = x_dims[x_dims.size() - 1];
int k = std::min(rows, cols);
int col_u = full ? rows : k;
int col_v = full ? cols : k;
int batches = numel / (rows * cols);
auto* U_out = U->mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batches * rows * col_u * sizeof(phi::dtype::Real<T>)));
auto* VH_out = VH->mutable_data<phi::dtype::Real<T>>(
context.GetPlace(),
size_t(batches * col_v * cols * sizeof(phi::dtype::Real<T>)));
auto* S_out = S->mutable_data<phi::dtype::Real<T>>(
context.GetPlace(), size_t(batches * k * sizeof(phi::dtype::Real<T>)));
/* Compute the SVD via LAPACK */
math::BatchSvd<T>(x_data, U_out, VH_out, S_out, rows, cols, batches, full);
/* Let C[m, n] be a column-major matrix with m rows and n cols,
 * and R[m, n] a row-major matrix with m rows and n cols.
 * Then: R[m, n] = C[m, n].resize((n, m)).transpose_last_two()
 */
auto col_major_to_row_major = [&dev_ctx](Tensor* out) {
auto origin_dim = out->dims();
int64_t& x = origin_dim[origin_dim.size() - 1];
int64_t& y = origin_dim[origin_dim.size() - 2];
std::swap(x, y);
out->Resize(origin_dim);
return ::phi::TransposeLast2Dim<T>(dev_ctx, *out);
};
*U = col_major_to_row_major(U);
*VH = col_major_to_row_major(VH);
}
};
template <typename DeviceContext, typename T>
class SvdGradKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const {
const framework::Tensor& U_const = *ctx.Input<framework::Tensor>("U");
const framework::Tensor& VH_const = *ctx.Input<framework::Tensor>("VH");
const framework::Tensor& S = *ctx.Input<framework::Tensor>("S");
framework::Tensor& dX =
*ctx.Output<framework::Tensor>(framework::GradVarName("X"));
const framework::Tensor& dU_const =
*ctx.Input<framework::Tensor>(framework::GradVarName("U"));
const framework::Tensor& dVH_const =
*ctx.Input<framework::Tensor>(framework::GradVarName("VH"));
const bool full = ctx.Attr<bool>("full_matrices");
int m = dX.dims()[dX.dims().size() - 2];
int n = dX.dims()[dX.dims().size() - 1];
int k = S.dims()[S.dims().size() - 1];
auto dito = math::DeviceIndependenceTensorOperations<DeviceContext, T>(ctx);
framework::Tensor U, VH, dU, dV, dVH;
if (full) {
// if full_matrices is set, slice the U and VT to k columns
U = dito.Slice(U_const, {-1}, {0}, {k});
VH = dito.Slice(VH_const, {-2}, {0}, {k});
dU = dito.Slice(dU_const, {-1}, {0}, {k});
dVH = dito.Slice(dVH_const, {-2}, {0}, {k});
} else {
U = U_const;
VH = VH_const;
dU = dU_const;
dVH = dVH_const;
}
auto s_inverse = dito.Pow(S, -1);
auto s_square = dito.Pow(S, 2);
auto F =
dito.Sub(dito.Unsqueeze(s_square, -2), dito.Unsqueeze(s_square, -1));
F = dito.Add(F, dito.Diag(dito.Infinits({k})));
F = dito.Pow(F, -1);
Tensor sigma_term;
Tensor u_term;
Tensor v_term;
if (ctx.HasInput(framework::GradVarName("S"))) {
const framework::Tensor& gS =
*ctx.Input<framework::Tensor>(framework::GradVarName("S"));
sigma_term = dito.Mul(dito.Unsqueeze(gS, -2), U);
sigma_term = dito.Matmul(sigma_term, VH);
}
if (ctx.HasInput(framework::GradVarName("U"))) {
auto UTG = dito.Matmul(U, dU, true, false);
auto GTU = dito.Matmul(dU, U, true, false);
u_term = dito.Mul(dito.Mul(dito.Sub(UTG, GTU), F), dito.Unsqueeze(S, -2));
u_term = dito.Matmul(U, u_term);
if (m > k) {
auto project = dito.Sub(dito.Eye(m), dito.Matmul(U, U, false, true));
u_term = dito.Add(
u_term,
dito.Mul(dito.Matmul(project, dU), dito.Unsqueeze(s_inverse, -2)));
}
u_term = dito.Matmul(u_term, VH);
}
if (ctx.HasInput(framework::GradVarName("VH"))) {
auto UTG = dito.Matmul(VH, dVH, false, true);
auto GTU = dito.Matmul(dVH, VH, false, true);
v_term = dito.Mul(dito.Matmul(dito.Mul(dito.Sub(UTG, GTU), F), VH),
dito.Unsqueeze(S, -1));
if (n > k) {
auto project = dito.Sub(dito.Eye(n), dito.Matmul(VH, VH, true, false));
v_term = dito.Add(
v_term,
dito.Mul(dito.Matmul(dVH, project), dito.Unsqueeze(s_inverse, -1)));
}
v_term = dito.Matmul(U, v_term);
}
dX.ShareDataWith(dito.Add(dito.Add(u_term, sigma_term), v_term));
}
};
} // namespace operators
} // namespace paddle
......@@ -103,4 +103,15 @@ void PowKernel(const Context& dev_ctx,
const Scalar& factor,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Pow(const Context& dev_ctx,
const DenseTensor& x,
const Scalar& factor) {
DenseTensor out;
MetaTensor meta_out(out);
UnchangedInferMeta(x, &meta_out);
PowKernel<T, Context>(dev_ctx, x, factor, &out);
return out;
}
} // namespace phi
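// Illustrative usage sketch of the functional wrapper above (tensor/context names
// are hypothetical, not part of this header): UnchangedInferMeta shapes the output
// and PowKernel fills it, giving callers a value-returning API, e.g. in the SVD
// backward pass:
//
//   // given a prepared phi::CPUContext dev_ctx and a DenseTensor s of singular values
//   phi::DenseTensor s_inverse = phi::Pow<float, phi::CPUContext>(dev_ctx, s, -1);  // S^-1
//   phi::DenseTensor s_square  = phi::Pow<float, phi::CPUContext>(dev_ctx, s, 2);   // S^2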
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/svd_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h"
PD_REGISTER_KERNEL(
svd_grad, CPU, ALL_LAYOUT, phi::SvdGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/svd_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/funcs/lapack/lapack_function.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace phi {
template <typename T>
void LapackSvd(
const T* X, T* U, T* VH, T* S, int rows, int cols, int full = false) {
char jobz = full ? 'A' : 'S';
int mx = std::max(rows, cols);
int mn = std::min(rows, cols);
T* a = const_cast<T*>(X);
int lda = rows;
int ldu = rows;
int ldvt = full ? cols : mn;
int lwork = full ? (4 * mn * mn + 6 * mn + mx) : (4 * mn * mn + 7 * mn);
std::vector<T> work(lwork);
std::vector<int> iwork(8 * mn);
int info;
phi::funcs::lapackSvd<T>(jobz,
rows,
cols,
a,
lda,
S,
U,
ldu,
VH,
ldvt,
work.data(),
lwork,
iwork.data(),
&info);
if (info < 0) {
PADDLE_THROW(phi::errors::InvalidArgument(
"The %d-th argument of lapackSvd has an illegal value.", -info));
}
if (info > 0) {
PADDLE_THROW(phi::errors::InvalidArgument(
"DBDSDC/SBDSDC did not converge; the updating process failed. The input "
"matrix may be invalid."));
}
}
template <typename T>
void BatchSvd(const T* X,
T* U,
T* VH,
T* S,
int rows,
int cols,
int batches,
int full = false) {
// NOTE: this function is row-major, because it calls LAPACK directly.
int stride = rows * cols;
int k = std::min(rows, cols);
int stride_u = full ? rows * rows : k * rows;
int stride_v = full ? cols * cols : k * cols;
for (int i = 0; i < batches; ++i) {
LapackSvd<T>(X + i * stride,
U + i * stride_u,
VH + i * stride_v,
S + i * k,
rows,
cols,
full);
}
return;
}
template <typename T, typename Context>
void SvdKernel(const Context& dev_ctx,
const DenseTensor& X,
bool full_matrices,
DenseTensor* U,
DenseTensor* S,
DenseTensor* VH) {
int full = full_matrices;
/*Create Tensors and output, set the dim ...*/
auto numel = X.numel();
DenseTensor trans_x = ::phi::TransposeLast2Dim<T>(dev_ctx, X);
auto* x_data = trans_x.data<T>();
auto x_dims = X.dims();
int rows = x_dims[x_dims.size() - 2];
int cols = x_dims[x_dims.size() - 1];
// int k = std::min(rows, cols);
// int col_u = full ? rows : k;
// int col_v = full ? cols : k;
int batches = numel / (rows * cols);
auto* U_out = dev_ctx.template Alloc<phi::dtype::Real<T>>(U);
auto* VH_out = dev_ctx.template Alloc<phi::dtype::Real<T>>(VH);
auto* S_out = dev_ctx.template Alloc<phi::dtype::Real<T>>(S);
/* Compute the SVD via LAPACK */
BatchSvd<T>(x_data, U_out, VH_out, S_out, rows, cols, batches, full);
/* Let C[m, n] be a column-major matrix with m rows and n cols,
 * and R[m, n] a row-major matrix with m rows and n cols.
 * Then: R[m, n] = C[m, n].resize((n, m)).transpose_last_two()
 */
auto col_major_to_row_major = [&dev_ctx](DenseTensor* out) {
auto origin_dim = out->dims();
int64_t& x = origin_dim[origin_dim.size() - 1];
int64_t& y = origin_dim[origin_dim.size() - 2];
std::swap(x, y);
out->Resize(origin_dim);
return ::phi::TransposeLast2Dim<T>(dev_ctx, *out);
};
*U = col_major_to_row_major(U);
*VH = col_major_to_row_major(VH);
}
} // namespace phi
PD_REGISTER_KERNEL(svd, CPU, ALL_LAYOUT, phi::SvdKernel, float, double) {}
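// To make the col_major_to_row_major step above concrete, a small illustrative
// sketch with plain arrays (no phi types): the column-major 2 x 3 matrix
// A = [[1, 2, 3], [4, 5, 6]] is stored as
//
//   float buf[6] = {1, 4, 2, 5, 3, 6};
//
// Reading buf with row-major strides as a 3 x 2 matrix yields exactly A^T
// (rows {1, 4}, {2, 5}, {3, 6}). Hence resizing the LAPACK output to the swapped
// shape (n, m) and transposing the last two dimensions recovers the row-major
// layout of the m x n result, which is what the lambda does for U and VH.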
......@@ -15,6 +15,7 @@
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/unary.h"
namespace phi {
......@@ -45,4 +46,16 @@ void DiagKernel(const Context& dev_ctx,
float padding_value,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor Diag(const Context& dev_ctx,
const DenseTensor& x,
int offset,
float padding_value) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
DiagInferMeta(x, offset, padding_value, &meta_out);
DiagKernel<T, Context>(dev_ctx, x, offset, padding_value, &dense_out);
return dense_out;
}
} // namespace phi
......@@ -35,7 +35,7 @@
namespace phi {
template <typename T>
void GesvdjBatched(const phi::GPUContext& dev_ctx,
static void GesvdjBatched(const phi::GPUContext& dev_ctx,
int batchSize,
int m,
int n,
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/svd_grad_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/svd_grad_kernel_impl.h"
PD_REGISTER_KERNEL(
svd_grad, GPU, ALL_LAYOUT, phi::SvdGradKernel, float, double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef PADDLE_WITH_HIP
// HIP not support cusolver
#include "paddle/phi/kernels/svd_kernel.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/complex_functors.h"
#include "paddle/phi/kernels/transpose_kernel.h"
namespace phi {
template <class T>
static void GesvdjBatched(const phi::GPUContext& dev_ctx,
int batchSize,
int m,
int n,
int k,
T* A,
T* U,
T* V,
T* S,
int* info,
int thin_UV = 1);
template <>
void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
int batchSize,
int m,
int n,
int k,
float* A,
float* U,
float* V,
float* S,
int* info,
int thin_UV) {
/* compute singular vectors */
const cusolverEigMode_t jobz =
CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */
gesvdjInfo_t gesvdj_params = NULL;
int lda = m;
int ldu = m;
int ldt = n;
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cusolverDnSgesvdj_bufferSize(handle,
jobz,
thin_UV,
m,
n,
A,
lda,
S,
U,
ldu,
V,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
int stride_A = lda * n;
int stride_U = ldu * (thin_UV ? k : m);
int stride_V = ldt * (thin_UV ? k : n);
for (int i = 0; i < batchSize; ++i) {
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnSgesvdj(handle,
jobz,
thin_UV,
m,
n,
A + stride_A * i,
lda,
S + k * i,
U + stride_U * i,
ldu,
V + stride_V * i,
ldt,
workspace_ptr,
lwork,
info,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
phi::errors::PreconditionNotMet(
"For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
}
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}
template <>
void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
int batchSize,
int m,
int n,
int k,
double* A,
double* U,
double* V,
double* S,
int* info,
int thin_UV) {
/* compute singular vectors */
const cusolverEigMode_t jobz =
CUSOLVER_EIG_MODE_VECTOR; /* compute singular vectors */
gesvdjInfo_t gesvdj_params = NULL;
int lda = m;
int ldu = m;
int ldt = n;
int lwork = 0;
auto handle = dev_ctx.cusolver_dn_handle();
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cusolverDnDgesvdj_bufferSize(handle,
jobz,
thin_UV,
m,
n,
A,
lda,
S,
U,
ldu,
V,
ldt,
&lwork,
gesvdj_params));
auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
int stride_A = lda * n;
int stride_U = ldu * (thin_UV ? k : m);
int stride_V = ldt * (thin_UV ? k : n);
for (int i = 0; i < batchSize; ++i) {
PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cusolverDnDgesvdj(handle,
jobz,
thin_UV,
m,
n,
A + stride_A * i,
lda,
S + k * i,
U + stride_U * i,
ldu,
V + stride_V * i,
ldt,
workspace_ptr,
lwork,
info,
gesvdj_params));
// check the error info
int error_info;
paddle::memory::Copy(phi::CPUPlace(),
&error_info,
dev_ctx.GetPlace(),
info,
sizeof(int),
dev_ctx.stream());
PADDLE_ENFORCE_EQ(
error_info,
0,
phi::errors::PreconditionNotMet(
"For batch [%d]: CUSolver SVD is not zero. [%d]", i, error_info));
}
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}
template <typename T, typename Context>
void SvdKernel(const Context& dev_ctx,
const DenseTensor& X,
bool full_matrices,
DenseTensor* U,
DenseTensor* S,
DenseTensor* VH) {
auto& dims = X.dims();
int batch_count = 1;
for (int i = 0; i < dims.size() - 2; i++) {
batch_count *= dims[i];
}
int rank = dims.size();
int m = dims[rank - 2];
int n = dims[rank - 1];
auto* u_data = dev_ctx.template Alloc<phi::dtype::Real<T>>(U);
auto* vh_data = dev_ctx.template Alloc<phi::dtype::Real<T>>(VH);
auto* s_data = dev_ctx.template Alloc<phi::dtype::Real<T>>(S);
// NOTE:(@xiongkun03)
// cusolver assumes matrices are stored in column-major order, so we view A as
// n x m and compute the SVD of A^T to avoid an explicit transpose.
// X must be copied once, because gesvdj modifies the input matrix in place.
DenseTensor x_tmp;
Copy(dev_ctx, X, dev_ctx.GetPlace(), false, &x_tmp);
auto info = Empty<int, Context>(dev_ctx, {batch_count});
int* info_ptr = reinterpret_cast<int*>(info.data());
GesvdjBatched<T>(dev_ctx,
batch_count,
n,
m,
std::min(m, n),
dev_ctx.template Alloc<T>(&x_tmp),
vh_data,
u_data,
s_data,
info_ptr,
!full_matrices);
auto UT_dim = U->dims();
std::swap(UT_dim[rank - 1], UT_dim[rank - 2]);  // swap to get the dims of U^T
U->Resize(UT_dim);  // the buffer currently holds U^T
auto tmp_U = TransposeLast2Dim<T>(dev_ctx, *U);
U->ShareDataWith(tmp_U);  // transpose back: U now holds the final U
}
} // namespace phi
PD_REGISTER_KERNEL(svd, // cuda_only
GPU,
ALL_LAYOUT,
phi::SvdKernel,
float,
double) {}
#endif // not PADDLE_WITH_HIP
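// A short note on the m/n swap in the GesvdjBatched call above (reasoning only):
// cuSOLVER expects column-major storage, so the row-major buffer of the m x n input
// X is read by gesvdj as the n x m matrix X^T. Writing X = U S VH gives
// X^T = VH^T S U^T, so gesvdj's "U" output is VH^T and its "V" output is U. Because
// those outputs are again column-major, the buffer passed as vh_data already reads
// as VH under row-major strides, while u_data reads as U^T -- which is why only U
// needs the final Resize + TransposeLast2Dim.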
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/activation_kernel.h"
#include "paddle/phi/kernels/diag_kernel.h"
#include "paddle/phi/kernels/elementwise_add_kernel.h"
#include "paddle/phi/kernels/elementwise_multiply_kernel.h"
#include "paddle/phi/kernels/elementwise_subtract_kernel.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/matmul_kernel.h"
#include "paddle/phi/kernels/slice_kernel.h"
namespace phi {
template <class T, class Context>
static DenseTensor Fill(const Context& ctx,
std::vector<int> shape,
float fill_value) {
DenseTensor ret;
ret.Resize(make_ddim(shape));
ctx.template Alloc<T>(&ret);
funcs::SetConstant<Context, T>()(ctx, &ret, T(fill_value));
return ret;
}
template <class T, class Context>
static DenseTensor Eye(const Context& dev_ctx, int n) {
auto output = Fill<T, Context>(dev_ctx, {n}, 1);
auto ret = Diag<T, Context>(dev_ctx, output, 0, 0);
return ret;
}
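// For example (hypothetical call, shown only to illustrate the composition):
// Eye<float>(dev_ctx, 3) Fill-s a length-3 vector of ones and Diag-s it into the
// 3 x 3 identity, which the gradient below uses to form the projectors
// (I_m - U U^T) and (I_n - VH^T VH).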
template <class T, class Context>
static DenseTensor Infinits(const Context& ctx, std::vector<int> shape) {
auto value = static_cast<T>(std::numeric_limits<double>::infinity());
return Fill<T, Context>(ctx, shape, value);
}
static DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) {
// don't copy data, only change the dims
DenseTensor out;
out.ShareDataWith(x);
std::vector<int> out_shape = phi::vectorize<int>(x.dims());
if (axis >= 0) {
auto index = (out_shape.begin() + axis);
out_shape.insert(index, 1);
} else if (axis < 0) {
auto index = (out_shape.end() + axis + 1);
out_shape.insert(index, 1);
}
out.Resize(phi::make_ddim(out_shape));
return out;
}
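// Shape-only example (no data is moved): for singular values s of shape [batch, k],
// Unsqueeze(s, -2) yields a [batch, 1, k] view and Unsqueeze(s, -1) a [batch, k, 1]
// view; the gradient below relies on this to broadcast S (or S^-1) along the rows
// or columns of k x k intermediates instead of materializing diag(S).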
template <typename T, typename Context>
void SvdGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& u,
const DenseTensor& vh,
const DenseTensor& s,
const DenseTensor& u_grad,
const DenseTensor& vh_grad,
const DenseTensor& s_grad,
bool full_matrices,
DenseTensor* x_grad) {
const auto& dX = *x_grad;
int m = dX.dims()[dX.dims().size() - 2];
int n = dX.dims()[dX.dims().size() - 1];
int k = s.dims()[s.dims().size() - 1];
DenseTensor U, VH, dU, dV, dVH;
if (full_matrices) {
// if full_matrices is set, slice the U and VT to k columns
U = SliceKernel<T, Context>(
dev_ctx, u, {u.dims().size() - 1}, {0}, {k}, {1}, {});
VH = SliceKernel<T, Context>(
dev_ctx, vh, {vh.dims().size() - 2}, {0}, {k}, {1}, {});
dU = SliceKernel<T, Context>(
dev_ctx, u_grad, {u_grad.dims().size() - 1}, {0}, {k}, {1}, {});
dVH = SliceKernel<T, Context>(
dev_ctx, vh_grad, {vh.dims().size() - 2}, {0}, {k}, {1}, {});
} else {
U = u;
VH = vh;
dU = u_grad;
dVH = vh_grad;
}
auto s_inverse = Pow<T, Context>(dev_ctx, s, -1);
auto s_square = Pow<T, Context>(dev_ctx, s, 2);
auto F = Subtract<T, Context>(
dev_ctx, Unsqueeze(s_square, -2), Unsqueeze(s_square, -1));
F = Add<T, Context>(
dev_ctx,
F,
Diag<T, Context>(dev_ctx, Infinits<T, Context>(dev_ctx, {k}), 0, 0));
F = Pow<T, Context>(dev_ctx, F, -1);
DenseTensor sigma_term;
DenseTensor u_term;
DenseTensor v_term;
// if (ctx.HasInput(framework::GradVarName("S")))
{
const DenseTensor& gS = s_grad;
sigma_term = Multiply<T, Context>(dev_ctx, Unsqueeze(gS, -2), U);
sigma_term = Matmul<T, Context>(dev_ctx, sigma_term, VH);
}
// if (ctx.HasInput(framework::GradVarName("U"))) {
{
auto UTG = Matmul<T, Context>(dev_ctx, U, dU, true, false);
auto GTU = Matmul<T, Context>(dev_ctx, dU, U, true, false);
u_term = Multiply<T, Context>(
dev_ctx,
Multiply<T, Context>(
dev_ctx, Subtract<T, Context>(dev_ctx, UTG, GTU), F),
Unsqueeze(s, -2));
u_term = Matmul<T, Context>(dev_ctx, U, u_term);
if (m > k) {
auto project =
Subtract<T, Context>(dev_ctx,
Eye<T, Context>(dev_ctx, m),
Matmul<T, Context>(dev_ctx, U, U, false, true));
u_term = Add<T, Context>(
dev_ctx,
u_term,
Multiply<T, Context>(dev_ctx,
Matmul<T, Context>(dev_ctx, project, dU),
Unsqueeze(s_inverse, -2)));
}
u_term = Matmul<T, Context>(dev_ctx, u_term, VH);
}
// }
// if (ctx.HasInput(framework::GradVarName("VH"))) {
{
auto UTG = Matmul<T, Context>(dev_ctx, VH, dVH, false, true);
auto GTU = Matmul<T, Context>(dev_ctx, dVH, VH, false, true);
v_term = Multiply<T, Context>(
dev_ctx,
Matmul<T, Context>(
dev_ctx,
Multiply<T, Context>(
dev_ctx, Subtract<T, Context>(dev_ctx, UTG, GTU), F),
VH),
Unsqueeze(s, -1));
if (n > k) {
auto project = Subtract<T, Context>(
dev_ctx,
Eye<T, Context>(dev_ctx, n),
Matmul<T, Context>(dev_ctx, VH, VH, true, false));
v_term = Add<T, Context>(
dev_ctx,
v_term,
Multiply<T, Context>(dev_ctx,
Matmul<T, Context>(dev_ctx, dVH, project),
Unsqueeze(s_inverse, -1)));
}
v_term = Matmul<T, Context>(dev_ctx, U, v_term);
}
*x_grad = Add<T, Context>(
dev_ctx, Add<T, Context>(dev_ctx, u_term, sigma_term), v_term);
}
} // namespace phi
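// For reference, the expression assembled by SvdGradKernel above is the standard
// SVD backward formula (stated here as a sketch; the code is the authoritative
// version). With thin factors X = U S VH and F defined elementwise by
// F_ij = 1 / (s_j^2 - s_i^2) for i != j, F_ii = 0 (the infinite diagonal followed
// by Pow(F, -1) yields exactly this), the three terms are:
//
//   sigma_term = U diag(dS) VH
//   u_term     = U [ F o (U^T dU - dU^T U) ] diag(S) VH
//                + (I_m - U U^T) dU diag(S)^-1 VH               (only when m > k)
//   v_term     = U diag(S) [ F o (VH dVH^T - dVH VH^T) ] VH
//                + U diag(S)^-1 dVH (I_n - VH^T VH)             (only when n > k)
//
//   dX = sigma_term + u_term + v_term,   where "o" is the elementwise product.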
......@@ -16,6 +16,7 @@
#include "paddle/phi/common/int_array.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/infermeta/unary.h"
namespace phi {
......@@ -29,4 +30,21 @@ void SliceRawKernel(const Context& ctx,
const std::vector<int64_t>& decrease_axis,
DenseTensor* out);
template <typename T, typename Context>
DenseTensor SliceKernel(const Context& ctx,
const DenseTensor& input,
const std::vector<int64_t>& axes,
const IntArray& starts,
const IntArray& ends,
const std::vector<int64_t>& infer_flags,
const std::vector<int64_t>& decrease_axis) {
DenseTensor dense_out;
MetaTensor meta_out(&dense_out);
SliceRawInferMeta(
input, axes, starts, ends, infer_flags, decrease_axis, &meta_out);
SliceRawKernel<T, Context>(
ctx, input, axes, starts, ends, infer_flags, decrease_axis, &dense_out);
return dense_out;
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void SvdGradKernel(const Context& dev_ctx,
const DenseTensor& X,
const DenseTensor& U,
const DenseTensor& VH,
const DenseTensor& S,
const DenseTensor& U_grad,
const DenseTensor& VH_grad,
const DenseTensor& S_grad,
bool full_matrices,
DenseTensor* X_grad);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void SvdKernel(const Context& dev_ctx,
const DenseTensor& X,
bool full_matrices,
DenseTensor* U,
DenseTensor* S,
DenseTensor* VH);
} // namespace phi
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature SvdGradOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("svd_grad",
{"X", "U", "VH", "S", "U@GRAD", "VH@GRAD", "S@GRAD"},
{"full_matrices"},
{"X@GRAD"});
}
} // namespace phi
PD_REGISTER_ARG_MAPPING_FN(svd_grad, phi::SvdGradOpArgumentMapping);