// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef PADDLE_WITH_HIP  // HIP does not support cusolver

#include "paddle/phi/kernels/matrix_rank_tol_kernel.h"

#include <algorithm>
#include <limits>
#include <vector>

#include "paddle/fluid/memory/memory.h"
#include "paddle/phi/backends/dynload/cusolver.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/abs_kernel.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/compare_functors.h"
#include "paddle/phi/kernels/impl/matrix_rank_kernel_impl.h"
#include "paddle/phi/kernels/reduce_kernel.h"

namespace phi {

template <typename T>
void GesvdjBatched(const phi::GPUContext& dev_ctx,
                   int batchSize,
                   int m,
                   int n,
                   int k,
                   T* A,
                   T* U,
                   T* V,
                   T* S,
                   int* info,
                   int thin_UV = 1);

template <typename T>
void SyevjBatched(const phi::GPUContext& dev_ctx,
                  int batchSize,
                  int n,
                  T* A,
                  T* W,
                  int* info);

template <>
void GesvdjBatched<float>(const phi::GPUContext& dev_ctx,
                          int batchSize,
                          int m,
                          int n,
                          int k,
                          float* A,
                          float* U,
                          float* V,
                          float* S,
                          int* info,
                          int thin_UV) {
  // do not compute singular vectors
  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
  gesvdjInfo_t gesvdj_params = NULL;
  int lda = m;
  int ldu = m;
  int ldt = n;
  int lwork = 0;
  auto handle = dev_ctx.cusolver_dn_handle();
  PADDLE_ENFORCE_GPU_SUCCESS(
      dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSgesvdj_bufferSize(
      handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork,
      gesvdj_params));
  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
  int stride_A = lda * n;
  int stride_U = ldu * (thin_UV ? k : m);
  int stride_V = ldt * (thin_UV ? k : n);
  for (int i = 0; i < batchSize; i++) {
    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSgesvdj(
        handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i,
        U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork,
        info, gesvdj_params));
    // check the error info
    int error_info;
    paddle::memory::Copy(phi::CPUPlace(), &error_info, dev_ctx.GetPlace(),
                         info, sizeof(int), dev_ctx.stream());
    PADDLE_ENFORCE_EQ(
        error_info, 0,
        phi::errors::PreconditionNotMet(
            "For batch [%d]: CUSolver SVD error code is not zero: [%d]", i,
            error_info));
  }
  PADDLE_ENFORCE_GPU_SUCCESS(
      dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}
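// Double-precision specialization of GesvdjBatched. It mirrors the float path
// above, only swapping in the cusolverDnDgesvdj_bufferSize /
// cusolverDnDgesvdj entry points and a double-precision workspace.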
template <>
void GesvdjBatched<double>(const phi::GPUContext& dev_ctx,
                           int batchSize,
                           int m,
                           int n,
                           int k,
                           double* A,
                           double* U,
                           double* V,
                           double* S,
                           int* info,
                           int thin_UV) {
  // do not compute singular vectors
  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
  gesvdjInfo_t gesvdj_params = NULL;
  int lda = m;
  int ldu = m;
  int ldt = n;
  int lwork = 0;
  auto handle = dev_ctx.cusolver_dn_handle();
  PADDLE_ENFORCE_GPU_SUCCESS(
      dynload::cusolverDnCreateGesvdjInfo(&gesvdj_params));
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDgesvdj_bufferSize(
      handle, jobz, thin_UV, m, n, A, lda, S, U, ldu, V, ldt, &lwork,
      gesvdj_params));
  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
  int stride_A = lda * n;
  int stride_U = ldu * (thin_UV ? k : m);
  int stride_V = ldt * (thin_UV ? k : n);
  for (int i = 0; i < batchSize; ++i) {
    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDgesvdj(
        handle, jobz, thin_UV, m, n, A + stride_A * i, lda, S + k * i,
        U + stride_U * i, ldu, V + stride_V * i, ldt, workspace_ptr, lwork,
        info, gesvdj_params));
    // check the error info
    int error_info;
    paddle::memory::Copy(phi::CPUPlace(), &error_info, dev_ctx.GetPlace(),
                         info, sizeof(int), dev_ctx.stream());
    PADDLE_ENFORCE_EQ(
        error_info, 0,
        phi::errors::PreconditionNotMet(
            "For batch [%d]: CUSolver SVD error code is not zero: [%d]", i,
            error_info));
  }
  PADDLE_ENFORCE_GPU_SUCCESS(
      dynload::cusolverDnDestroyGesvdjInfo(gesvdj_params));
}

template <>
void SyevjBatched<float>(const phi::GPUContext& dev_ctx,
                         int batchSize,
                         int n,
                         float* A,
                         float* W,
                         int* info) {
  auto handle = dev_ctx.cusolver_dn_handle();
  // Compute eigenvalues only
  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
  // Matrices are stored column-major in cusolver. numpy and torch use the
  // lower triangle to compute eigenvalues, so the upper triangle is used here.
  cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
  int lda = n;
  int stride_A = lda * n;
  int lwork = 0;
  syevjInfo_t params = NULL;
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj_bufferSize(
      handle, jobz, uplo, n, A, lda, W, &lwork, params));
  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(float));
  float* workspace_ptr = reinterpret_cast<float*>(workspace->ptr());
  for (int i = 0; i < batchSize; i++) {
    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnSsyevj(
        handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr,
        lwork, info, params));
    // check the error info
    int error_info;
    paddle::memory::Copy(phi::CPUPlace(), &error_info, dev_ctx.GetPlace(),
                         info, sizeof(int), dev_ctx.stream());
    PADDLE_ENFORCE_EQ(
        error_info, 0,
        phi::errors::PreconditionNotMet(
            "For batch [%d]: CUSolver eigenvalue error code is not zero: [%d]",
            i, error_info));
  }
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params));
}
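// Double-precision specialization of SyevjBatched; identical to the float
// path above except that it calls cusolverDnDsyevj_bufferSize /
// cusolverDnDsyevj and allocates a double-precision workspace.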
template <>
void SyevjBatched<double>(const phi::GPUContext& dev_ctx,
                          int batchSize,
                          int n,
                          double* A,
                          double* W,
                          int* info) {
  auto handle = dev_ctx.cusolver_dn_handle();
  // Compute eigenvalues only
  const cusolverEigMode_t jobz = CUSOLVER_EIG_MODE_NOVECTOR;
  // upper triangle of A is stored
  cublasFillMode_t uplo = CUBLAS_FILL_MODE_UPPER;
  int lda = n;
  int stride_A = lda * n;
  int lwork = 0;
  syevjInfo_t params = NULL;
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnCreateSyevjInfo(&params));
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj_bufferSize(
      handle, jobz, uplo, n, A, lda, W, &lwork, params));
  auto workspace = paddle::memory::Alloc(dev_ctx, lwork * sizeof(double));
  double* workspace_ptr = reinterpret_cast<double*>(workspace->ptr());
  for (int i = 0; i < batchSize; i++) {
    PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDsyevj(
        handle, jobz, uplo, n, A + stride_A * i, lda, W + n * i, workspace_ptr,
        lwork, info, params));
    // check the error info
    int error_info;
    paddle::memory::Copy(phi::CPUPlace(), &error_info, dev_ctx.GetPlace(),
                         info, sizeof(int), dev_ctx.stream());
    PADDLE_ENFORCE_EQ(
        error_info, 0,
        phi::errors::PreconditionNotMet(
            "For batch [%d]: CUSolver eigenvalue error code is not zero: [%d]",
            i, error_info));
  }
  PADDLE_ENFORCE_GPU_SUCCESS(dynload::cusolverDnDestroySyevjInfo(params));
}

template <typename T, typename Context>
void MatrixRankTolKernel(const Context& dev_ctx,
                         const DenseTensor& x,
                         const DenseTensor& atol_tensor,
                         bool use_default_tol,
                         bool hermitian,
                         DenseTensor* out) {
  auto* x_data = x.data<T>();
  dev_ctx.template Alloc<int64_t>(out);

  auto dim_x = x.dims();
  auto dim_out = out->dims();
  int rows = dim_x[dim_x.size() - 2];
  int cols = dim_x[dim_x.size() - 1];
  int k = std::min(rows, cols);
  auto numel = x.numel();
  int batches = numel / (rows * cols);

  T rtol_T = 0;
  if (use_default_tol) {
    rtol_T = std::numeric_limits<T>::epsilon() * std::max(rows, cols);
  }
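  // A sketch of the semantics implemented below: with sigma denoting the
  // singular values (or absolute eigenvalues, in the hermitian case) of each
  // matrix in the batch,
  //   rank = sum_i [ sigma_i > max(atol, rtol * max(sigma)) ]
  // where atol is broadcast from atol_tensor and rtol is the default value
  // computed above when use_default_tol is set. This is consistent with the
  // convention used by numpy.linalg.matrix_rank and torch.linalg.matrix_rank.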
  // X must be copied once, because gesvdj destroys the content of its input
  // on exit.
  DenseTensor x_tmp;
  paddle::framework::TensorCopy(x, dev_ctx.GetPlace(), &x_tmp);
  auto info = paddle::memory::Alloc(dev_ctx, sizeof(int) * batches);
  int* info_ptr = reinterpret_cast<int*>(info->ptr());

  DenseTensor eigenvalue_tensor;
  eigenvalue_tensor.Resize(detail::GetEigenvalueDim(dim_x, k));
  auto* eigenvalue_data = dev_ctx.template Alloc<T>(&eigenvalue_tensor);

  if (hermitian) {
    SyevjBatched<T>(
        dev_ctx, batches, rows, x_tmp.data<T>(), eigenvalue_data, info_ptr);
    phi::AbsKernel<T, Context>(dev_ctx, eigenvalue_tensor, &eigenvalue_tensor);
  } else {
    DenseTensor U, VH;
    U.Resize(detail::GetUDDim(dim_x, k));
    VH.Resize(detail::GetVHDDim(dim_x, k));
    auto* u_data = dev_ctx.template Alloc<T>(&U);
    auto* vh_data = dev_ctx.template Alloc<T>(&VH);
    // cusolver assumes column-major storage, so each row-major (rows x cols)
    // slice is treated as its transpose: pass (cols, rows) and swap the roles
    // of U and V^H. The singular values are unchanged by transposition.
    GesvdjBatched<T>(dev_ctx, batches, cols, rows, k, x_tmp.data<T>(), vh_data,
                     u_data, eigenvalue_data, info_ptr, 1);
  }

  DenseTensor max_eigenvalue_tensor;
  dev_ctx.template Alloc<T>(&max_eigenvalue_tensor);
  max_eigenvalue_tensor.Resize(detail::RemoveLastDim(eigenvalue_tensor.dims()));
  phi::MaxKernel<T, Context>(dev_ctx, eigenvalue_tensor,
                             std::vector<int64_t>{-1}, false,
                             &max_eigenvalue_tensor);

  DenseTensor temp_rtol_tensor;
  temp_rtol_tensor =
      phi::Full<T, Context>(dev_ctx, {1}, static_cast<T>(rtol_T));

  DenseTensor rtol_tensor =
      phi::Multiply<T>(dev_ctx, temp_rtol_tensor, max_eigenvalue_tensor);

  DenseTensor tol_tensor;
  tol_tensor.Resize(dim_out);
  dev_ctx.template Alloc<T>(&tol_tensor);
  funcs::ElementwiseCompute<GreaterElementFunctor<T>, T, T>(
      dev_ctx, atol_tensor, rtol_tensor, -1, GreaterElementFunctor<T>(),
      &tol_tensor);

  tol_tensor.Resize(detail::NewAxisDim(tol_tensor.dims(), 1));

  DenseTensor compare_result;
  compare_result.Resize(detail::NewAxisDim(dim_out, k));
  dev_ctx.template Alloc<int64_t>(&compare_result);

  int axis = -1;
  funcs::ElementwiseCompute<funcs::GreaterThanFunctor<T, int64_t>, T, int64_t>(
      dev_ctx, eigenvalue_tensor, tol_tensor, axis,
      funcs::GreaterThanFunctor<T, int64_t>(), &compare_result);
  phi::SumKernel<int64_t>(dev_ctx, compare_result, std::vector<int64_t>{-1},
                          compare_result.dtype(), false, out);
}

}  // namespace phi

PD_REGISTER_KERNEL(matrix_rank_tol,  // cuda_only
                   GPU,
                   ALL_LAYOUT,
                   phi::MatrixRankTolKernel,
                   float,
                   double) {}

#endif  // not PADDLE_WITH_HIP
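// Note: based on the kernel name and signature, this file provides the GPU
// (CUDA-only) backend for the tolerance-aware matrix rank computation, e.g.
// what paddle.linalg.matrix_rank dispatches to when a tolerance is involved.
// The Python-level entry point is mentioned for orientation only and is an
// assumption; it is not defined in this translation unit.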