From 7c30253841b5e1f6091d61aa0785b950d6c0a904 Mon Sep 17 00:00:00 2001 From: huangjiyi <43315610+huangjiyi@users.noreply.github.com> Date: Wed, 9 Nov 2022 20:55:26 +0800 Subject: [PATCH] [PHI decoupling] remove "paddle/fluid/platform/dynload/xxx.h" in phi (#47787) * rm "paddle/fluid/platform/dynload/cudnn.h" in phi * rm "paddle/fluid/platform/dynload/mklml.h" in phi * rm "paddle/fluid/platform/dynload/rocblas.h" in phi * replace "paddle::platform::dynload::" with "phi::dynload::" in phi * revert "blas_impl.cu.h" --- paddle/phi/kernels/funcs/blas/blas.h | 2 +- paddle/phi/kernels/funcs/blas/blas_impl.h | 248 ++++----- paddle/phi/kernels/funcs/blas/blas_impl.hip.h | 499 +++++++++--------- paddle/phi/kernels/funcs/cpu_vec.h | 10 +- paddle/phi/kernels/funcs/math_function.cc | 2 +- paddle/phi/kernels/gpu/rnn_functor.h | 104 ++-- .../phi/kernels/gpu/sync_batch_norm_utils.h | 16 +- paddle/phi/kernels/gpudnn/softmax_gpudnn.h | 31 +- paddle/phi/kernels/impl/conv_cudnn_impl.h | 2 +- 9 files changed, 439 insertions(+), 475 deletions(-) diff --git a/paddle/phi/kernels/funcs/blas/blas.h b/paddle/phi/kernels/funcs/blas/blas.h index 28c5e89516b..a44c24e971a 100644 --- a/paddle/phi/kernels/funcs/blas/blas.h +++ b/paddle/phi/kernels/funcs/blas/blas.h @@ -18,7 +18,7 @@ #include "paddle/phi/core/dense_tensor.h" #ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" +#include "paddle/phi/backends/dynload/mklml.h" #endif #ifdef PADDLE_WITH_LIBXSMM diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index a18ec953d0a..1607bd4c484 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -113,27 +113,27 @@ template <> struct CBlas { template static void GEMM(ARGS... args) { - paddle::platform::dynload::cblas_sgemm(args...); + phi::dynload::cblas_sgemm(args...); } template static float *GEMM_ALLOC(ARGS... args) { - return paddle::platform::dynload::cblas_sgemm_alloc(args...); + return phi::dynload::cblas_sgemm_alloc(args...); } template static void GEMM_PACK(ARGS... args) { - paddle::platform::dynload::cblas_sgemm_pack(args...); + phi::dynload::cblas_sgemm_pack(args...); } template static void GEMM_COMPUTE(ARGS... args) { - paddle::platform::dynload::cblas_sgemm_compute(args...); + phi::dynload::cblas_sgemm_compute(args...); } template static void GEMM_FREE(ARGS... args) { - paddle::platform::dynload::cblas_sgemm_free(args...); + phi::dynload::cblas_sgemm_free(args...); } #ifdef PADDLE_WITH_LIBXSMM @@ -145,93 +145,93 @@ struct CBlas { template static void AXPY(ARGS... args) { - paddle::platform::dynload::cblas_saxpy(args...); + phi::dynload::cblas_saxpy(args...); } template static void VCOPY(ARGS... args) { - paddle::platform::dynload::cblas_scopy(args...); + phi::dynload::cblas_scopy(args...); } template static void GEMV(ARGS... args) { - paddle::platform::dynload::cblas_sgemv(args...); + phi::dynload::cblas_sgemv(args...); } template static float DOT(ARGS... args) { - return paddle::platform::dynload::cblas_sdot(args...); + return phi::dynload::cblas_sdot(args...); } template static void SCAL(ARGS... args) { - paddle::platform::dynload::cblas_sscal(args...); + phi::dynload::cblas_sscal(args...); } template static float ASUM(ARGS... args) { - return paddle::platform::dynload::cblas_sasum(args...); + return phi::dynload::cblas_sasum(args...); } template static void GEMM_BATCH(ARGS... 
args) { - paddle::platform::dynload::cblas_sgemm_batch(args...); + phi::dynload::cblas_sgemm_batch(args...); } template static void VADD(ARGS... args) { - paddle::platform::dynload::vsAdd(args...); + phi::dynload::vsAdd(args...); } template static void VSUB(ARGS... args) { - paddle::platform::dynload::vsSub(args...); + phi::dynload::vsSub(args...); } template static void VMUL(ARGS... args) { - paddle::platform::dynload::vsMul(args...); + phi::dynload::vsMul(args...); } template static void VDIV(ARGS... args) { - paddle::platform::dynload::vsDiv(args...); + phi::dynload::vsDiv(args...); } template static void VEXP(ARGS... args) { - paddle::platform::dynload::vsExp(args...); + phi::dynload::vsExp(args...); } template static void VSQUARE(ARGS... args) { - paddle::platform::dynload::vsSqr(args...); + phi::dynload::vsSqr(args...); } template static void VPOW(ARGS... args) { - paddle::platform::dynload::vsPowx(args...); + phi::dynload::vsPowx(args...); } template static void VINV(ARGS... args) { - paddle::platform::dynload::vsInv(args...); + phi::dynload::vsInv(args...); } template static void VMERF(ARGS... args) { - paddle::platform::dynload::vmsErf(args...); + phi::dynload::vmsErf(args...); } #if !defined(_WIN32) template static void CSRMM(ARGS... args) { - paddle::platform::dynload::mkl_scsrmm(args...); + phi::dynload::mkl_scsrmm(args...); } #endif template static void TRSM(ARGS... args) { - paddle::platform::dynload::cblas_strsm(args...); + phi::dynload::cblas_strsm(args...); } }; @@ -239,27 +239,27 @@ template <> struct CBlas { template static void GEMM(ARGS... args) { - paddle::platform::dynload::cblas_dgemm(args...); + phi::dynload::cblas_dgemm(args...); } template static double *GEMM_ALLOC(ARGS... args) { - return paddle::platform::dynload::cblas_dgemm_alloc(args...); + return phi::dynload::cblas_dgemm_alloc(args...); } template static void GEMM_PACK(ARGS... args) { - paddle::platform::dynload::cblas_dgemm_pack(args...); + phi::dynload::cblas_dgemm_pack(args...); } template static void GEMM_COMPUTE(ARGS... args) { - paddle::platform::dynload::cblas_dgemm_compute(args...); + phi::dynload::cblas_dgemm_compute(args...); } template static void GEMM_FREE(ARGS... args) { - paddle::platform::dynload::cblas_dgemm_free(args...); + phi::dynload::cblas_dgemm_free(args...); } #ifdef PADDLE_WITH_LIBXSMM @@ -271,93 +271,93 @@ struct CBlas { template static void AXPY(ARGS... args) { - paddle::platform::dynload::cblas_daxpy(args...); + phi::dynload::cblas_daxpy(args...); } template static void VCOPY(ARGS... args) { - paddle::platform::dynload::cblas_dcopy(args...); + phi::dynload::cblas_dcopy(args...); } template static void GEMV(ARGS... args) { - paddle::platform::dynload::cblas_dgemv(args...); + phi::dynload::cblas_dgemv(args...); } template static double DOT(ARGS... args) { - return paddle::platform::dynload::cblas_ddot(args...); + return phi::dynload::cblas_ddot(args...); } template static void SCAL(ARGS... args) { - paddle::platform::dynload::cblas_dscal(args...); + phi::dynload::cblas_dscal(args...); } template static double ASUM(ARGS... args) { - return paddle::platform::dynload::cblas_dasum(args...); + return phi::dynload::cblas_dasum(args...); } template static void GEMM_BATCH(ARGS... args) { - paddle::platform::dynload::cblas_dgemm_batch(args...); + phi::dynload::cblas_dgemm_batch(args...); } template static void VADD(ARGS... args) { - paddle::platform::dynload::vdAdd(args...); + phi::dynload::vdAdd(args...); } template static void VSUB(ARGS... 
args) { - paddle::platform::dynload::vdSub(args...); + phi::dynload::vdSub(args...); } template static void VMUL(ARGS... args) { - paddle::platform::dynload::vdMul(args...); + phi::dynload::vdMul(args...); } template static void VDIV(ARGS... args) { - paddle::platform::dynload::vdDiv(args...); + phi::dynload::vdDiv(args...); } template static void VEXP(ARGS... args) { - paddle::platform::dynload::vdExp(args...); + phi::dynload::vdExp(args...); } template static void VSQUARE(ARGS... args) { - paddle::platform::dynload::vdSqr(args...); + phi::dynload::vdSqr(args...); } template static void VPOW(ARGS... args) { - paddle::platform::dynload::vdPowx(args...); + phi::dynload::vdPowx(args...); } template static void VINV(ARGS... args) { - paddle::platform::dynload::vdInv(args...); + phi::dynload::vdInv(args...); } template static void VMERF(ARGS... args) { - paddle::platform::dynload::vmdErf(args...); + phi::dynload::vmdErf(args...); } #if !defined(_WIN32) template static void CSRMM(ARGS... args) { - paddle::platform::dynload::mkl_dcsrmm(args...); + phi::dynload::mkl_dcsrmm(args...); } #endif template static void TRSM(ARGS... args) { - paddle::platform::dynload::cblas_dtrsm(args...); + phi::dynload::cblas_dtrsm(args...); } }; @@ -370,12 +370,12 @@ struct CBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - paddle::platform::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); + phi::dynload::cblas_caxpy(n, &alpha, X, incX, Y, incY); } template static void VCOPY(ARGS... args) { - paddle::platform::dynload::cblas_ccopy(args...); + phi::dynload::cblas_ccopy(args...); } // the libmklml_intel.so paddle used has no vcAdd, vcSub, @@ -384,22 +384,22 @@ struct CBlas> { /* template static void VADD(ARGS... args) { - paddle::platform::dynload::vcAdd(args...); + phi::dynload::vcAdd(args...); } template static void VSUB(ARGS... args) { - paddle::platform::dynload::vcSub(args...); + phi::dynload::vcSub(args...); } template static void VMUL(ARGS... args) { - paddle::platform::dynload::vcMul(args...); + phi::dynload::vcMul(args...); } template static void VDIV(ARGS... 
args) { - paddle::platform::dynload::vcDiv(args...); + phi::dynload::vcDiv(args...); } */ @@ -458,7 +458,7 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *x_ = (const void *)(X); void *y_ = static_cast(Y); - paddle::platform::dynload::cblas_cgemv( + phi::dynload::cblas_cgemv( layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); } @@ -480,20 +480,20 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *b_ = (const void *)(B); void *c_ = static_cast(C); - paddle::platform::dynload::cblas_cgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); + phi::dynload::cblas_cgemm(layout, + trans_a, + trans_b, + M, + N, + K, + &alpha, + a_, + lda, + b_, + ldb, + &beta, + c_, + ldc); } static void TRSM(CBLAS_LAYOUT layout, @@ -510,7 +510,7 @@ struct CBlas> { int ldb) { const void *a_ = (const void *)(A); void *b_ = static_cast(B); - paddle::platform::dynload::cblas_ctrsm( + phi::dynload::cblas_ctrsm( layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); } @@ -535,27 +535,27 @@ struct CBlas> { const void **B_void = (const void **)(&(*B)); void **C_void = reinterpret_cast(C); - paddle::platform::dynload::cblas_cgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); + phi::dynload::cblas_cgemm_batch(layout, + trans_a, + trans_b, + M, + N, + K, + alpha, + A_void, + lda, + B_void, + ldb, + beta, + C_void, + ldc, + group_count, + group_size); } template static void GEMM_EX(ARGS... args) { - paddle::platform::dynload::cblas_cgemm_batch(args...); + phi::dynload::cblas_cgemm_batch(args...); } }; @@ -568,12 +568,12 @@ struct CBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - paddle::platform::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY); + phi::dynload::cblas_zaxpy(n, &alpha, X, incX, Y, incY); } template static void VCOPY(ARGS... args) { - paddle::platform::dynload::cblas_zcopy(args...); + phi::dynload::cblas_zcopy(args...); } // the libmklml_intel.so paddle used has no vzAdd, vzSub, @@ -582,22 +582,22 @@ struct CBlas> { /* template static void VADD(ARGS... args) { - paddle::platform::dynload::vzAdd(args...); + phi::dynload::vzAdd(args...); } template static void VSUB(ARGS... args) { - paddle::platform::dynload::vzSub(args...); + phi::dynload::vzSub(args...); } template static void VMUL(ARGS... args) { - paddle::platform::dynload::vzMul(args...); + phi::dynload::vzMul(args...); } template static void VDIV(ARGS... 
args) { - paddle::platform::dynload::vzDiv(args...); + phi::dynload::vzDiv(args...); } */ @@ -656,7 +656,7 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *x_ = (const void *)(X); void *y_ = static_cast(Y); - paddle::platform::dynload::cblas_zgemv( + phi::dynload::cblas_zgemv( layout, trans, M, N, &alpha, a_, lda, x_, incx, &beta, y_, incy); } @@ -678,20 +678,20 @@ struct CBlas> { const void *a_ = (const void *)(A); const void *b_ = (const void *)(B); void *c_ = static_cast(C); - paddle::platform::dynload::cblas_zgemm(layout, - trans_a, - trans_b, - M, - N, - K, - &alpha, - a_, - lda, - b_, - ldb, - &beta, - c_, - ldc); + phi::dynload::cblas_zgemm(layout, + trans_a, + trans_b, + M, + N, + K, + &alpha, + a_, + lda, + b_, + ldb, + &beta, + c_, + ldc); } static void TRSM(CBLAS_LAYOUT layout, @@ -708,7 +708,7 @@ struct CBlas> { int ldb) { const void *a_ = (const void *)(A); void *b_ = static_cast(B); - paddle::platform::dynload::cblas_ztrsm( + phi::dynload::cblas_ztrsm( layout, side, uplo, trans_a, diag, M, N, &alpha, a_, lda, b_, ldb); } @@ -733,27 +733,27 @@ struct CBlas> { const void **B_void = (const void **)(&(*B)); void **C_void = reinterpret_cast(C); - paddle::platform::dynload::cblas_zgemm_batch(layout, - trans_a, - trans_b, - M, - N, - K, - alpha, - A_void, - lda, - B_void, - ldb, - beta, - C_void, - ldc, - group_count, - group_size); + phi::dynload::cblas_zgemm_batch(layout, + trans_a, + trans_b, + M, + N, + K, + alpha, + A_void, + lda, + B_void, + ldb, + beta, + C_void, + ldc, + group_count, + group_size); } template static void GEMM_EX(ARGS... args) { - paddle::platform::dynload::cblas_zgemm_batch(args...); + phi::dynload::cblas_zgemm_batch(args...); } }; diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h index b35882df1e5..cbde4fdbc81 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.hip.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.hip.h @@ -15,7 +15,7 @@ #pragma once #include "paddle/fluid/platform/device/gpu/gpu_info.h" -#include "paddle/fluid/platform/dynload/rocblas.h" +#include "paddle/phi/backends/dynload/rocblas.h" #include "paddle/phi/backends/gpu/gpu_context.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -31,38 +31,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_sgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_sgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_saxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_saxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_sscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_sscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_scopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_scopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_sgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_sgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... 
args) { PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_sgemm_strided_batched(args...)); + phi::dynload::rocblas_sgemm_strided_batched(args...)); } // HIP not supportted, refer to the doc here: @@ -75,8 +70,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_strsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_strsm(args...)); } template @@ -108,38 +102,33 @@ template <> struct CUBlas { template static void GEMM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_dgemm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dgemm(args...)); } template static void AXPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_daxpy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_daxpy(args...)); } template static void SCAL(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_dscal(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dscal(args...)); } template static void VCOPY(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_dcopy(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dcopy(args...)); } template static void GEMV(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_dgemv(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dgemv(args...)); } template static void GEMM_STRIDED_BATCH(ARGS... args) { PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_dgemm_strided_batched(args...)); + phi::dynload::rocblas_dgemm_strided_batched(args...)); } template @@ -150,8 +139,7 @@ struct CUBlas { template static void TRSM(ARGS... args) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_dtrsm(args...)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_dtrsm(args...)); } template @@ -197,7 +185,7 @@ struct CUBlas { const float16 *beta, float16 *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_hgemm( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm( handle, transa, transb, @@ -232,26 +220,25 @@ struct CUBlas { int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_hgemm_strided_batched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_hgemm_strided_batched( + handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + strideA, + reinterpret_cast(B), + ldb, + strideB, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc, + strideC, + batchCount)); } // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. 
@@ -277,31 +264,30 @@ struct CUBlas { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + C, + Ctype, + ldc, + computeType, + algo, + 0, + 0)); }); } }; @@ -320,7 +306,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_cgemv( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_cgemv( handle, transa, m, @@ -342,7 +328,7 @@ struct CUBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_caxpy( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_caxpy( handle, n, reinterpret_cast(alpha), @@ -370,26 +356,25 @@ struct CUBlas> { int ldc, long long int strideC, // NOLINT int batchCount) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_cgemm_strided_batched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_cgemm_strided_batched( + handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + strideA, + reinterpret_cast(B), + ldb, + strideB, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc, + strideC, + batchCount)); } static void GEMM(rocblas_handle handle, @@ -406,7 +391,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_cgemm( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_cgemm( handle, transa, transb, @@ -446,31 +431,30 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + C, + Ctype, + ldc, + computeType, + algo, + 0, + 0)); }); } }; @@ -489,7 +473,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_zgemv( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zgemv( handle, transa, m, @@ -511,7 +495,7 @@ struct CUBlas> { const int incX, phi::dtype::complex *Y, const int incY) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_zaxpy( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zaxpy( handle, n, reinterpret_cast(alpha), @@ -540,26 +524,25 @@ struct CUBlas> { int ldc, long long int strideC, // NOLINT int batchCount) { - 
PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_zgemm_strided_batched( - handle, - transa, - transb, - m, - n, - k, - reinterpret_cast(alpha), - reinterpret_cast(A), - lda, - strideA, - reinterpret_cast(B), - ldb, - strideB, - reinterpret_cast(beta), - reinterpret_cast(C), - ldc, - strideC, - batchCount)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zgemm_strided_batched( + handle, + transa, + transb, + m, + n, + k, + reinterpret_cast(alpha), + reinterpret_cast(A), + lda, + strideA, + reinterpret_cast(B), + ldb, + strideB, + reinterpret_cast(beta), + reinterpret_cast(C), + ldc, + strideC, + batchCount)); } static void GEMM(rocblas_handle handle, @@ -576,7 +559,7 @@ struct CUBlas> { const phi::dtype::complex *beta, phi::dtype::complex *C, int ldc) { - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::rocblas_zgemm( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_zgemm( handle, transa, transb, @@ -616,31 +599,30 @@ struct CUBlas> { rocblas_datatype computeType) { rocblas_gemm_algo algo = rocblas_gemm_algo_standard; dev_ctx->TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - transa, - transb, - m, - n, - k, - alpha, - A, - Atype, - lda, - B, - Btype, - ldb, - beta, - C, - Ctype, - ldc, - C, - Ctype, - ldc, - computeType, - algo, - 0, - 0)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::rocblas_gemm_ex(handle, + transa, + transb, + m, + n, + k, + alpha, + A, + Atype, + lda, + B, + Btype, + ldb, + beta, + C, + Ctype, + ldc, + C, + Ctype, + ldc, + computeType, + algo, + 0, + 0)); }); } }; @@ -778,30 +760,30 @@ inline void Blas::GEMM(CBLAS_TRANSPOSE transA, context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_ex(handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_bf16_r, - ldb, - A, - rocblas_datatype_bf16_r, - lda, - &h_beta, - C, - rocblas_datatype_bf16_r, - N, - C, - rocblas_datatype_bf16_r, - N, - rocblas_datatype_f32_r, - algo, - 0, - 0)); + phi::dynload::rocblas_gemm_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + A, + rocblas_datatype_bf16_r, + lda, + &h_beta, + C, + rocblas_datatype_bf16_r, + N, + C, + rocblas_datatype_bf16_r, + N, + rocblas_datatype_f32_r, + algo, + 0, + 0)); }); } @@ -1159,24 +1141,24 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, const int64_t strideC = M * N; context_.CublasCall([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_sgemm_strided_batched(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount)); + phi::dynload::rocblas_sgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + strideA, + &beta, + C, + ldc, + strideC, + batchCount)); }); } @@ -1209,24 +1191,24 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, const int64_t strideC = M * N; context_.CublasCall([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_dgemm_strided_batched(handle, - cuTransB, - cuTransA, - N, - M, - K, - &alpha, - B, - ldb, - strideB, - A, - lda, - strideA, - &beta, - C, - ldc, - strideC, - batchCount)); + phi::dynload::rocblas_dgemm_strided_batched(handle, + cuTransB, + cuTransA, + N, + M, + K, + &alpha, + B, + ldb, + strideB, + A, + lda, + 
strideA, + &beta, + C, + ldc, + strideC, + batchCount)); }); } @@ -1261,36 +1243,35 @@ inline void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, context_.TensorCoreCublasCallIfAvailable([&](rocblas_handle handle) { PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::rocblas_gemm_strided_batched_ex( - handle, - cuTransB, - cuTransA, - N, - M, - K, - &h_alpha, - B, - rocblas_datatype_bf16_r, - ldb, - strideB, - A, - rocblas_datatype_bf16_r, - lda, - strideA, - &h_beta, - C, - rocblas_datatype_bf16_r, - ldc, - strideC, - C, - rocblas_datatype_bf16_r, - ldc, - strideC, - batchCount, - rocblas_datatype_f32_r, - algo, - 0, - 0)); + phi::dynload::rocblas_gemm_strided_batched_ex(handle, + cuTransB, + cuTransA, + N, + M, + K, + &h_alpha, + B, + rocblas_datatype_bf16_r, + ldb, + strideB, + A, + rocblas_datatype_bf16_r, + lda, + strideA, + &h_beta, + C, + rocblas_datatype_bf16_r, + ldc, + strideC, + C, + rocblas_datatype_bf16_r, + ldc, + strideC, + batchCount, + rocblas_datatype_f32_r, + algo, + 0, + 0)); }); } diff --git a/paddle/phi/kernels/funcs/cpu_vec.h b/paddle/phi/kernels/funcs/cpu_vec.h index 7bb2a5fcfb3..21a0b429c99 100644 --- a/paddle/phi/kernels/funcs/cpu_vec.h +++ b/paddle/phi/kernels/funcs/cpu_vec.h @@ -21,7 +21,7 @@ limitations under the License. */ #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" +#include "paddle/phi/backends/dynload/mklml.h" #endif namespace phi { @@ -60,23 +60,23 @@ inline void vec_exp(const int n, const float* x, float* y) { y[i] = std::exp(x[i]); } } else { - paddle::platform::dynload::vsExp(n, x, y); + phi::dynload::vsExp(n, x, y); } } template <> inline void vec_exp(const int n, const double* x, double* y) { - paddle::platform::dynload::vdExp(n, x, y); + phi::dynload::vdExp(n, x, y); } template <> inline void vec_scal(const int n, const float a, float* x) { - paddle::platform::dynload::cblas_sscal(n, a, x, 1); + phi::dynload::cblas_sscal(n, a, x, 1); } template <> inline void vec_scal(const int n, const double a, double* x) { - paddle::platform::dynload::cblas_dscal(n, a, x, 1); + phi::dynload::cblas_dscal(n, a, x, 1); } #endif diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 8b6fd117e96..5f7524cde59 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/phi/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" +#include "paddle/phi/backends/dynload/mklml.h" #endif #ifdef PADDLE_USE_OPENBLAS diff --git a/paddle/phi/kernels/gpu/rnn_functor.h b/paddle/phi/kernels/gpu/rnn_functor.h index fb8e07b8f14..14778dc1847 100644 --- a/paddle/phi/kernels/gpu/rnn_functor.h +++ b/paddle/phi/kernels/gpu/rnn_functor.h @@ -102,14 +102,12 @@ class RNNDescriptors { if (!is_test_ && !is_initialized) { #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenDropoutGetStatesSize(handle, - &state_size)); + phi::dynload::miopenDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); #else PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnDropoutGetStatesSize(handle, - &state_size)); + phi::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); dropout_state->mutable_data({static_cast(state_size)}, place); #endif @@ -124,33 +122,31 @@ class RNNDescriptors { // ------------------- cudnn rnn descriptors --------------------- #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenSetRNNDescriptor_V2( - rnn_desc_.desc(), - hidden_size_, - num_layers_, - dropout_desc_.desc(), - miopenRNNlinear, - is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, - mode_, - miopenRNNwithBias, - miopenRNNdefault, - cudnn_type)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSetRNNDescriptor_V2( + rnn_desc_.desc(), + hidden_size_, + num_layers_, + dropout_desc_.desc(), + miopenRNNlinear, + is_bidirec_ ? miopenRNNbidirection : miopenRNNunidirection, + mode_, + miopenRNNwithBias, + miopenRNNdefault, + cudnn_type)); #elif CUDNN_VERSION >= 6000 - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnSetRNNDescriptor_v6( - handle, - rnn_desc_.desc(), - hidden_size_, - num_layers_, - dropout_desc_.desc(), - CUDNN_LINEAR_INPUT, - is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, - mode_, - CUDNN_RNN_ALGO_STANDARD, - cudnn_type)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor_v6( + handle, + rnn_desc_.desc(), + hidden_size_, + num_layers_, + dropout_desc_.desc(), + CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, + mode_, + CUDNN_RNN_ALGO_STANDARD, + cudnn_type)); #else - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSetRNNDescriptor( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNDescriptor( rnn_desc_.desc(), hidden_size_, num_layers_, @@ -163,20 +159,18 @@ class RNNDescriptors { #if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201 if (!sequence_length.empty()) { - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnSetRNNPaddingMode( - rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSetRNNPaddingMode( + rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); } #endif // ------------------- cudnn weights_size --------------------- size_t weights_size_; #ifdef PADDLE_WITH_HIP - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenGetRNNParamsSize( - handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNParamsSize( + handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #else - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnGetRNNParamsSize( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNParamsSize( handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); #endif PADDLE_ENFORCE_EQ( @@ -192,32 +186,22 @@ class RNNDescriptors { // ------------------- cudnn workspace, reserve size --------------------- #ifdef PADDLE_WITH_HIP PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenGetRNNWorkspaceSize(handle, - rnn_desc_.desc(), - seq_length_, - x_descs_.data(), - workspace_size)); - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenGetRNNTrainingReserveSize( - handle, - rnn_desc_.desc(), - seq_length_, - x_descs_.data(), - reserve_size)); + phi::dynload::miopenGetRNNWorkspaceSize(handle, + rnn_desc_.desc(), + seq_length_, + x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenGetRNNTrainingReserveSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #else PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnGetRNNWorkspaceSize(handle, - rnn_desc_.desc(), - seq_length_, - x_descs_.data(), - workspace_size)); - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::cudnnGetRNNTrainingReserveSize( - handle, - rnn_desc_.desc(), - seq_length_, - x_descs_.data(), - reserve_size)); + phi::dynload::cudnnGetRNNWorkspaceSize(handle, + rnn_desc_.desc(), + seq_length_, + x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), reserve_size)); #endif } #ifdef PADDLE_WITH_HIP diff --git a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h index 6cc29e71cce..544544591f3 100644 --- a/paddle/phi/kernels/gpu/sync_batch_norm_utils.h +++ b/paddle/phi/kernels/gpu/sync_batch_norm_utils.h @@ -434,14 +434,14 @@ void SyncBatchNormGradFunctor( int dtype = paddle::platform::ToNCCLDataType( paddle::framework::TransToProtoVarType(scale.dtype())); // In-place operation - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::ncclAllReduce( - stats, - stats, - 2 * C + 1, - static_cast(dtype), - ncclSum, - comm, - stream)); + PADDLE_ENFORCE_GPU_SUCCESS( + phi::dynload::ncclAllReduce(stats, + stats, + 2 * C + 1, + static_cast(dtype), + ncclSum, + comm, + stream)); VLOG(3) << "Sync result using all reduce"; } #endif diff --git 
a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h index a079827ca9f..99cd4c9b6d8 100644 --- a/paddle/phi/kernels/gpudnn/softmax_gpudnn.h +++ b/paddle/phi/kernels/gpudnn/softmax_gpudnn.h @@ -1063,7 +1063,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::miopenSoftmaxForward_V2( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2( handle, paddle::platform::CudnnDataType::kOne(), desc, @@ -1078,7 +1078,7 @@ void SoftmaxForwardCudnnKernel(const GPUContext& dev_ctx, auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxForward( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward( handle, algo, mode, @@ -1135,25 +1135,24 @@ void SoftmaxBackwardCudnnKernel(const GPUContext& dev_ctx, auto mode = axis == rank - 1 ? MIOPEN_SOFTMAX_MODE_INSTANCE : MIOPEN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? MIOPEN_SOFTMAX_LOG : MIOPEN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS( - paddle::platform::dynload::miopenSoftmaxBackward_V2( - handle, - paddle::platform::CudnnDataType::kOne(), - desc, - out_data, - desc, - dout_data, - paddle::platform::CudnnDataType::kZero(), - desc, - dx_data, - algo, - mode)); + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxBackward_V2( + handle, + paddle::platform::CudnnDataType::kOne(), + desc, + out_data, + desc, + dout_data, + paddle::platform::CudnnDataType::kZero(), + desc, + dx_data, + algo, + mode)); #else cudnnTensorDescriptor_t desc = scoped_desc.descriptor(layout, tensor_dims); auto mode = axis == rank - 1 ? CUDNN_SOFTMAX_MODE_INSTANCE : CUDNN_SOFTMAX_MODE_CHANNEL; auto algo = log_mode ? CUDNN_SOFTMAX_LOG : CUDNN_SOFTMAX_ACCURATE; - PADDLE_ENFORCE_GPU_SUCCESS(paddle::platform::dynload::cudnnSoftmaxBackward( + PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxBackward( handle, algo, mode, diff --git a/paddle/phi/kernels/impl/conv_cudnn_impl.h b/paddle/phi/kernels/impl/conv_cudnn_impl.h index 02b0ab64471..b66dd60279f 100644 --- a/paddle/phi/kernels/impl/conv_cudnn_impl.h +++ b/paddle/phi/kernels/impl/conv_cudnn_impl.h @@ -24,9 +24,9 @@ #endif #include "paddle/fluid/platform/cudnn_workspace_helper.h" -#include "paddle/fluid/platform/dynload/cudnn.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/phi/backends/dynload/cudnn.h" #include "paddle/phi/kernels/cpu/conv_util.h" #include "paddle/phi/kernels/funcs/batch_norm_utils.h" #include "paddle/phi/kernels/funcs/padding.h" -- GitLab
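
The change above is mechanical: phi carries its own copies of the dynload wrappers under paddle/phi/backends/dynload/, so each call site swaps the namespace from paddle::platform::dynload:: to phi::dynload:: and, where a header is involved, the include path from paddle/fluid/platform/dynload/ to paddle/phi/backends/dynload/. A minimal sketch of the pattern follows, built around the cblas_sscal call that appears verbatim in the cpu_vec.h hunk; the ScaleInPlace helper and its name are illustrative only, not part of the patch:

    // Sketch of the call-site migration this patch applies throughout phi.
    // After the patch, phi code includes its own dynload header:
    #include "paddle/phi/backends/dynload/mklml.h"
    // (before: #include "paddle/fluid/platform/dynload/mklml.h")

    namespace phi {
    namespace funcs {

    // Hypothetical helper: scales a float vector in place through the
    // dynamically loaded MKL cblas_sscal, resolved via phi's dynload layer.
    inline void ScaleInPlace(int n, float a, float* x) {
      // Before: paddle::platform::dynload::cblas_sscal(n, a, x, 1);
      phi::dynload::cblas_sscal(n, a, x, 1);
    }

    }  // namespace funcs
    }  // namespace phi

Routing these calls through phi::dynload removes the touched kernels' remaining compile-time dependency on paddle/fluid/platform/dynload, which is the point of the PHI decoupling series this commit belongs to.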