fix cublasLt workspace size (#43877)

6d436f6e · sneaxiy · GitHub · a8113a65 · 6d436f6e
隐藏空白更改
内联并排

Showing 1 changed file with 6 additions and 2 deletions

paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu +6 -2

未找到文件。
--- a/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
+++ b/paddle/fluid/operators/fused/fused_gemm_epilogue_op.cu
@@ -146,7 +146,9 @@ class FusedGemmEpilogueKernel : public framework::OpKernel<T> {
        &out_desc, mat_type, N, M, N));
    cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
-    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024 * 1024;
+    // NOTE(zengjinle): I do not know whether the 4MB workspace size is
+    // "enough". I just followed the settings from the NVIDIA MLPerf BERT code.
+    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;
    cudaStream_t stream = dev_ctx.stream();
    memory::allocation::AllocationPtr workspace =
        memory::Alloc(dev_ctx, workspace_size);
@@ -356,7 +358,9 @@ class FusedGemmEpilogueGradKernel : public framework::OpKernel<T> {
    }
    cublasLtHandle_t lt_handle = dev_ctx.cublaslt_handle();
-    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024 * 1024;
+    // NOTE(zengjinle): I do not know whether the 4MB workspace size is
+    // "enough". I just followed the settings from the NVIDIA MLPerf BERT code.
+    size_t workspace_size = static_cast<size_t>(4) * 1024 * 1024;
    const cublasLtMatmulAlgo_t* algo = nullptr;
    cudaStream_t stream = dev_ctx.stream();