From 33e4aca96c9b7908892f8d1165230556f593b948 Mon Sep 17 00:00:00 2001
From: Yuang Liu <liuyuang@baidu.com>
Date: Mon, 12 Jun 2023 11:24:37 +0800
Subject: [PATCH] Fix A100 CUDA 12 unit tests: set NVIDIA_TF32_OVERRIDE=0 and relax rtol (#54542)

---
 test/collective/CMakeLists.txt                | 16 ++++++++++++----
 test/collective/fleet/CMakeLists.txt          | 19 +++++++++++++------
 .../fleet/hybrid_parallel_mp_layers.py        |  2 +-
 test/distributed_passes/CMakeLists.txt        |  2 +-
 4 files changed, 27 insertions(+), 12 deletions(-)

diff --git a/test/collective/CMakeLists.txt b/test/collective/CMakeLists.txt
index ee29e8842e2..e2ee8e68336 100644
--- a/test/collective/CMakeLists.txt
+++ b/test/collective/CMakeLists.txt
@@ -200,8 +200,12 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
   py_test_modules(
-    test_collective_reduce_api MODULES test_collective_reduce_api ENVS
-    "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
+    test_collective_reduce_api
+    MODULES
+    test_collective_reduce_api
+    ENVS
+    "NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python"
+  )
   set_tests_properties(test_collective_reduce_api
                        PROPERTIES TIMEOUT "500" LABELS "RUN_TYPE=DIST")
 endif()
@@ -272,8 +276,12 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX))
 endif()
 if((WITH_GPU OR WITH_ROCM) AND (LINUX))
   py_test_modules(
-    test_collective_split_col_linear MODULES test_collective_split_col_linear
-    ENVS "http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python")
+    test_collective_split_col_linear
+    MODULES
+    test_collective_split_col_linear
+    ENVS
+    "NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=..:${PADDLE_BINARY_DIR}/python"
+  )
   set_tests_properties(test_collective_split_col_linear
                        PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
 endif()
diff --git a/test/collective/fleet/CMakeLists.txt b/test/collective/fleet/CMakeLists.txt
index aa27af240ac..47d6db03896 100644
--- a/test/collective/fleet/CMakeLists.txt
+++ b/test/collective/fleet/CMakeLists.txt
@@ -332,7 +332,7 @@ if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
     LABELS
     "RUN_TYPE=DIST"
     ENVS
-    "PADDLE_DIST_UT_PORT=21234;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    "NVIDIA_TF32_OVERRIDE=0;PADDLE_DIST_UT_PORT=21234;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
   )
   set_tests_properties(test_hybrid_parallel_inference_helper PROPERTIES TIMEOUT
                                                                         "120")
@@ -351,8 +351,12 @@ if((WITH_GPU OR WITH_ROCM) AND (LINUX OR WIN32))
 endif()
 if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
   py_test_modules(
-    test_recv_save_op MODULES test_recv_save_op ENVS
-    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+    test_recv_save_op
+    MODULES
+    test_recv_save_op
+    ENVS
+    "NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+  )
 endif()
 if(LOCAL_ALL_ARCH AND LOCAL_ALL_PLAT)
   py_test_modules(
@@ -696,7 +700,7 @@ if((WITH_GPU OR WITH_ROCM) AND LOCAL_ALL_PLAT)
     LABELS
     "RUN_TYPE=DIST"
     ENVS
-    "PADDLE_DIST_UT_PORT=21274;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+    "NVIDIA_TF32_OVERRIDE=0;PADDLE_DIST_UT_PORT=21274;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
   )
   set_tests_properties(test_parallel_dygraph_mnist PROPERTIES TIMEOUT "200")
 endif()
@@ -922,9 +926,12 @@ if((WITH_GPU) AND (LINUX))
 endif()
 if((WITH_GPU) AND (LINUX))
   py_test_modules(
-    test_dygraph_save_for_auto_infer MODULES test_dygraph_save_for_auto_infer
+    test_dygraph_save_for_auto_infer
+    MODULES
+    test_dygraph_save_for_auto_infer
     ENVS
-    "http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python")
+    "NVIDIA_TF32_OVERRIDE=0;http_proxy=;https_proxy=;PYTHONPATH=../..:${PADDLE_BINARY_DIR}/python"
+  )
   set_tests_properties(test_dygraph_save_for_auto_infer
                        PROPERTIES TIMEOUT "300" LABELS "RUN_TYPE=DIST")
 endif()
diff --git a/test/collective/fleet/hybrid_parallel_mp_layers.py b/test/collective/fleet/hybrid_parallel_mp_layers.py
index b8e57a9a11b..751bc9255c1 100644
--- a/test/collective/fleet/hybrid_parallel_mp_layers.py
+++ b/test/collective/fleet/hybrid_parallel_mp_layers.py
@@ -221,7 +221,7 @@ class TestDistTraning(unittest.TestCase):
             optimizer_b.step()
 
             np.testing.assert_allclose(
-                loss_a.numpy(), loss_b.numpy(), rtol=5e-6
+                loss_a.numpy(), loss_b.numpy(), rtol=5e-5
             )
 
     def test_parallel_embedding(self):
diff --git a/test/distributed_passes/CMakeLists.txt b/test/distributed_passes/CMakeLists.txt
index e2b8697fc85..79bc34620a4 100644
--- a/test/distributed_passes/CMakeLists.txt
+++ b/test/distributed_passes/CMakeLists.txt
@@ -27,7 +27,7 @@ if(NOT ((WITH_GPU) AND (CUDA_VERSION GREATER_EQUAL 11.6)))
 endif()
 
 foreach(TEST_OP ${TEST_OPS})
-  py_test_modules(${TEST_OP} MODULES ${TEST_OP})
+  py_test_modules(${TEST_OP} MODULES ${TEST_OP} ENVS "NVIDIA_TF32_OVERRIDE=0")
   list(APPEND DIST_TEST_OPS ${TEST_OP})
   set_tests_properties(${TEST_OP} PROPERTIES TIMEOUT 200)
   set_tests_properties(${TEST_OP} PROPERTIES LABELS "RUN_TYPE=DIST")
-- 
GitLab