From 7fd20772119b53b3f43ba28a85872830b5d34207 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Mon, 9 Nov 2020 18:43:27 +0800 Subject: [PATCH] set NCCL_SHM_DISABLE=1 for test_parallel_executor_profiler.py (#28484) --- .../tests/unittests/test_parallel_executor_profiler.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py index 62ecb2207c..0fac0610fd 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py @@ -19,6 +19,14 @@ import numpy as np import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.tests.unittests.test_profiler import TestProfiler +import os + +# NCCL 2.7 uses shared memory where NCCL 2.6 did not, which causes the error: +# include/shm.h:28 NCCL WARN Call to posix_fallocate failed: No space left on device +# +# Set the environment variable NCCL_SHM_DISABLE=1 to disable the Shared Memory (SHM) transport +# and force NCCL to use P2P, which is the default transport in NCCL 2.6. +os.environ['NCCL_SHM_DISABLE'] = str(1) class TestPEProfiler(TestProfiler): -- GitLab