Add install check for multigpu (#18323)

* test=develop, add_install_check_for_multigpu * test=develop, refine code to use cuda_devices

Add install check for multigpu (#18323)
* test=develop, add_install_check_for_multigpu * test=develop, refine code to use cuda_devices
831a3e62 · Jiabin Yang · GitHub · f88e07a0 · 831a3e62
显示空白变更内容
内联并排

Showing with 33 addition and 31 deletion

python/paddle/fluid/install_check.py python/paddle/fluid/install_check.py +33 -31

未找到文件。
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -12,7 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from .framework import Program, program_guard, unique_name, default_startup_program
+import os
+from .framework import Program, program_guard, unique_name, cuda_places, cpu_places
 from .param_attr import ParamAttr
 from .initializer import Constant
 from . import layers
@@ -24,7 +25,6 @@ from . import core
 from . import compiler
 import logging
 import numpy as np
-import os

 __all__ = ['run_check']

@@ -48,39 +48,43 @@ def run_check():
    This func should not be called only if you need to verify installation
    '''
    print("Running Verify Fluid Program ... ")
-    use_cuda = False if not core.is_compiled_with_cuda() else True
-    place = core.CPUPlace() if not core.is_compiled_with_cuda(
-    ) else core.CUDAPlace(0)
-    np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-
-    if use_cuda:
-        if core.get_cuda_device_count() > 1:
-            os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
+
+    device_list = []
+    if core.is_compiled_with_cuda():
+        try:
+            core.get_cuda_device_count()
+        except Exception as e:
+            logging.warning(
+                "You are using GPU version Paddle Fluid, But Your CUDA Device is not set properly"
+                "\n Original Error is {}".format(e))
+            return 0
+        device_list = cuda_places()
    else:
-            os.environ['CUDA_VISIBLE_DEVICES'] = "0"
+        device_list = [core.CPUPlace(), core.CPUPlace()]
+
+    np_inp_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+    inp = []
+    for i in range(len(device_list)):
+        inp.append(np_inp_single)
+    np_inp_muti = np.array(inp)
+    np_inp_muti = np_inp_muti.reshape(len(device_list), 2, 2)

    def test_parallerl_exe():
        train_prog = Program()
        startup_prog = Program()
        scope = core.Scope()
-        if not use_cuda:
-            os.environ['CPU_NUM'] = "2"
        with executor.scope_guard(scope):
            with program_guard(train_prog, startup_prog):
                with unique_name.guard():
-                    places = []
                    build_strategy = compiler.BuildStrategy()
                    build_strategy.enable_inplace = True
                    build_strategy.memory_optimize = True
-                    inp = layers.data(
-                        name="inp", shape=[2, 2], append_batch_size=False)
+                    inp = layers.data(name="inp", shape=[2, 2])
                    simple_layer = SimpleLayer("simple_layer")
                    out = simple_layer(inp)
-                    exe = executor.Executor(place)
-                    if use_cuda:
-                        places = [core.CUDAPlace(0), core.CUDAPlace(1)]
-                    else:
-                        places = [core.CPUPlace(), core.CPUPlace()]
+                    exe = executor.Executor(
+                        core.CUDAPlace(0) if core.is_compiled_with_cuda() and
+                        (core.get_cuda_device_count() > 0) else core.CPUPlace())
                    loss = layers.mean(out)
                    loss.persistable = True
                    optimizer.SGD(learning_rate=0.01).minimize(loss)
@@ -89,19 +93,17 @@ def run_check():
                        train_prog).with_data_parallel(
                            build_strategy=build_strategy,
                            loss_name=loss.name,
-                            places=places)
+                            places=device_list)
                    exe.run(startup_prog)

                    exe.run(compiled_prog,
-                            feed={inp.name: np_inp},
+                            feed={inp.name: np_inp_muti},
                            fetch_list=[loss.name])

    def test_simple_exe():
        train_prog = Program()
        startup_prog = Program()
        scope = core.Scope()
-        if not use_cuda:
-            os.environ['CPU_NUM'] = "1"
        with executor.scope_guard(scope):
            with program_guard(train_prog, startup_prog):
                with unique_name.guard():
@@ -111,11 +113,11 @@ def run_check():
                    out0 = simple_layer0(inp0)
                    param_grads = backward.append_backward(
                        out0, parameter_list=[simple_layer0._fc1._w.name])[0]
-                    exe0 = executor.Executor(core.CPUPlace()
-                                             if not core.is_compiled_with_cuda()
-                                             else core.CUDAPlace(0))
+                    exe0 = executor.Executor(
+                        core.CUDAPlace(0) if core.is_compiled_with_cuda() and
+                        (core.get_cuda_device_count() > 0) else core.CPUPlace())
                    exe0.run(startup_prog)
-                    exe0.run(feed={inp0.name: np_inp},
+                    exe0.run(feed={inp0.name: np_inp_single},
                             fetch_list=[out0.name, param_grads[1].name])

    test_simple_exe()
@@ -130,7 +132,7 @@ def run_check():
    except Exception as e:
        logging.warning(
            "Your Paddle Fluid has some problem with multiple GPU. This may be caused by:"
-            "\n 1. There is only 1 GPU visible on your Device;"
+            "\n 1. There is only 1 or 0 GPU visible on your Device;"
            "\n 2. No.1 or No.2 GPU or both of them are occupied now"
            "\n 3. Wrong installation of NVIDIA-NCCL2, please follow instruction on https://github.com/NVIDIA/nccl-tests "
            "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
@@ -139,4 +141,4 @@ def run_check():
        print("\n Original Error is: {}".format(e))
        print(
            "Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! "
-            "\n Let's start deep Learning with Paddle Fluid now!")
+            "\n Let's start deep Learning with Paddle Fluid now")