test=develop, add add_multi_gpu_install_check (#18157)

* test=develop, add add_multi_gpu_install_check
* test=develop, refine warning doc
* test=develop, refine warning doc
* test=develop, refine warning doc
* test=develop, support multi cpu

test=develop, add add_multi_gpu_install_check (#18157)
* test=develop, add add_multi_gpu_install_check
* test=develop, refine warning doc
* test=develop, refine warning doc
* test=develop, refine warning doc
* test=develop, support multi cpu
991c94f1 · Jiabin Yang · GitHub · bbc29292 · 991c94f1 · 991c94f1
Showing 2 changed files with 99 additions and 22 deletions

python/paddle/fluid/install_check.py python/paddle/fluid/install_check.py +95 -22

python/paddle/fluid/tests/unittests/CMakeLists.txt python/paddle/fluid/tests/unittests/CMakeLists.txt +4 -0

No files found.
--- a/python/paddle/fluid/install_check.py
+++ b/python/paddle/fluid/install_check.py
@@ -19,9 +19,12 @@ from . import layers
 from . import backward
 from .dygraph import Layer, nn
 from . import executor
-
+from . import optimizer
 from . import core
+from . import compiler
+import logging
 import numpy as np
+import os

 __all__ = ['run_check']

@@ -45,25 +48,95 @@ def run_check():
    This func should not be called only if you need to verify installation
    '''
    print("Running Verify Fluid Program ... ")
-    prog = Program()
-    startup_prog = Program()
-    scope = core.Scope()
-    with executor.scope_guard(scope):
-        with program_guard(prog, startup_prog):
-            with unique_name.guard():
-                np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
-                inp = layers.data(
-                    name="inp", shape=[2, 2], append_batch_size=False)
-                simple_layer = SimpleLayer("simple_layer")
-                out = simple_layer(inp)
-                param_grads = backward.append_backward(
-                    out, parameter_list=[simple_layer._fc1._w.name])[0]
-                exe = executor.Executor(core.CPUPlace(
-                ) if not core.is_compiled_with_cuda() else core.CUDAPlace(0))
-                exe.run(default_startup_program())
-                exe.run(feed={inp.name: np_inp},
-                        fetch_list=[out.name, param_grads[1].name])
+    use_cuda = False if not core.is_compiled_with_cuda() else True
+    place = core.CPUPlace() if not core.is_compiled_with_cuda(
+    ) else core.CUDAPlace(0)
+    np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32)
+
+    if use_cuda:
+        if core.get_cuda_device_count() > 1:
+            os.environ['CUDA_VISIBLE_DEVICES'] = "0,1"
+        else:
+            os.environ['CUDA_VISIBLE_DEVICES'] = "0"
+
+    def test_parallerl_exe():
+        train_prog = Program()
+        startup_prog = Program()
+        scope = core.Scope()
+        if not use_cuda:
+            os.environ['CPU_NUM'] = "2"
+        with executor.scope_guard(scope):
+            with program_guard(train_prog, startup_prog):
+                with unique_name.guard():
+                    places = []
+                    build_strategy = compiler.BuildStrategy()
+                    build_strategy.enable_inplace = True
+                    build_strategy.memory_optimize = True
+                    inp = layers.data(
+                        name="inp", shape=[2, 2], append_batch_size=False)
+                    simple_layer = SimpleLayer("simple_layer")
+                    out = simple_layer(inp)
+                    exe = executor.Executor(place)
+                    if use_cuda:
+                        places = [core.CUDAPlace(0), core.CUDAPlace(1)]
+                    else:
+                        places = [core.CPUPlace(), core.CPUPlace()]
+                    loss = layers.mean(out)
+                    loss.persistable = True
+                    optimizer.SGD(learning_rate=0.01).minimize(loss)
+                    startup_prog.random_seed = 1
+                    compiled_prog = compiler.CompiledProgram(
+                        train_prog).with_data_parallel(
+                            build_strategy=build_strategy,
+                            loss_name=loss.name,
+                            places=places)
+                    exe.run(startup_prog)
+
+                    exe.run(compiled_prog,
+                            feed={inp.name: np_inp},
+                            fetch_list=[loss.name])
+
+    def test_simple_exe():
+        train_prog = Program()
+        startup_prog = Program()
+        scope = core.Scope()
+        if not use_cuda:
+            os.environ['CPU_NUM'] = "1"
+        with executor.scope_guard(scope):
+            with program_guard(train_prog, startup_prog):
+                with unique_name.guard():
+                    inp0 = layers.data(
+                        name="inp", shape=[2, 2], append_batch_size=False)
+                    simple_layer0 = SimpleLayer("simple_layer")
+                    out0 = simple_layer0(inp0)
+                    param_grads = backward.append_backward(
+                        out0, parameter_list=[simple_layer0._fc1._w.name])[0]
+                    exe0 = executor.Executor(core.CPUPlace()
+                                             if not core.is_compiled_with_cuda()
+                                             else core.CUDAPlace(0))
+                    exe0.run(startup_prog)
+                    exe0.run(feed={inp0.name: np_inp},
+                             fetch_list=[out0.name, param_grads[1].name])
+
+    test_simple_exe()
+
+    print("Your Paddle Fluid works well on SINGLE GPU or CPU.")
+    try:
+        test_parallerl_exe()
+        print("Your Paddle Fluid works well on MUTIPLE GPU or CPU.")
+        print(
+            "Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now"
+        )
+    except Exception as e:
+        logging.warning(
+            "Your Paddle Fluid has some problem with multiple GPU. This may be caused by:"
+            "\n 1. There is only 1 GPU visible on your Device;"
+            "\n 2. No.1 or No.2 GPU or both of them are occupied now"
+            "\n 3. Wrong installation of NVIDIA-NCCL2, please follow instruction on https://github.com/NVIDIA/nccl-tests "
+            "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"
+        )

-    print(
-        "Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now"
-    )
+        print("\n Original Error is: {}".format(e))
+        print(
+            "Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! "
+            "\n Let's start deep Learning with Paddle Fluid now")
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -116,6 +116,7 @@ list(REMOVE_ITEM TEST_OPS test_imperative_mnist)
 list(REMOVE_ITEM TEST_OPS test_ir_memory_optimize_transformer)
 list(REMOVE_ITEM TEST_OPS test_layers)
 list(REMOVE_ITEM TEST_OPS test_imperative_ocr_attention_model)
+list(REMOVE_ITEM TEST_OPS test_install_check)

 # Some ops need to check results when gc is enabled
 # Currently, only ops that register NoNeedBufferVarsInference need to do this test   
@@ -172,6 +173,9 @@ py_test_modules(test_imperative_mnist_sorted_gradient MODULES test_imperative_mn
 py_test_modules(test_imperative_se_resnext MODULES test_imperative_se_resnext ENVS
    FLAGS_cudnn_deterministic=1 SERIAL)
 set_tests_properties(test_imperative_se_resnext PROPERTIES LABELS "RUN_TYPE=EXCLUSIVE")
+py_test_modules(test_install_check MODULES test_install_check ENVS
+        FLAGS_cudnn_deterministic=1 SERIAL)
+set_tests_properties(test_install_check PROPERTIES LABELS "RUN_TYPE=DIST")

 if(WITH_DISTRIBUTE)
    py_test_modules(test_dist_train MODULES test_dist_train)