From 831a3e62cae1d8b84d8ca25c0da6124cbe99e239 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Tue, 25 Jun 2019 20:35:09 +0800 Subject: [PATCH] Add install check for multigpu (#18323) * test=develop, add_install_check_for_multigpu * test=develop, refine code to use cuda_devices --- python/paddle/fluid/install_check.py | 64 ++++++++++++++-------------- 1 file changed, 33 insertions(+), 31 deletions(-) diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index 527ae3a914..05907562e5 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -12,7 +12,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .framework import Program, program_guard, unique_name, default_startup_program +import os +from .framework import Program, program_guard, unique_name, cuda_places, cpu_places from .param_attr import ParamAttr from .initializer import Constant from . import layers @@ -24,7 +25,6 @@ from . import core from . import compiler import logging import numpy as np -import os __all__ = ['run_check'] @@ -48,39 +48,43 @@ def run_check(): This func should not be called only if you need to verify installation ''' print("Running Verify Fluid Program ... ") - use_cuda = False if not core.is_compiled_with_cuda() else True - place = core.CPUPlace() if not core.is_compiled_with_cuda( - ) else core.CUDAPlace(0) - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - if use_cuda: - if core.get_cuda_device_count() > 1: - os.environ['CUDA_VISIBLE_DEVICES'] = "0,1" - else: - os.environ['CUDA_VISIBLE_DEVICES'] = "0" + device_list = [] + if core.is_compiled_with_cuda(): + try: + core.get_cuda_device_count() + except Exception as e: + logging.warning( + "You are using GPU version Paddle Fluid, But Your CUDA Device is not set properly" + "\n Original Error is {}".format(e)) + return 0 + device_list = cuda_places() + else: + device_list = [core.CPUPlace(), core.CPUPlace()] + + np_inp_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + inp = [] + for i in range(len(device_list)): + inp.append(np_inp_single) + np_inp_muti = np.array(inp) + np_inp_muti = np_inp_muti.reshape(len(device_list), 2, 2) def test_parallerl_exe(): train_prog = Program() startup_prog = Program() scope = core.Scope() - if not use_cuda: - os.environ['CPU_NUM'] = "2" with executor.scope_guard(scope): with program_guard(train_prog, startup_prog): with unique_name.guard(): - places = [] build_strategy = compiler.BuildStrategy() build_strategy.enable_inplace = True build_strategy.memory_optimize = True - inp = layers.data( - name="inp", shape=[2, 2], append_batch_size=False) + inp = layers.data(name="inp", shape=[2, 2]) simple_layer = SimpleLayer("simple_layer") out = simple_layer(inp) - exe = executor.Executor(place) - if use_cuda: - places = [core.CUDAPlace(0), core.CUDAPlace(1)] - else: - places = [core.CPUPlace(), core.CPUPlace()] + exe = executor.Executor( + core.CUDAPlace(0) if core.is_compiled_with_cuda() and + (core.get_cuda_device_count() > 0) else core.CPUPlace()) loss = layers.mean(out) loss.persistable = True optimizer.SGD(learning_rate=0.01).minimize(loss) @@ -89,19 +93,17 @@ def run_check(): train_prog).with_data_parallel( build_strategy=build_strategy, loss_name=loss.name, - places=places) + places=device_list) exe.run(startup_prog) exe.run(compiled_prog, - feed={inp.name: np_inp}, + feed={inp.name: np_inp_muti}, fetch_list=[loss.name]) def test_simple_exe(): train_prog = Program() startup_prog = Program() scope = core.Scope() - if not use_cuda: - os.environ['CPU_NUM'] = "1" with executor.scope_guard(scope): with program_guard(train_prog, startup_prog): with unique_name.guard(): @@ -111,11 +113,11 @@ def run_check(): out0 = simple_layer0(inp0) param_grads = backward.append_backward( out0, parameter_list=[simple_layer0._fc1._w.name])[0] - exe0 = executor.Executor(core.CPUPlace() - if not core.is_compiled_with_cuda() - else core.CUDAPlace(0)) + exe0 = executor.Executor( + core.CUDAPlace(0) if core.is_compiled_with_cuda() and + (core.get_cuda_device_count() > 0) else core.CPUPlace()) exe0.run(startup_prog) - exe0.run(feed={inp0.name: np_inp}, + exe0.run(feed={inp0.name: np_inp_single}, fetch_list=[out0.name, param_grads[1].name]) test_simple_exe() @@ -130,7 +132,7 @@ def run_check(): except Exception as e: logging.warning( "Your Paddle Fluid has some problem with multiple GPU. This may be caused by:" - "\n 1. There is only 1 GPU visible on your Device;" + "\n 1. There is only 1 or 0 GPU visible on your Device;" "\n 2. No.1 or No.2 GPU or both of them are occupied now" "\n 3. Wrong installation of NVIDIA-NCCL2, please follow instruction on https://github.com/NVIDIA/nccl-tests " "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html" @@ -139,4 +141,4 @@ def run_check(): print("\n Original Error is: {}".format(e)) print( "Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! " - "\n Let's start deep Learning with Paddle Fluid now!") + "\n Let's start deep Learning with Paddle Fluid now") -- GitLab