Unverified commit e44958c4, authored by wopeizl, committed via GitHub

use the fluid.core.get_cuda_device_count to detect GPU numbers (#2939)

Parent commit: a7d3b2e1
@@ -78,13 +78,7 @@ def parse_args():
 def get_device_num():
     if os.getenv("CPU_NUM"):
         return int(os.getenv("CPU_NUM"))
-    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
-    if visible_device:
-        device_num = len(visible_device.split(','))
-    else:
-        device_num = subprocess.check_output(
-            ['nvidia-smi', '-L']).decode().count('\n')
-    return device_num
+    return fluid.core.get_cuda_device_count()
 def prepare_reader(is_train, pyreader, args, pass_id=1):
...
@@ -62,18 +62,7 @@ def parse_args():
     return args
-def get_device_num():
-    import subprocess
-    visible_device = os.getenv('CUDA_VISIBLE_DEVICES')
-    if visible_device:
-        device_num = len(visible_device.split(','))
-    else:
-        device_num = subprocess.check_output(
-            ['nvidia-smi', '-L']).decode().count('\n')
-    return device_num
-DEVICE_NUM = get_device_num()
+DEVICE_NUM = fluid.core.get_cuda_device_count()
 def test_parallel(exe, test_args, args, test_reader, feeder, bs):
...
@@ -52,14 +52,9 @@ num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
 def get_device_num():
     # NOTE(zcd): for multi-processe training, each process use one GPU card.
-    if num_trainers > 1: return 1
-    visible_device = os.environ.get('CUDA_VISIBLE_DEVICES', None)
-    if visible_device:
-        device_num = len(visible_device.split(','))
-    else:
-        device_num = subprocess.check_output(
-            ['nvidia-smi', '-L']).decode().count('\n')
-    return device_num
+    if num_trainers > 1:
+        return 1
+    return fluid.core.get_cuda_device_count()
 def train():
...
@@ -51,14 +51,9 @@ num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1))
 def get_device_num():
     # NOTE(zcd): for multi-processe training, each process use one GPU card.
-    if num_trainers > 1: return 1
-    visible_device = os.environ.get('CUDA_VISIBLE_DEVICES', None)
-    if visible_device:
-        device_num = len(visible_device.split(','))
-    else:
-        device_num = subprocess.check_output(
-            ['nvidia-smi', '-L']).decode().count('\n')
-    return device_num
+    if num_trainers > 1:
+        return 1
+    return fluid.core.get_cuda_device_count()
 def train():
...
@@ -158,14 +158,9 @@ def parse_args():
 def get_device_num():
     # NOTE(zcd): for multi-processe training, each process use one GPU card.
-    if num_trainers > 1: return 1
-    visible_device = os.environ.get('CUDA_VISIBLE_DEVICES', None)
-    if visible_device:
-        device_num = len(visible_device.split(','))
-    else:
-        device_num = subprocess.check_output(
-            ['nvidia-smi', '-L']).decode().count('\n')
-    return device_num
+    if num_trainers > 1:
+        return 1
+    return fluid.core.get_cuda_device_count()
 def append_nccl2_prepare(startup_prog, trainer_id, worker_endpoints,
...
Markdown is supported
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register to comment.