未验证 提交 8d7908f3 编写于 作者: D danleifeng 提交者: GitHub

【paddle.fleet】raise error when using multi-cards in fleet non_distributed mode (#27854)

* raise error if use multi-cards in fleet non_distributed mode; test=develop
上级 4a4f7736
...@@ -186,6 +186,15 @@ class Fleet(object): ...@@ -186,6 +186,15 @@ class Fleet(object):
fleet.util._set_role_maker(self._role_maker) fleet.util._set_role_maker(self._role_maker)
self.strategy_compiler = StrategyCompiler() self.strategy_compiler = StrategyCompiler()
if self._role_maker._is_non_distributed() and self._is_collective:
if paddle.fluid.core.is_compiled_with_cuda():
gpus_num = paddle.fluid.core.get_cuda_device_count()
if gpus_num != 1:
raise ValueError(
"CUDA_VISIBLE_DEVICES shoule be set only 1 card if you use `python` to launch fleet program."
)
if paddle.fluid.framework.in_dygraph_mode(): if paddle.fluid.framework.in_dygraph_mode():
if self.worker_num() == 1: if self.worker_num() == 1:
return return
...@@ -568,8 +577,6 @@ class Fleet(object): ...@@ -568,8 +577,6 @@ class Fleet(object):
""" """
self.user_defined_optimizer = optimizer self.user_defined_optimizer = optimizer
if paddle.fluid.framework.in_dygraph_mode():
return self
if strategy == None: if strategy == None:
strategy = DistributedStrategy() strategy = DistributedStrategy()
......
...@@ -129,6 +129,8 @@ if (NOT ${WITH_GPU}) ...@@ -129,6 +129,8 @@ if (NOT ${WITH_GPU})
LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer)
LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm)
LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision)
LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single)
elseif(${CUDNN_VERSION} VERSION_LESS 7100) elseif(${CUDNN_VERSION} VERSION_LESS 7100)
LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
endif() endif()
......
...@@ -171,45 +171,7 @@ class TestFleetDygraph(unittest.TestCase): ...@@ -171,45 +171,7 @@ class TestFleetDygraph(unittest.TestCase):
final_strategy = fleet._final_strategy() final_strategy = fleet._final_strategy()
class LinearNet(nn.Layer): class TestFleetBaseSingleError(unittest.TestCase):
def __init__(self):
super(LinearNet, self).__init__()
self._linear1 = nn.Linear(10, 10)
self._linear2 = nn.Linear(10, 1)
def forward(self, x):
return self._linear2(self._linear1(x))
class TestFleetDygraphSingle(unittest.TestCase):
def setUp(self):
os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
os.environ["PADDLE_TRAINERS_NUM"] = "1"
os.environ["PADDLE_TRAINER_ID"] = "0"
def test_dygraph_single(self):
paddle.disable_static()
fleet.init(is_collective=True)
layer = LinearNet()
loss_fn = nn.MSELoss()
adam = paddle.optimizer.Adam(
learning_rate=0.001, parameters=layer.parameters())
adam = fleet.distributed_optimizer(adam)
dp_layer = fleet.distributed_model(layer)
for step in range(2):
inputs = paddle.randn([10, 10], 'float32')
outputs = dp_layer(inputs)
labels = paddle.randn([10, 1], 'float32')
loss = loss_fn(outputs, labels)
loss.backward()
adam.step()
adam.clear_grad()
class TestFleetBaseSingleRunCollective(unittest.TestCase):
def setUp(self): def setUp(self):
os.environ.pop("PADDLE_TRAINER_ENDPOINTS") os.environ.pop("PADDLE_TRAINER_ENDPOINTS")
...@@ -221,71 +183,23 @@ class TestFleetBaseSingleRunCollective(unittest.TestCase): ...@@ -221,71 +183,23 @@ class TestFleetBaseSingleRunCollective(unittest.TestCase):
} }
def test_single_run_collective_minimize(self): def test_single_run_collective_minimize(self):
input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32') def test_single_error():
input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64') input_x = paddle.static.data(
name="x", shape=[-1, 32], dtype='float32')
fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh') input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y) fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
avg_cost = paddle.mean(x=cost) prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
fleet.init(is_collective=True) avg_cost = paddle.mean(x=cost)
optimizer = fluid.optimizer.SGD(learning_rate=0.001) fleet.init(is_collective=True)
optimizer = fleet.distributed_optimizer(optimizer)
optimizer.minimize(avg_cost) # in non_distributed mode(use `python` to launch), raise error if has multi cards
if fluid.core.is_compiled_with_cuda(
place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda( ) and fluid.core.get_cuda_device_count() > 1:
) else fluid.CPUPlace() self.assertRaises(ValueError, test_single_error)
else:
exe = fluid.Executor(place) test_single_error()
exe.run(paddle.static.default_startup_program())
for i in range(10):
cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name])
print("cost of step[{}] = {}".format(i, cost_val))
class TestFleetBaseSingleRunPS(unittest.TestCase):
def setUp(self):
os.environ.pop("PADDLE_PSERVERS_IP_PORT_LIST")
def gen_data(self):
return {
"x": np.random.random(size=(128, 32)).astype('float32'),
"y": np.random.randint(
2, size=(128, 1)).astype('int64')
}
def test_single_run_ps_minimize(self):
input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
avg_cost = paddle.mean(x=cost)
fleet.init()
strategy = paddle.distributed.fleet.DistributedStrategy()
optimizer = fluid.optimizer.SGD(learning_rate=0.01)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
optimizer.minimize(avg_cost)
if fleet.is_server():
fleet.init_server()
fleet.run_server()
elif fleet.is_worker():
place = fluid.CPUPlace()
exe = fluid.Executor(place)
exe.run(paddle.static.default_startup_program())
step = 100
for i in range(step):
cost_val = exe.run(program=fluid.default_main_program(),
feed=self.gen_data(),
fetch_list=[avg_cost.name])
print("worker_index: %d, step%d cost = %f" %
(fleet.worker_index(), i, cost_val[0]))
fleet.save_persistables(exe, "fleet_single_model/")
print("save fleet models done.")
if __name__ == "__main__": if __name__ == "__main__":
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import os
# Restrict this process to a single visible GPU. This runs before the
# `import paddle` that follows so the setting is already in the environment
# when paddle is loaded: keep only the first entry of CUDA_VISIBLE_DEVICES,
# or fall back to device '0' when the variable is unset or empty.
cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
os.environ['CUDA_VISIBLE_DEVICES'] = (
    cuda_visible_devices.split(',')[0] if cuda_visible_devices else '0')
import paddle
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import paddle.fluid as fluid
import unittest
import paddle.nn as nn
class LinearNet(nn.Layer):
    """Two stacked linear layers (10 -> 10 -> 1) used as the model under test."""

    def __init__(self):
        super(LinearNet, self).__init__()
        self._linear1 = nn.Linear(10, 10)
        self._linear2 = nn.Linear(10, 1)

    def forward(self, x):
        """Feed ``x`` through the first linear layer, then the second."""
        hidden = self._linear1(x)
        return self._linear2(hidden)
class TestFleetDygraphSingle(unittest.TestCase):
    """Train a small model through fleet in dygraph mode with one trainer.

    The test overrides several ``PADDLE_*`` environment variables and turns
    static-graph mode off. Both are process-global, so ``tearDown`` puts them
    back to keep the other test cases in this module independent of the order
    in which the runner executes them.
    """

    # Environment describing a single trainer listening on localhost.
    _TRAINER_ENV = {
        "PADDLE_TRAINER_ENDPOINTS": "127.0.0.1:36213",
        "PADDLE_CURRENT_ENDPOINTS": "127.0.0.1:36213",
        "PADDLE_TRAINERS_NUM": "1",
        "PADDLE_TRAINER_ID": "0",
    }

    def setUp(self):
        """Install the single-trainer environment, remembering prior state."""
        self._saved_env = {
            key: os.environ.get(key)
            for key in self._TRAINER_ENV
        }
        self._started_in_dygraph = fluid.framework.in_dygraph_mode()
        os.environ.update(self._TRAINER_ENV)

    def tearDown(self):
        """Undo the global changes made by ``setUp`` and the test body."""
        # Only switch back to static mode if that is where we started.
        if not self._started_in_dygraph:
            paddle.enable_static()
        for key, value in self._saved_env.items():
            if value is None:
                os.environ.pop(key, None)
            else:
                os.environ[key] = value

    def test_dygraph_single(self):
        """Run two optimisation steps through the fleet-wrapped model."""
        paddle.disable_static()
        fleet.init(is_collective=True)

        layer = LinearNet()
        loss_fn = nn.MSELoss()
        adam = paddle.optimizer.Adam(
            learning_rate=0.001, parameters=layer.parameters())

        # Wrap both the optimizer and the model with their fleet counterparts.
        adam = fleet.distributed_optimizer(adam)
        dp_layer = fleet.distributed_model(layer)

        for _ in range(2):
            inputs = paddle.randn([10, 10], 'float32')
            outputs = dp_layer(inputs)
            labels = paddle.randn([10, 1], 'float32')
            loss = loss_fn(outputs, labels)
            # NOTE(review): scale_loss / apply_collective_grads are the
            # data-parallel hooks around backward(); presumably no-ops with a
            # single trainer -- confirm against the DataParallel docs.
            loss = dp_layer.scale_loss(loss)
            loss.backward()
            dp_layer.apply_collective_grads()
            adam.step()
            adam.clear_grad()
class TestFleetBaseSingleRunCollective(unittest.TestCase):
    """Build and run a static-graph classifier after a collective fleet init."""

    def setUp(self):
        """No per-test preparation is required."""
        pass

    def gen_data(self):
        """Return one random feed dict: 128 float32 rows and 0/1 labels."""
        features = np.random.random(size=(128, 32)).astype('float32')
        labels = np.random.randint(2, size=(128, 1)).astype('int64')
        return {"x": features, "y": labels}

    def test_single_run_collective_minimize(self):
        """Minimize the mean cross-entropy for ten steps and print the cost."""
        # Network: x -> fc(64, tanh) -> fc(2, softmax) -> cross entropy.
        input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
        input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
        hidden = fluid.layers.fc(input=input_x, size=64, act='tanh')
        prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')
        cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
        avg_cost = paddle.mean(x=cost)

        fleet.init(is_collective=True)
        sgd = fluid.optimizer.SGD(learning_rate=0.001)
        dist_optimizer = fleet.distributed_optimizer(sgd)
        dist_optimizer.minimize(avg_cost)

        # Run on GPU 0 when paddle was built with CUDA, otherwise on the CPU.
        if paddle.fluid.is_compiled_with_cuda():
            place = fluid.CUDAPlace(0)
        else:
            place = fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(paddle.static.default_startup_program())
        for i in range(10):
            cost_val = exe.run(feed=self.gen_data(), fetch_list=[avg_cost.name])
            print("cost of step[{}] = {}".format(i, cost_val))
class TestFleetBaseSingleRunPS(unittest.TestCase):
    """Build and run a static-graph classifier after a default fleet init.

    NOTE(review): "PS" in the class name presumably stands for
    parameter-server mode; the code itself only shows ``fleet.init()`` being
    called without ``is_collective``.
    """

    def setUp(self):
        """No per-test preparation is required."""
        pass

    def gen_data(self):
        """Return one random feed dict: 128 float32 rows and 0/1 labels."""
        features = np.random.random(size=(128, 32)).astype('float32')
        labels = np.random.randint(2, size=(128, 1)).astype('int64')
        return {"x": features, "y": labels}

    def test_single_run_ps_minimize(self):
        """Minimize the loss, then act as server or worker as fleet reports."""
        # Network: x -> fc(64, tanh) -> fc(2, softmax) -> cross entropy.
        input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
        input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
        hidden = fluid.layers.fc(input=input_x, size=64, act='tanh')
        prediction = fluid.layers.fc(input=hidden, size=2, act='softmax')
        cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
        avg_cost = paddle.mean(x=cost)

        fleet.init()
        strategy = paddle.distributed.fleet.DistributedStrategy()
        sgd = fluid.optimizer.SGD(learning_rate=0.01)
        dist_optimizer = fleet.distributed_optimizer(sgd, strategy=strategy)
        dist_optimizer.minimize(avg_cost)

        # Server role: start serving and do nothing else in this test.
        if fleet.is_server():
            fleet.init_server()
            fleet.run_server()
            return
        if not fleet.is_worker():
            return

        # Worker role: train on the CPU for ten steps, printing each cost.
        exe = fluid.Executor(fluid.CPUPlace())
        exe.run(paddle.static.default_startup_program())
        for i in range(10):
            cost_val = exe.run(program=fluid.default_main_program(),
                               feed=self.gen_data(),
                               fetch_list=[avg_cost.name])
            print("worker_index: %d, step%d cost = %f" %
                  (fleet.worker_index(), i, cost_val[0]))
# Run every test case in this module when the file is executed as a script.
if __name__ == "__main__":
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册