Unverified commit 86cb3fb8, authored by Ming-Xu Huang and committed by GitHub

Distributed Automatic SParsity with Fleet (#33558)

Parent 1e5437de
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
// Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
......@@ -189,6 +190,7 @@ message DistributedStrategy {
optional bool without_graph_optimization = 30 [ default = false ];
optional int32 fuse_grad_size_in_num = 31 [ default = 1 ];
optional bool calc_comm_same_stream = 32 [ default = false ];
optional bool asp = 33 [ default = false ];
optional RecomputeConfig recompute_configs = 101;
optional AMPConfig amp_configs = 102;
......
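For orientation, this new proto field is what the Python-side DistributedStrategy.asp property (defined below) reads and writes; a minimal sketch of driving it from user code, assuming the standard fleet import path:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.asp = True  # serialized into the new `optional bool asp = 33` field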
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -446,6 +447,31 @@ class DistributedStrategy(object):
check_configs_key(self.strategy.amp_configs, configs, "amp_configs")
assign_configs_value(self.strategy.amp_configs, configs)
@property
def asp(self):
    """
    Indicates whether automatic sparsity (ASP) training is enabled.
    Default Value: False

    Examples:
      .. code-block:: python

        import paddle.distributed.fleet as fleet
        strategy = fleet.DistributedStrategy()
        strategy.asp = True # by default this is False
    """
    return self.strategy.asp
@asp.setter
@is_strict_auto
def asp(self, flag):
    if isinstance(flag, bool):
        self.strategy.asp = flag
    else:
        print("WARNING: asp should have value of bool type")
@property
def recompute(self):
"""
......
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -12,6 +13,7 @@
# See the License for the specific language governing permissions and
from .amp_optimizer import AMPOptimizer
from .asp_optimizer import ASPOptimizer
from .recompute_optimizer import RecomputeOptimizer
from .gradient_merge_optimizer import GradientMergeOptimizer
from .graph_execution_optimizer import GraphExecutionOptimizer
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from paddle.fluid.contrib.sparsity.asp import ASPHelper
from .meta_optimizer_base import MetaOptimizerBase
__all__ = []
class ASPOptimizer(MetaOptimizerBase):
    def __init__(self, optimizer):
        super(ASPOptimizer, self).__init__(optimizer)
        self.inner_opt = optimizer
        # we do not allow meta optimizer to be inner optimizer currently
        self.meta_optimizers_white_list = [
            "AMPOptimizer", "LarsOptimizer", "LambOptimizer",
            "GraphExecutionOptimizer", "RecomputeOptimizer",
            "GradientMergeOptimizer"
        ]
        self.meta_optimizers_black_list = []

    def _set_basic_info(self, loss, role_maker, user_defined_optimizer,
                        user_defined_strategy):
        super(ASPOptimizer, self)._set_basic_info(
            loss, role_maker, user_defined_optimizer, user_defined_strategy)

    def _can_apply(self):
        if not self.role_maker._is_collective:
            return False

        if self.user_defined_strategy.asp:
            return True

        return False

    def _disable_strategy(self, dist_strategy):
        dist_strategy.asp = False

    def _enable_strategy(self, dist_strategy, context):
        dist_strategy.asp = True

    def minimize_impl(self,
                      loss,
                      startup_program=None,
                      parameter_list=None,
                      no_grad_set=None):
        optimize_ops, params_grads = ASPHelper._minimize(
            self.inner_opt,
            loss,
            startup_program=startup_program,
            parameter_list=parameter_list,
            no_grad_set=no_grad_set)

        return optimize_ops, params_grads
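Taken together, a user never instantiates ASPOptimizer directly: setting strategy.asp = True makes _can_apply() return True, fleet selects this meta optimizer, and minimize_impl delegates to ASPHelper._minimize. A condensed end-to-end sketch assembled from the tests later in this diff (single-process collective setup assumed; the tests below export the PADDLE_* endpoint variables first):

import paddle
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

train_prog, startup_prog = fluid.Program(), fluid.Program()
with fluid.program_guard(train_prog, startup_prog):
    x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
    y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')
    prediction = fluid.layers.fc(input=x, size=2, act='softmax')
    avg_cost = paddle.mean(x=fluid.layers.cross_entropy(input=prediction, label=y))

    strategy = fleet.DistributedStrategy()
    strategy.asp = True  # makes ASPOptimizer._can_apply() return True

    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    # fleet wraps SGD in ASPOptimizer, whose minimize_impl calls
    # ASPHelper._minimize to insert the sparse-mask bookkeeping.
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)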
......@@ -64,12 +64,15 @@ def decorate(optimizer):
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
from paddle.fluid.contrib import sparsity
main_program = fluid.Program()
startup_program = fluid.Program()
paddle.enable_static()
with fluid.program_guard(main_program, startup_program):
    input_data = fluid.layers.data(name='data', shape=[None, 128])
    label = fluid.layers.data(name='label', shape=[None, 10])
......@@ -78,17 +81,13 @@ def decorate(optimizer):
    loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))

    optimizer = fluid.optimizer.SGD(learning_rate=0.1)
    optimizer = sparsity.decorate(optimizer)
    # if doing sparse training with Fleet, please replace the decorate call above with:
    # strategy = paddle.distributed.fleet.DistributedStrategy()
    # strategy.asp = True
    # optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(loss, startup_program)
"""
return ASPHelper.decorate(optimizer)
......@@ -126,23 +125,38 @@ def prune_model(place,
Examples:
.. code-block:: python
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
from paddle.fluid.contrib import sparsity
paddle.enable_static()
main_program = fluid.Program()
startup_program = fluid.Program()
place = paddle.CPUPlace()
if core.is_compiled_with_cuda():
    place = paddle.CUDAPlace(0)

with fluid.program_guard(main_program, startup_program):
    input_data = fluid.layers.data(name='data', shape=[None, 128])
    label = fluid.layers.data(name='label', shape=[None, 10])
    hidden = fluid.layers.fc(input=input_data, num_flatten_dims=-1, size=32, act=None, name="need_sparse")
    hidden = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=32, act=None, name="need_dense")
    prob = fluid.layers.fc(input=hidden, num_flatten_dims=-1, size=10, act=None)
    loss = fluid.layers.mean(fluid.layers.square_error_cost(prob, label))

    # Set up layers excluded from the ASP workflow.
    # Please note, excluded_layers must be set before calling `optimizer.minimize()`.
    sparsity.set_excluded_layers(main_program, ["need_dense"])

    optimizer = fluid.optimizer.SGD(learning_rate=0.1)
    optimizer = fluid.contrib.mixed_precision.decorator.decorate(optimizer)
    # Calling sparsity.decorate() to wrap minimize() in optimizer, which
    # will insert necessary masking operations for ASP workflow.
    optimizer = sparsity.decorate(optimizer)
    optimizer.minimize(loss, startup_program)

exe = fluid.Executor(place)
exe.run(startup_program)
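# Finally, prune the initialized weights into the n:m sparse pattern.
# (The remainder of this docstring is elided in the diff; this call mirrors
# the usage exercised by the tests below, e.g. sparsity.prune_model(place, train_prog).)
sparsity.prune_model(place, main_program)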
......
file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py")
string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}")
list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp")
list(REMOVE_ITEM TEST_OPS "test_fleet_with_asp_amp")
foreach(TEST_OP ${TEST_OPS})
py_test_modules(${TEST_OP} MODULES ${TEST_OP})
endforeach(TEST_OP)
if(WITH_DISTRIBUTE)
py_test_modules(test_fleet_with_asp MODULES test_fleet_with_asp ENVS ${dist_ENVS})
py_test_modules(test_fleet_with_asp_amp MODULES test_fleet_with_asp_amp ENVS ${dist_ENVS})
endif()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import os
from paddle.fluid.contrib import sparsity
from paddle.fluid.contrib.sparsity.asp import ASPHelper
import numpy as np
cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
if cuda_visible_devices is None or cuda_visible_devices == "":
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0]
paddle.enable_static()
class TestFleetWithASP(unittest.TestCase):
    def setUp(self):
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
        os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
        os.environ["PADDLE_TRAINERS_NUM"] = "1"
        os.environ["PADDLE_TRAINER_ID"] = "0"

    def net(self, main_prog, startup_prog):
        with fluid.program_guard(main_prog, startup_prog):
            input_x = paddle.static.data(
                name="x", shape=[-1, 32], dtype='float32')
            input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

            fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
            cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
            avg_cost = paddle.mean(x=cost)

            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.asp = True

        return avg_cost, strategy, input_x, input_y

    def test_with_asp(self):
        fleet.init(is_collective=True)
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy, input_x, input_y = self.net(train_prog,
                                                        startup_prog)

        with fluid.program_guard(train_prog, startup_prog):
            optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01)
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy)
            optimizer.minimize(avg_cost)

        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
        ) else fluid.CPUPlace()

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
        exe.run(startup_prog)

        sparsity.prune_model(place, train_prog)

        data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
        exe.run(train_prog, feed=feeder.feed([data]))

        for param in train_prog.global_block().all_parameters():
            if ASPHelper._is_supported_layer(train_prog, param.name):
                mat = np.array(fluid.global_scope().find_var(param.name)
                               .get_tensor())
                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))


if __name__ == "__main__":
    unittest.main()
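The assertions above hinge on sparsity.check_sparsity(mat.T, n=2, m=4): in every group of m=4 consecutive elements along the checked rows, at most n=2 are non-zero. A hypothetical standalone checker (check_2_to_4 is illustrative only, not Paddle API) that makes the predicate concrete:

import numpy as np

def check_2_to_4(mat, n=2, m=4):
    # zero-pad columns to a multiple of m, then count non-zeros per group of m
    rows, cols = mat.shape
    pad = (-cols) % m
    padded = np.pad(mat, ((0, 0), (0, pad)))
    groups = padded.reshape(rows, -1, m)
    return bool(np.all((groups != 0).sum(axis=-1) <= n))

dense = np.ones((4, 8))
assert not check_2_to_4(dense)   # fully dense rows violate 2:4
pruned = dense.copy()
pruned[:, [0, 1, 4, 5]] = 0      # keep 2 of every 4 entries per row
assert check_2_to_4(pruned)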
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2021 NVIDIA Corporation. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker
import unittest
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import os
from paddle.fluid.contrib import sparsity
from paddle.fluid.contrib.sparsity.asp import ASPHelper
import numpy as np
cuda_visible_devices = os.getenv('CUDA_VISIBLE_DEVICES')
if cuda_visible_devices is None or cuda_visible_devices == "":
    os.environ['CUDA_VISIBLE_DEVICES'] = '0'
else:
    os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices.split(',')[0]
paddle.enable_static()
class TestFleetWithASP(unittest.TestCase):
    def setUp(self):
        os.environ["PADDLE_TRAINER_ENDPOINTS"] = "127.0.0.1:36213"
        os.environ["PADDLE_CURRENT_ENDPOINTS"] = "127.0.0.1:36213"
        os.environ["PADDLE_TRAINERS_NUM"] = "1"
        os.environ["PADDLE_TRAINER_ID"] = "0"

    def net(self, main_prog, startup_prog):
        with fluid.program_guard(main_prog, startup_prog):
            input_x = paddle.static.data(
                name="x", shape=[-1, 32], dtype='float32')
            input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

            fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
            prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
            cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
            avg_cost = paddle.mean(x=cost)

            strategy = paddle.distributed.fleet.DistributedStrategy()
            strategy.asp = True

        return avg_cost, strategy, input_x, input_y

    def test_with_asp_and_amp(self):
        fleet.init(is_collective=True)
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        avg_cost, strategy, input_x, input_y = self.net(train_prog,
                                                        startup_prog)
        strategy.amp = True

        with fluid.program_guard(train_prog, startup_prog):
            optimizer = paddle.optimizer.SGD(learning_rate=0.01)
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy)
            optimizer.minimize(avg_cost)

        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
        ) else fluid.CPUPlace()

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
        exe.run(startup_prog)

        optimizer.amp_init(place)

        sparsity.prune_model(place, train_prog)

        data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
        exe.run(train_prog, feed=feeder.feed([data]))

        for param in train_prog.global_block().all_parameters():
            if ASPHelper._is_supported_layer(train_prog, param.name):
                mat = np.array(fluid.global_scope().find_var(param.name)
                               .get_tensor())
                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))

    def test_with_asp_and_pure_fp16(self):
        fleet.init(is_collective=True)
        train_prog, startup_prog = fluid.Program(), fluid.Program()
        with paddle.static.amp.fp16_guard():
            avg_cost, strategy, \
                input_x, input_y = self.net(train_prog, startup_prog)
        strategy.amp = True
        strategy.amp_configs = {'use_pure_fp16': True}

        with fluid.program_guard(train_prog, startup_prog):
            with paddle.static.amp.fp16_guard():
                optimizer = paddle.optimizer.Momentum(
                    learning_rate=0.01, multi_precision=True)
                optimizer = fleet.distributed_optimizer(
                    optimizer, strategy=strategy)
                optimizer.minimize(avg_cost)

        place = fluid.CUDAPlace(0) if paddle.fluid.is_compiled_with_cuda(
        ) else fluid.CPUPlace()

        exe = fluid.Executor(place)
        feeder = fluid.DataFeeder(feed_list=[input_x, input_y], place=place)
        exe.run(startup_prog)

        optimizer.amp_init(place)

        sparsity.prune_model(place, train_prog)

        data = (np.random.randn(64, 32), np.random.randint(2, size=(64, 1)))
        exe.run(train_prog, feed=feeder.feed([data]))

        for param in train_prog.global_block().all_parameters():
            if ASPHelper._is_supported_layer(train_prog, param.name):
                mat = np.array(fluid.global_scope().find_var(param.name)
                               .get_tensor())
                self.assertTrue(sparsity.check_sparsity(mat.T, n=2, m=4))


if __name__ == "__main__":
    unittest.main()