提交 58994aa3 编写于 作者: Y Yang Zhang 提交者: GitHub

Revert to mixed precision training with manual control (#3434)

* Place mixed precision inside PaddleDetection

roll back to the monkey patch version as a temporary measure, before it is
merged into paddle

* Add command flag for `loss_scale`

* Fix a stupid indentation error

optimizer should be in the mixed precision context

* Initial FP16 training

* Add mixed precision training to rest of the detection models

* Revert "Add support for mixed precision training (#3406)"

This reverts commit 3a2c106271885071db7c0d85587540a8f83c24db.

* Bug fixes and some tweaks
上级 09539ce1
from __future__ import absolute_import
from .mixed_precision import *
from . import mixed_precision
__all__ = mixed_precision.__all__
from __future__ import absolute_import
from __future__ import print_function
import six
from paddle.fluid.framework import Parameter
from paddle.fluid import layers
from paddle.fluid import core
from paddle.fluid import unique_name
import paddle.fluid.layer_helper_base as lhb
import paddle.fluid.optimizer as optim
__all__ = ['mixed_precision_global_state', 'mixed_precision_context',
'StaticLossScale', 'DynamicLossScale']
_mixed_precision_global_state = None
def mixed_precision_global_state():
return _mixed_precision_global_state
class LossScale(object):
    """
    Common base of the loss scale managers.

    The base class never assigns `self.scale`; subclasses are expected to
    store the loss scale variable there, and to override `increment` and
    `decrement` if they adjust the scale.
    """

    def __init__(self):
        super(LossScale, self).__init__()

    def get_loss_scale_var(self):
        """Return whatever is stored in `self.scale`."""
        return self.scale

    def increment(self):
        """Not available on the base class."""
        raise NotImplementedError()

    def decrement(self):
        """Not available on the base class."""
        raise NotImplementedError()
class StaticLossScale(LossScale):
    """
    Loss scale manager with a fixed scale.

    The scale is kept in a persistable global float32 variable of shape [1].
    `increment` and `decrement` are inherited from `LossScale` and therefore
    raise `NotImplementedError`.

    Args:
        init_loss_scale (float): value the loss scale variable is created
            with.

    Examples:

        .. code-block:: python

            from paddle import fluid
            from ppdet.experimental import (mixed_precision_context,
                                            StaticLossScale)

            with mixed_precision_context(StaticLossScale(8.), True) as ctx:
                # ...
                # scale loss
                loss_scale = ctx.get_loss_scale_var()
    """

    def __init__(self, init_loss_scale=1.):
        super(StaticLossScale, self).__init__()
        var_name = unique_name.generate("loss_scale")
        self.scale = layers.create_global_var(
            name=var_name,
            shape=[1],
            value=init_loss_scale,
            dtype='float32',
            persistable=True)
class DynamicLossScale(LossScale):
    """
    Loss scale manager that adjusts the scale during training.

    `increment` multiplies the loss scale by `factor` once the counter of
    good steps has reached `increment_every` and the enlarged scale is still
    finite; in every other case it only advances the counter. `decrement`
    divides the loss scale by `factor`, clamps it at 1, and resets the
    counter.

    Args:
        init_loss_scale (float): initial loss scale value.
        increment_every (int): minimum number of 'good' steps before the
            loss scale is increased.
        factor (float): the loss scale is multiplied / divided by this much.

    Examples:

        .. code-block:: python

            from paddle import fluid
            from ppdet.experimental import (mixed_precision_context,
                                            DynamicLossScale)

            loss_scale = DynamicLossScale(8., 1000, 4.)
            with mixed_precision_context(loss_scale, True) as ctx:
                # ...
                # scale loss
                loss_scale = ctx.get_loss_scale_var()
    """

    def __init__(self, init_loss_scale=2**15, increment_every=2000, factor=2.):
        super(DynamicLossScale, self).__init__()
        # Current loss scale, kept as a persistable float32 variable.
        scale_name = unique_name.generate("loss_scale")
        self.scale = layers.create_global_var(
            name=scale_name,
            shape=[1],
            value=init_loss_scale,
            dtype='float32',
            persistable=True)
        # Counter of 'good' steps, starting at zero.
        counter_name = unique_name.generate("good_steps")
        self.good_steps = layers.create_global_var(
            name=counter_name,
            shape=[1],
            value=0,
            dtype='int32',
            persistable=True)
        # Threshold the counter is compared against in `increment`.
        self.increment_every = layers.fill_constant(
            shape=[1], dtype='int32', value=increment_every)
        self.factor = factor

    def increment(self):
        """Grow the loss scale if enough good steps passed, else count one."""
        next_step = self.good_steps + 1
        ready = layers.less_than(self.increment_every, next_step)

        with layers.Switch() as outer:
            with outer.case(ready):
                candidate = self.scale * self.factor
                candidate_ok = layers.isfinite(candidate)

                with layers.Switch() as inner:
                    with inner.case(candidate_ok):
                        # Accept the larger scale and restart the counter.
                        layers.assign(candidate, self.scale)
                        zero_steps = layers.zeros_like(self.good_steps)
                        layers.assign(zero_steps, self.good_steps)
                    with inner.default():
                        # Enlarged scale is not finite: keep the old one.
                        layers.increment(self.good_steps)
            with outer.default():
                layers.increment(self.good_steps)

    def decrement(self):
        """Shrink the loss scale (not below 1) and reset the step counter."""
        shrunk = self.scale / self.factor
        floor = layers.fill_constant(shape=[1], dtype='float32', value=1.0)
        below_floor = layers.less_than(shrunk, floor)

        with layers.Switch() as switch:
            with switch.case(below_floor):
                layers.assign(floor, self.scale)
            with switch.default():
                layers.assign(shrunk, self.scale)

        zero_steps = layers.zeros_like(self.good_steps)
        layers.assign(zero_steps, self.good_steps)
class mixed_precision_context(object):
    """
    Context manager for mixed precision training.

    While the context is entered (and `enabled` is true) the instance is
    published as the module-level mixed precision state, which is what
    `mixed_precision_global_state()` returns.

    Args:
        loss_scale (float, str or obj): loss scale settings, can be:
            1. a number: use a fixed loss scale.
            2. 'dynamic': use a default `DynamicLossScale`.
            3. a `DynamicLossScale` or `StaticLossScale` instance.
        enabled (bool): enable mixed precision training. When false, the
            constructor does no patching, `loss_scale` stays None, and
            entering the context yields None.

    Raises:
        AssertionError: if `loss_scale` is none of the accepted forms.

    Examples:

        .. code-block:: python

            from paddle import fluid
            from ppdet.experimental import mixed_precision_context

            with mixed_precision_context('dynamic', True) as ctx:
                # cast inputs to float16
                inputs = fluid.layers.cast(inputs, "float16")
                # build model here
                logits = model(inputs)
                # use float32 for softmax
                logits = fluid.layers.cast(logits, "float32")
                softmax = fluid.layers.softmax(logits)
                loss = fluid.layers.cross_entropy(input=softmax, label=label)
                avg_loss = fluid.layers.mean(loss)
                # scale loss
                loss_scale = ctx.get_loss_scale_var()
                avg_loss *= loss_scale
                optimizer = fluid.optimizer.Momentum(...)
                optimizer.minimize(avg_loss)
    """

    # Names forwarded to the wrapped loss scale manager by `__getattr__`.
    _DELEGATED = ('get_loss_scale_var', 'increment', 'decrement')

    def __init__(self, loss_scale=1., enabled=True):
        super(mixed_precision_context, self).__init__()
        self.enabled = enabled
        # Always define the attribute so a disabled context still answers
        # `dynamic_scaling` with False instead of failing on a missing name.
        self.loss_scale = None
        if not enabled:
            return

        monkey_patch()

        if isinstance(loss_scale, six.integer_types + (float,)):
            self.loss_scale = StaticLossScale(loss_scale)
        elif loss_scale == 'dynamic':
            self.loss_scale = DynamicLossScale()
        else:
            # Raised explicitly (rather than via `assert`) so the check is
            # not stripped under `python -O`; the exception type is kept.
            if not isinstance(loss_scale, LossScale):
                raise AssertionError("Invalid loss scale argument")
            self.loss_scale = loss_scale

    @property
    def dynamic_scaling(self):
        """True when the wrapped manager is a `DynamicLossScale`."""
        return isinstance(self.loss_scale, DynamicLossScale)

    def __getattr__(self, attr):
        # Only called when normal lookup fails. Forward the loss scale API
        # and raise for everything else, so that `hasattr`, `copy` and typos
        # behave normally instead of silently evaluating to None.
        if attr in self._DELEGATED:
            return getattr(self.loss_scale, attr)
        raise AttributeError(
            "'{}' object has no attribute '{}'".format(
                type(self).__name__, attr))

    def __enter__(self):
        if not self.enabled:
            return
        global _mixed_precision_global_state
        _mixed_precision_global_state = self
        return mixed_precision_global_state()

    def __exit__(self, *args):
        if not self.enabled:
            return
        global _mixed_precision_global_state
        _mixed_precision_global_state = None
        # The state is None at this point, so exceptions are not suppressed.
        return mixed_precision_global_state()
def create_parameter(self,
                     attr,
                     shape,
                     dtype,
                     is_bias=False,
                     default_initializer=None):
    """
    Replacement for `LayerHelperBase.create_parameter`.

    When a float16 parameter is requested while a mixed precision context is
    active, the parameter itself is created as float32 through the saved
    `_create_parameter`, and a non-persistable float16 variable named
    `<param name>.fp16`, produced by a `cast` op, is returned instead.
    In every other case the call is forwarded unchanged.
    """
    mp_state = mixed_precision_global_state()

    if isinstance(dtype, str):
        wants_half = dtype == 'float16'
    else:
        wants_half = isinstance(dtype, core.VarDesc.VarType) \
            and dtype == core.VarDesc.VarType.FP16

    keep_fp32_copy = wants_half and mp_state is not None
    param = self._create_parameter(attr, shape,
                                   'float32' if keep_fp32_copy else dtype,
                                   is_bias, default_initializer)
    if not keep_fp32_copy:
        return param

    block = self.main_program.current_block()
    half_param = block.create_var(
        name=param.name + '.fp16',
        dtype='float16',
        type=param.type,
        persistable=False)
    cast_attrs = {'in_dtype': param.dtype, 'out_dtype': half_param.dtype}
    self.append_op(
        type='cast',
        inputs={'X': [param]},
        outputs={'Out': [half_param]},
        attrs=cast_attrs)
    return half_param
def scale_gradient(block, context):
    """
    Backward callback that divides parameter gradients by the loss scale.

    Looks at the last op of `block`; for each of its output names that is a
    key of `context` and maps to a `Parameter`, an in-place
    `elementwise_div` op (marked with the Backward op role) is appended that
    divides the output by the loss scale variable. Does nothing when no
    mixed precision context is active.
    """
    state = mixed_precision_global_state()
    if state is None:
        return

    scale_var = state.get_loss_scale_var()
    block_desc = block.desc
    last_op = block_desc.op(block_desc.op_size() - 1)
    role_attr = core.op_proto_and_checker_maker.kOpRoleAttrName()
    backward_role = core.op_proto_and_checker_maker.OpRole.Backward

    grad_names = [n for n in last_op.output_arg_names() if n in context]
    for grad_name in grad_names:
        forward_var = block._var_recursive(context[grad_name])
        if isinstance(forward_var, Parameter):
            div_op = block_desc.append_op()
            div_op.set_type("elementwise_div")
            div_op.set_input("X", [grad_name])
            div_op.set_input("Y", [scale_var.name])
            div_op.set_output("Out", [grad_name])
            div_op._set_attr(role_attr, backward_role)
        # TODO verify all use cases (non-Parameter outputs are skipped)
def update_loss_scale(grads):
    """
    Adjust the dynamic loss scale according to the validity of `grads`.

    The per-gradient sums are stacked and tested with `isfinite`; the active
    state's `increment` is built for the finite case and `decrement` for the
    other one.

    Returns:
        The `isfinite` result variable, or None when no mixed precision
        context is active or it does not use dynamic scaling.
    """
    state = mixed_precision_global_state()
    if state is None:
        return None
    if not state.dynamic_scaling:
        return None

    grad_sums = []
    for grad in grads:
        grad_sums.append(layers.reduce_sum(grad))
    all_finite = layers.isfinite(layers.stack(grad_sums))

    with layers.Switch() as switch:
        with switch.case(all_finite):
            state.increment()
        with switch.default():
            state.decrement()
    return all_finite
def backward(self, loss, **kwargs):
    """
    Replacement for `Optimizer.backward`.

    Forwards to the saved `_backward` with the `callbacks` keyword filled
    in: the caller's callbacks (or `error_clip_callback` when none are
    given), preceded by `scale_gradient` while a mixed precision context is
    active. With dynamic loss scaling, all gradients are additionally
    overwritten with zeros when the validity check fails.

    Returns:
        The (parameter, gradient) pairs returned by `_backward`.
    """
    state = mixed_precision_global_state()

    callbacks = kwargs.get('callbacks')
    if not callbacks:
        from paddle.fluid.clip import error_clip_callback
        callbacks = [error_clip_callback]  # XXX what if gradient is zero?

    if state is None:
        kwargs['callbacks'] = callbacks
    else:
        kwargs['callbacks'] = [scale_gradient] + callbacks

    param_grads = self._backward(loss, **kwargs)
    if state is None:
        return param_grads

    grad_valid = update_loss_scale(grad for _, grad in param_grads)
    if not state.dynamic_scaling:
        return param_grads

    with layers.Switch() as switch:
        with switch.case(grad_valid):
            # Valid gradients are left as they are.
            pass
        with switch.default():
            for _, grad in param_grads:
                zeros = layers.zeros_like(grad)
                layers.assign(zeros, grad)
    return param_grads
# Guard flag: becomes True after `monkey_patch` has installed its hooks.
mixed_precision_patched = False


# XXX this is a temporary measure, until thoroughly evaluated
def monkey_patch():
    """
    Install the mixed precision hooks into Paddle, at most once.

    `LayerHelperBase.create_parameter` and `Optimizer.backward` are replaced
    by this module's `create_parameter` and `backward`; the originals stay
    reachable as `_create_parameter` and `_backward`.
    """
    global mixed_precision_patched
    if mixed_precision_patched:
        return

    helper_cls = lhb.LayerHelperBase
    original_create_parameter = helper_cls.create_parameter
    helper_cls.create_parameter = create_parameter
    helper_cls._create_parameter = original_create_parameter

    optimizer_cls = optim.Optimizer
    original_backward = optimizer_cls.backward
    optimizer_cls.backward = backward
    optimizer_cls._backward = original_backward

    mixed_precision_patched = True
...@@ -16,8 +16,11 @@ from __future__ import absolute_import ...@@ -16,8 +16,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from collections import OrderedDict
import paddle.fluid as fluid import paddle.fluid as fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register from ppdet.core.workspace import register
__all__ = ['CascadeMaskRCNN'] __all__ = ['CascadeMaskRCNN']
...@@ -98,9 +101,19 @@ class CascadeMaskRCNN(object): ...@@ -98,9 +101,19 @@ class CascadeMaskRCNN(object):
im_info = feed_vars['im_info'] im_info = feed_vars['im_info']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
# backbone # backbone
body_feats = self.backbone(im) body_feats = self.backbone(im)
# cast features back to FP32
if mixed_precision_enabled:
body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32'))
for k, v in body_feats.items())
# FPN # FPN
if self.fpn is not None: if self.fpn is not None:
body_feats, spatial_scale = self.fpn.get_output(body_feats) body_feats, spatial_scale = self.fpn.get_output(body_feats)
......
...@@ -16,8 +16,11 @@ from __future__ import absolute_import ...@@ -16,8 +16,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from collections import OrderedDict
import paddle.fluid as fluid import paddle.fluid as fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register from ppdet.core.workspace import register
__all__ = ['CascadeRCNN'] __all__ = ['CascadeRCNN']
...@@ -87,9 +90,19 @@ class CascadeRCNN(object): ...@@ -87,9 +90,19 @@ class CascadeRCNN(object):
gt_box = feed_vars['gt_box'] gt_box = feed_vars['gt_box']
is_crowd = feed_vars['is_crowd'] is_crowd = feed_vars['is_crowd']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
# backbone # backbone
body_feats = self.backbone(im) body_feats = self.backbone(im)
# cast features back to FP32
if mixed_precision_enabled:
body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32'))
for k, v in body_feats.items())
# FPN # FPN
if self.fpn is not None: if self.fpn is not None:
body_feats, spatial_scale = self.fpn.get_output(body_feats) body_feats, spatial_scale = self.fpn.get_output(body_feats)
......
...@@ -16,8 +16,11 @@ from __future__ import absolute_import ...@@ -16,8 +16,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from collections import OrderedDict
from paddle import fluid from paddle import fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register from ppdet.core.workspace import register
__all__ = ['FasterRCNN'] __all__ = ['FasterRCNN']
...@@ -67,9 +70,21 @@ class FasterRCNN(object): ...@@ -67,9 +70,21 @@ class FasterRCNN(object):
is_crowd = feed_vars['is_crowd'] is_crowd = feed_vars['is_crowd']
else: else:
im_shape = feed_vars['im_shape'] im_shape = feed_vars['im_shape']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
body_feats = self.backbone(im) body_feats = self.backbone(im)
body_feat_names = list(body_feats.keys()) body_feat_names = list(body_feats.keys())
# cast features back to FP32
if mixed_precision_enabled:
body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32'))
for k, v in body_feats.items())
if self.fpn is not None: if self.fpn is not None:
body_feats, spatial_scale = self.fpn.get_output(body_feats) body_feats, spatial_scale = self.fpn.get_output(body_feats)
......
...@@ -16,7 +16,11 @@ from __future__ import absolute_import ...@@ -16,7 +16,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from paddle import fluid from collections import OrderedDict
import paddle.fluid as fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register from ppdet.core.workspace import register
__all__ = ['MaskRCNN'] __all__ = ['MaskRCNN']
...@@ -79,8 +83,19 @@ class MaskRCNN(object): ...@@ -79,8 +83,19 @@ class MaskRCNN(object):
"{} has no {} field".format(feed_vars, var) "{} has no {} field".format(feed_vars, var)
im_info = feed_vars['im_info'] im_info = feed_vars['im_info']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
# backbone
body_feats = self.backbone(im) body_feats = self.backbone(im)
# cast features back to FP32
if mixed_precision_enabled:
body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32'))
for k, v in body_feats.items())
# FPN # FPN
if self.fpn is not None: if self.fpn is not None:
body_feats, spatial_scale = self.fpn.get_output(body_feats) body_feats, spatial_scale = self.fpn.get_output(body_feats)
......
...@@ -16,8 +16,11 @@ from __future__ import absolute_import ...@@ -16,8 +16,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from collections import OrderedDict
import paddle.fluid as fluid import paddle.fluid as fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register from ppdet.core.workspace import register
__all__ = ['RetinaNet'] __all__ = ['RetinaNet']
...@@ -50,9 +53,20 @@ class RetinaNet(object): ...@@ -50,9 +53,20 @@ class RetinaNet(object):
gt_box = feed_vars['gt_box'] gt_box = feed_vars['gt_box']
gt_label = feed_vars['gt_label'] gt_label = feed_vars['gt_label']
is_crowd = feed_vars['is_crowd'] is_crowd = feed_vars['is_crowd']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
# backbone # backbone
body_feats = self.backbone(im) body_feats = self.backbone(im)
# cast features back to FP32
if mixed_precision_enabled:
body_feats = OrderedDict((k, fluid.layers.cast(v, 'float32'))
for k, v in body_feats.items())
# FPN # FPN
body_feats, spatial_scale = self.fpn.get_output(body_feats) body_feats, spatial_scale = self.fpn.get_output(body_feats)
......
...@@ -16,8 +16,11 @@ from __future__ import absolute_import ...@@ -16,8 +16,11 @@ from __future__ import absolute_import
from __future__ import division from __future__ import division
from __future__ import print_function from __future__ import print_function
from paddle import fluid from collections import OrderedDict
import paddle.fluid as fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register from ppdet.core.workspace import register
from ppdet.modeling.ops import SSDOutputDecoder from ppdet.modeling.ops import SSDOutputDecoder
...@@ -59,7 +62,22 @@ class SSD(object): ...@@ -59,7 +62,22 @@ class SSD(object):
gt_box = feed_vars['gt_box'] gt_box = feed_vars['gt_box']
gt_label = feed_vars['gt_label'] gt_label = feed_vars['gt_label']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
# backbone
body_feats = self.backbone(im) body_feats = self.backbone(im)
if isinstance(body_feats, OrderedDict):
body_feat_names = list(body_feats.keys())
body_feats = [body_feats[name] for name in body_feat_names]
# cast features back to FP32
if mixed_precision_enabled:
body_feats = [fluid.layers.cast(v, 'float32') for v in body_feats]
locs, confs, box, box_var = self.multi_box_head( locs, confs, box, box_var = self.multi_box_head(
inputs=body_feats, image=im, num_classes=self.num_classes) inputs=body_feats, image=im, num_classes=self.num_classes)
......
...@@ -18,6 +18,9 @@ from __future__ import print_function ...@@ -18,6 +18,9 @@ from __future__ import print_function
from collections import OrderedDict from collections import OrderedDict
from paddle import fluid
from ppdet.experimental import mixed_precision_global_state
from ppdet.core.workspace import register from ppdet.core.workspace import register
__all__ = ['YOLOv3'] __all__ = ['YOLOv3']
...@@ -43,12 +46,23 @@ class YOLOv3(object): ...@@ -43,12 +46,23 @@ class YOLOv3(object):
def build(self, feed_vars, mode='train'): def build(self, feed_vars, mode='train'):
im = feed_vars['image'] im = feed_vars['image']
mixed_precision_enabled = mixed_precision_global_state() is not None
# cast inputs to FP16
if mixed_precision_enabled:
im = fluid.layers.cast(im, 'float16')
body_feats = self.backbone(im) body_feats = self.backbone(im)
if isinstance(body_feats, OrderedDict): if isinstance(body_feats, OrderedDict):
body_feat_names = list(body_feats.keys()) body_feat_names = list(body_feats.keys())
body_feats = [body_feats[name] for name in body_feat_names] body_feats = [body_feats[name] for name in body_feat_names]
# cast features back to FP32
if mixed_precision_enabled:
body_feats = [fluid.layers.cast(v, 'float32') for v in body_feats]
if mode == 'train': if mode == 'train':
gt_box = feed_vars['gt_box'] gt_box = feed_vars['gt_box']
gt_label = feed_vars['gt_label'] gt_label = feed_vars['gt_label']
......
...@@ -36,7 +36,8 @@ set_paddle_flags( ...@@ -36,7 +36,8 @@ set_paddle_flags(
) )
from paddle import fluid from paddle import fluid
from paddle.fluid.contrib import mixed_precision
from ppdet.experimental import mixed_precision_context
from ppdet.core.workspace import load_config, merge_config, create from ppdet.core.workspace import load_config, merge_config, create
from ppdet.data.data_feed import create_reader from ppdet.data.data_feed import create_reader
...@@ -115,16 +116,18 @@ def main(): ...@@ -115,16 +116,18 @@ def main():
with fluid.unique_name.guard(): with fluid.unique_name.guard():
model = create(main_arch) model = create(main_arch)
train_pyreader, feed_vars = create_feed(train_feed) train_pyreader, feed_vars = create_feed(train_feed)
train_fetches = model.train(feed_vars)
loss = train_fetches['loss'] with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx:
lr = lr_builder() train_fetches = model.train(feed_vars)
optimizer = optim_builder(lr)
if FLAGS.fp16: loss = train_fetches['loss']
optimizer = mixed_precision.decorate( if FLAGS.fp16:
optimizer=optimizer, loss *= ctx.get_loss_scale_var()
init_loss_scaling=FLAGS.loss_scale, lr = lr_builder()
use_dynamic_loss_scaling=False) optimizer = optim_builder(lr)
optimizer.minimize(loss) optimizer.minimize(loss)
if FLAGS.fp16:
loss /= ctx.get_loss_scale_var()
# parse train fetches # parse train fetches
train_keys, train_values, _ = parse_fetches(train_fetches) train_keys, train_values, _ = parse_fetches(train_fetches)
...@@ -154,8 +157,6 @@ def main(): ...@@ -154,8 +157,6 @@ def main():
# compile program for multi-devices # compile program for multi-devices
build_strategy = fluid.BuildStrategy() build_strategy = fluid.BuildStrategy()
build_strategy.fuse_all_optimizer_ops = False build_strategy.fuse_all_optimizer_ops = False
if FLAGS.fp16:
build_strategy.fuse_all_reduce_ops = False
# only enable sync_bn in multi GPU devices # only enable sync_bn in multi GPU devices
sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn'
build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \
...@@ -280,6 +281,12 @@ def main(): ...@@ -280,6 +281,12 @@ def main():
if __name__ == '__main__': if __name__ == '__main__':
parser = ArgsParser() parser = ArgsParser()
parser.add_argument(
"-r",
"--resume_checkpoint",
default=None,
type=str,
help="Checkpoint path for resuming training.")
parser.add_argument( parser.add_argument(
"--fp16", "--fp16",
action='store_true', action='store_true',
...@@ -290,12 +297,6 @@ if __name__ == '__main__': ...@@ -290,12 +297,6 @@ if __name__ == '__main__':
default=8., default=8.,
type=float, type=float,
help="Mixed precision training loss scale.") help="Mixed precision training loss scale.")
parser.add_argument(
"-r",
"--resume_checkpoint",
default=None,
type=str,
help="Checkpoint path for resuming training.")
parser.add_argument( parser.add_argument(
"--eval", "--eval",
action='store_true', action='store_true',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册