未验证 提交 f8dbd7a2 编写于 作者: C ceci3 提交者: GitHub

fix syncbn, test=develop (#26523)

* fix syncbn, test=develop

* fix test,test=develop

* fix unittest,test=develop
上级 dd3df693
...@@ -36,7 +36,7 @@ __all__ = [ ...@@ -36,7 +36,7 @@ __all__ = [
'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding', 'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu', 'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu',
'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm', 'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm',
'SpectralNorm', 'TreeConv', 'Flatten', 'SyncBatchNorm' 'SpectralNorm', 'TreeConv', 'Flatten'
] ]
...@@ -3203,220 +3203,6 @@ class TreeConv(layers.Layer): ...@@ -3203,220 +3203,6 @@ class TreeConv(layers.Layer):
return self._helper.append_activation(pre_activation, act=self._act) return self._helper.append_activation(pre_activation, act=self._act)
class SyncBatchNorm(layers.Layer):
    r"""
    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the Cross-GPU Synchronized Batch Normalization layer, and can
    be used as a normalizer function for other operations, such as conv2d and fully
    connected operations.
    The data is normalized by the mean and variance of each channel computed over the
    whole mini-batch, which includes the data on all GPUs.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When the model is in training mode, :math:`\mu_{\beta}` and
    :math:`\sigma_{\beta}^{2}` are the statistics of the whole mini-batch on all GPUs.
    They are calculated as follows:

    .. math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    - :math:`x` : whole mini-batch data on all GPUs
    - :math:`m` : the size of the whole mini-batch data

    When the model is in evaluation mode, :math:`\mu_{\beta}` and
    :math:`\sigma_{\beta}^{2}` are the global statistics (moving_mean and
    moving_variance, which are usually obtained from a pre-trained model).
    The global statistics are calculated as follows:

    .. math::

        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global\ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global\ variance \\

    The formula of normalization is as follows:

    .. math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter vector
    - :math:`\beta` : trainable shift parameter vector

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var
            computation. Default: 0.9.
        track_running_stats(bool, optional): Whether to compute global stats, which
            include running mean and running variance. The moving statistics are
            currently computed regardless of this flag; a warning is emitted when it
            is set to False. Default: True.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter
            `scale` of this layer. If it is set to None or one attribute of ParamAttr,
            this layer will create the scale with that attribute, initialized to 1.0
            unless the attribute sets its own Initializer. If it is set to False, the
            scale is created with value 1.0 and excluded from gradient computation.
            Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of
            this layer. If it is set to None or one attribute of ParamAttr, this layer
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. If it is set to False, the bias
            is created with value 0.0 and excluded from gradient computation.
            Default: None.
        data_format(str, optional): Layout string forwarded to the operator as its
            ``data_layout`` attribute. Default: 'NCHW'.
        name(str, optional): Accepted for API compatibility; not used by this layer.
            Default: None.

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle
          import paddle.nn as nn
          import numpy as np

          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
          paddle.disable_static()
          x = paddle.to_tensor(x)
          if paddle.fluid.is_compiled_with_cuda():
              sync_batch_norm = nn.SyncBatchNorm(2)
              hidden1 = sync_batch_norm(x)
              print(hidden1.numpy())
              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
    """

    def __init__(self,
                 num_features,
                 epsilon=1e-05,
                 momentum=0.9,
                 track_running_stats=True,
                 weight_attr=None,
                 bias_attr=None,
                 data_format='NCHW',
                 name=None):
        super(SyncBatchNorm, self).__init__()
        self._weight_attr = weight_attr
        self._bias_attr = bias_attr
        self._num_features = num_features
        self._data_layout = data_format
        self._momentum = momentum
        self._epsilon = epsilon
        self._track_running_stats = track_running_stats

        # `== False` is deliberate here and below: None means "use the
        # default" and must not be treated as "disabled".
        if self._track_running_stats == False:
            # logging.warning replaces the deprecated logging.warn alias.
            logging.warning(
                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
            )

        param_shape = [self._num_features]

        # Scale (gamma): always created so the operator has an input; it is
        # frozen when explicitly disabled or when its learning rate is zero.
        if weight_attr == False:
            self.weight = self.create_parameter(
                attr=None, shape=param_shape, default_initializer=Constant(1.0))
            self.weight.stop_gradient = True
        else:
            self.weight = self.create_parameter(
                attr=self._weight_attr,
                shape=param_shape,
                default_initializer=Constant(1.0))
            self.weight.stop_gradient = self._has_zero_learning_rate(
                self._weight_attr)

        # Shift (beta): same policy as the scale, but driven by bias_attr.
        if bias_attr == False:
            self.bias = self.create_parameter(
                attr=None,
                shape=param_shape,
                default_initializer=Constant(0.0),
                is_bias=True)
            self.bias.stop_gradient = True
        else:
            self.bias = self.create_parameter(
                attr=self._bias_attr, shape=param_shape, is_bias=True)
            # Fixed: this previously inspected `_weight_attr`, so a frozen
            # bias (learning_rate == 0) was only honoured by accident.
            self.bias.stop_gradient = self._has_zero_learning_rate(
                self._bias_attr)

        # Moving statistics: non-trainable parameters updated by the operator.
        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=None,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=True),
            shape=param_shape,
            dtype=self._dtype)
        self._mean.stop_gradient = True

        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=None,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=True),
            shape=param_shape,
            dtype=self._dtype)
        self._variance.stop_gradient = True

    @staticmethod
    def _has_zero_learning_rate(attr):
        """Return True when ``attr`` is given and its learning rate is zero."""
        return attr is not None and attr.learning_rate == 0.

    def forward(self, x):
        """Apply the ``sync_batch_norm`` operator to ``x`` and return its output."""
        # MeanOut / VarianceOut are bound to the very same variables as the
        # Mean / Variance inputs, so the operator's updates land in them.
        mean_out = self._mean
        variance_out = self._variance

        # `is_test` follows the layer's train/eval flag. `use_global_stats`
        # is pinned to False to agree with the paddle.nn implementation of
        # this layer, which notes that sync_batch_norm only supports False.
        if in_dygraph_mode():
            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
                     "is_test", not self.training, "data_layout",
                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
                     False, "use_global_stats", False,
                     'trainable_statistics', False)
            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
                x, self.weight, self.bias, self._mean, self._variance, mean_out,
                variance_out, *attrs)
            return sync_batch_norm_out

        # Static-graph path: validate the input, then append the operator.
        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
                                 'SyncBatchNorm')

        attrs = {
            "momentum": self._momentum,
            "epsilon": self._epsilon,
            "is_test": not self.training,
            "data_layout": self._data_layout,
            "use_mkldnn": False,
            "fuse_with_relu": False,
            "use_global_stats": False,
            "trainable_statistics": False,
        }

        inputs = {
            "X": [x],
            "Scale": [self.weight],
            "Bias": [self.bias],
            "Mean": [self._mean],
            "Variance": [self._variance]
        }

        saved_mean = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        saved_variance = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
            self._dtype)

        outputs = {
            "Y": [sync_batch_norm_out],
            "MeanOut": [mean_out],
            "VarianceOut": [variance_out],
            "SavedMean": [saved_mean],
            "SavedVariance": [saved_variance]
        }

        self._helper.append_op(
            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
        return sync_batch_norm_out
class Flatten(layers.Layer): class Flatten(layers.Layer):
""" """
:alias_main: paddle.nn.Flatten :alias_main: paddle.nn.Flatten
......
...@@ -287,7 +287,7 @@ class TestLayer(LayerTest): ...@@ -287,7 +287,7 @@ class TestLayer(LayerTest):
if core.is_compiled_with_cuda(): if core.is_compiled_with_cuda():
with self.static_graph(): with self.static_graph():
t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32') t = layers.data(name='t', shape=[-1, 3, 5, 5], dtype='float32')
my_sync_bn = nn.SyncBatchNorm(3) my_sync_bn = paddle.nn.SyncBatchNorm(3)
ret = my_sync_bn(t) ret = my_sync_bn(t)
static_ret = self.get_static_graph_result( static_ret = self.get_static_graph_result(
feed={'t': np.ones( feed={'t': np.ones(
......
...@@ -25,7 +25,7 @@ class TestParallelDygraphMnist(TestDistBase): ...@@ -25,7 +25,7 @@ class TestParallelDygraphMnist(TestDistBase):
def _setup_config(self): def _setup_config(self):
self._sync_mode = False self._sync_mode = False
self._nccl2_mode = True self._nccl2_mode = True
self._dygraph = False #True self._dygraph = True
def test_mnist(self): def test_mnist(self):
if fluid.core.is_compiled_with_cuda(): if fluid.core.is_compiled_with_cuda():
......
...@@ -22,6 +22,7 @@ import unittest ...@@ -22,6 +22,7 @@ import unittest
import numpy as np import numpy as np
import os import os
import six import six
import paddle
import paddle.fluid.core as core import paddle.fluid.core as core
import paddle.fluid as fluid import paddle.fluid as fluid
from paddle.fluid import compiler from paddle.fluid import compiler
...@@ -209,7 +210,7 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase): ...@@ -209,7 +210,7 @@ class TestDygraphSyncBatchNormAPIError(unittest.TestCase):
return return
with program_guard(Program(), Program()): with program_guard(Program(), Program()):
my_sync_batch_norm = fluid.dygraph.SyncBatchNorm(10) my_sync_batch_norm = paddle.nn.SyncBatchNorm(10)
x1 = fluid.create_lod_tensor( x1 = fluid.create_lod_tensor(
np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0)) np.array([-1, 3, 5, 5]), [[1, 1, 1, 1]], fluid.CUDAPlace(0))
self.assertRaises(TypeError, my_sync_batch_norm, x1) self.assertRaises(TypeError, my_sync_batch_norm, x1)
......
...@@ -14,15 +14,239 @@ ...@@ -14,15 +14,239 @@
# TODO: define normalization api # TODO: define normalization api
import warnings
from ...fluid.dygraph.nn import InstanceNorm from ...fluid.dygraph.nn import InstanceNorm
from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS from ...fluid.dygraph import BatchNorm #DEFINE_ALIAS
from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS from ...fluid.dygraph import GroupNorm #DEFINE_ALIAS
from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS from ...fluid.dygraph import LayerNorm #DEFINE_ALIAS
from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS from ...fluid.dygraph import SpectralNorm #DEFINE_ALIAS
from ...fluid.dygraph import SyncBatchNorm #DEFINE_ALIAS
from ...fluid.dygraph import layers
from ...fluid.framework import in_dygraph_mode
from ...fluid.initializer import Constant
from ...fluid.param_attr import ParamAttr
from ...fluid.data_feeder import check_variable_and_dtype, check_type
from ...fluid import core
__all__ = [ __all__ = [
'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm', 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'InstanceNorm',
'SyncBatchNorm' 'SyncBatchNorm'
] ]
class SyncBatchNorm(layers.Layer):
    r"""
    This interface is used to construct a callable object of the ``SyncBatchNorm`` class.
    It implements the Cross-GPU Synchronized Batch Normalization layer, and can
    be used as a normalizer function for other operations, such as conv2d and fully
    connected operations.
    The data is normalized by the mean and variance of each channel computed over the
    whole mini-batch, which includes the data on all GPUs.
    Refer to `Batch Normalization: Accelerating Deep Network Training by Reducing
    Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    When the model is in training mode, :math:`\mu_{\beta}` and
    :math:`\sigma_{\beta}^{2}` are the statistics of the whole mini-batch on all GPUs.
    They are calculated as follows:

    .. math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad &//\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i - \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\

    - :math:`x` : whole mini-batch data on all GPUs
    - :math:`m` : the size of the whole mini-batch data

    When the model is in evaluation mode, :math:`\mu_{\beta}` and
    :math:`\sigma_{\beta}^{2}` are the global statistics (moving_mean and
    moving_variance, which are usually obtained from a pre-trained model).
    The global statistics are calculated as follows:

    .. math::

        moving\_mean = moving\_mean * momentum + \mu_{\beta} * (1. - momentum) \quad &// global\ mean \\
        moving\_variance = moving\_variance * momentum + \sigma_{\beta}^{2} * (1. - momentum) \quad &// global\ variance \\

    The formula of normalization is as follows:

    .. math::

        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{\sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    - :math:`\epsilon` : a small value added to the variance to prevent division by zero
    - :math:`\gamma` : trainable scale parameter vector
    - :math:`\beta` : trainable shift parameter vector

    Parameters:
        num_features(int): Indicate the number of channels of the input ``Tensor``.
        epsilon(float, optional): The small value added to the variance to prevent
            division by zero. Default: 1e-5.
        momentum(float, optional): The value used for the moving_mean and moving_var
            computation. Default: 0.9.
        track_running_stats(bool, optional): Whether to compute global stats, which
            include running mean and running variance. The moving statistics are
            currently computed regardless of this flag; a warning is emitted when it
            is set to False. Default: True.
        weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter
            `scale` of this layer. If it is set to None or one attribute of ParamAttr,
            this layer will create the scale with that attribute, initialized to 1.0
            unless the attribute sets its own Initializer. If it is set to False, the
            scale is created with value 1.0 and excluded from gradient computation.
            Default: None.
        bias_attr(ParamAttr|bool, optional): The parameter attribute for the bias of
            this layer. If it is set to None or one attribute of ParamAttr, this layer
            will create ParamAttr as bias_attr. If the Initializer of the bias_attr
            is not set, the bias is initialized zero. If it is set to False, the bias
            is created with value 0.0 and excluded from gradient computation.
            Default: None.
        data_format(str, optional): Layout string forwarded to the operator as its
            ``data_layout`` attribute. Default: 'NCHW'.
        name(str, optional): Accepted for API compatibility; not used by this layer.
            Default: None.

    Shapes:
        input: Tensor whose number of dimensions is from 2 to 5.
        output: Tensor with the same shape as input.

    Examples:
        .. code-block:: python

          import paddle
          import paddle.nn as nn
          import numpy as np

          x = np.array([[[[0.3, 0.4], [0.3, 0.07]], [[0.83, 0.37], [0.18, 0.93]]]]).astype('float32')
          paddle.disable_static()
          x = paddle.to_tensor(x)
          if paddle.fluid.is_compiled_with_cuda():
              sync_batch_norm = nn.SyncBatchNorm(2)
              hidden1 = sync_batch_norm(x)
              print(hidden1.numpy())
              # [[[[0.26824948, 1.0936325],[0.26824948, -1.6301316]],[[ 0.8095662, -0.665287],[-1.2744656, 1.1301866 ]]]]
    """

    def __init__(self,
                 num_features,
                 epsilon=1e-05,
                 momentum=0.9,
                 track_running_stats=True,
                 weight_attr=None,
                 bias_attr=None,
                 data_format='NCHW',
                 name=None):
        super(SyncBatchNorm, self).__init__()
        self._weight_attr = weight_attr
        self._bias_attr = bias_attr
        self._num_features = num_features
        self._data_layout = data_format
        self._momentum = momentum
        self._epsilon = epsilon
        self._track_running_stats = track_running_stats

        # `== False` is deliberate here and below: None means "use the
        # default" and must not be treated as "disabled".
        if self._track_running_stats == False:
            warnings.warn(
                "moving mean and moving variance will be calculated whether `track_running_stats` is set to `True` or `False`, we will fix it in the next version."
            )

        param_shape = [self._num_features]

        # Scale (gamma): always created so the operator has an input; it is
        # frozen when explicitly disabled or when its learning rate is zero.
        if weight_attr == False:
            self.weight = self.create_parameter(
                attr=None, shape=param_shape, default_initializer=Constant(1.0))
            self.weight.stop_gradient = True
        else:
            self.weight = self.create_parameter(
                attr=self._weight_attr,
                shape=param_shape,
                default_initializer=Constant(1.0))
            self.weight.stop_gradient = self._has_zero_learning_rate(
                self._weight_attr)

        # Shift (beta): same policy as the scale, but driven by bias_attr.
        if bias_attr == False:
            self.bias = self.create_parameter(
                attr=None,
                shape=param_shape,
                default_initializer=Constant(0.0),
                is_bias=True)
            self.bias.stop_gradient = True
        else:
            self.bias = self.create_parameter(
                attr=self._bias_attr, shape=param_shape, is_bias=True)
            # Fixed: this previously inspected `_weight_attr`, so a frozen
            # bias (learning_rate == 0) was only honoured by accident.
            self.bias.stop_gradient = self._has_zero_learning_rate(
                self._bias_attr)

        # Moving statistics: non-trainable parameters updated by the operator.
        self._mean = self.create_parameter(
            attr=ParamAttr(
                name=None,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=True),
            shape=param_shape,
            dtype=self._dtype)
        self._mean.stop_gradient = True

        self._variance = self.create_parameter(
            attr=ParamAttr(
                name=None,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=True),
            shape=param_shape,
            dtype=self._dtype)
        self._variance.stop_gradient = True

    @staticmethod
    def _has_zero_learning_rate(attr):
        """Return True when ``attr`` is given and its learning rate is zero."""
        return attr is not None and attr.learning_rate == 0.

    def forward(self, x):
        """Apply the ``sync_batch_norm`` operator to ``x`` and return its output."""
        # MeanOut / VarianceOut are bound to the very same variables as the
        # Mean / Variance inputs, so the operator's updates land in them.
        mean_out = self._mean
        variance_out = self._variance

        # train mode: use mini-batch stats, eval mode: use global stats
        # (selected through `is_test`); use_global_stats only supports False
        # in sync_batch_norm.
        if in_dygraph_mode():
            attrs = ("momentum", self._momentum, "epsilon", self._epsilon,
                     "is_test", not self.training, "data_layout",
                     self._data_layout, "use_mkldnn", False, "fuse_with_relu",
                     False, "use_global_stats", False, 'trainable_statistics',
                     False)
            sync_batch_norm_out, _, _, _, _, _ = core.ops.sync_batch_norm(
                x, self.weight, self.bias, self._mean, self._variance, mean_out,
                variance_out, *attrs)
            return sync_batch_norm_out

        # Static-graph path: validate the input, then append the operator.
        # The op name is reported as SyncBatchNorm so errors point at this layer.
        check_variable_and_dtype(x, 'input', ['float16', 'float32', 'float64'],
                                 'SyncBatchNorm')

        attrs = {
            "momentum": self._momentum,
            "epsilon": self._epsilon,
            "is_test": not self.training,
            "data_layout": self._data_layout,
            "use_mkldnn": False,
            "fuse_with_relu": False,
            "use_global_stats": False,
            "trainable_statistics": False,
        }

        inputs = {
            "X": [x],
            "Scale": [self.weight],
            "Bias": [self.bias],
            "Mean": [self._mean],
            "Variance": [self._variance]
        }

        saved_mean = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        saved_variance = self._helper.create_variable_for_type_inference(
            dtype=self._dtype, stop_gradient=True)
        sync_batch_norm_out = self._helper.create_variable_for_type_inference(
            self._dtype)

        outputs = {
            "Y": [sync_batch_norm_out],
            "MeanOut": [mean_out],
            "VarianceOut": [variance_out],
            "SavedMean": [saved_mean],
            "SavedVariance": [saved_variance]
        }

        self._helper.append_op(
            type="sync_batch_norm", inputs=inputs, outputs=outputs, attrs=attrs)
        return sync_batch_norm_out
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册