Unverified commit 1aafc31b authored by Jackwaterveg, committed by GitHub

[Cherry-pick] to Release/2.3, Improve MSRAInitializer (#43721)

* fix conflict

* improve the doc
Parent 4dcfc6df
@@ -679,20 +679,23 @@ class MSRAInitializer(Initializer):
.. math::
    x = gain \times \sqrt{\frac{3}{fan\_in}}
In case of Normal distribution, the mean is 0 and the standard deviation
is
.. math::
    \frac{gain}{\sqrt{fan\_in}}
Args:
uniform (bool): whether to use uniform or normal distribution
fan_in (float32|None): fan_in (in_features) of the trainable Tensor. If None, it\
will be inferred automatically from the Tensor. If you do not want to use the\
Tensor's in_features, you can set 'fan_in' explicitly yourself. Default is None.
seed (int32): random seed
negative_slope (float, optional): the negative slope of the rectifier; only used when nonlinearity is 'leaky_relu'. Default is 0.0.
nonlinearity (str, optional): the non-linear activation function, e.g. 'relu' or 'leaky_relu'. Default is 'relu'.
Note:
It is recommended to set fan_in to None for most cases.
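As a quick reference (not part of the diff), here is a minimal sketch of how the new gain-based bounds work out numerically. The `calculate_gain` stand-in below is an assumption that follows the usual Kaiming convention (sqrt(2) for relu, sqrt(2/(1+a^2)) for leaky_relu); the real helper lives in Paddle and is only illustrated here.

```python
import math

def calculate_gain(nonlinearity, negative_slope=0.0):
    # Simplified stand-in for illustration only; assumes the usual Kaiming gains.
    if nonlinearity == 'relu':
        return math.sqrt(2.0)
    if nonlinearity == 'leaky_relu':
        return math.sqrt(2.0 / (1.0 + negative_slope ** 2))
    return 1.0  # 'linear' and other identity-like cases

fan_in = 64.0
gain = calculate_gain('relu')
limit = gain * math.sqrt(3.0 / fan_in)  # uniform case: samples from U(-limit, limit)
std = gain / math.sqrt(fan_in)          # normal case: samples from N(0, std**2)
print(round(limit, 4), round(std, 4))   # 0.3062 0.1768
```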
@@ -709,7 +712,12 @@ class MSRAInitializer(Initializer):
""" """
def __init__(self,
uniform=True,
fan_in=None,
seed=0,
negative_slope=0,
nonlinearity='relu'):
"""Constructor for MSRAInitializer """Constructor for MSRAInitializer
""" """
assert uniform is not None assert uniform is not None
@@ -718,6 +726,8 @@ class MSRAInitializer(Initializer):
self._uniform = uniform
self._fan_in = fan_in
self._seed = seed
self._negative_slope = negative_slope
self._nonlinearity = nonlinearity
def __call__(self, var, block=None):
"""Initialize the input tensor with MSRA initialization.
@@ -759,13 +769,16 @@ class MSRAInitializer(Initializer):
if framework._non_static_mode():
if self._uniform:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
limit = gain * math.sqrt(3.0 / float(fan_in))
out_var = _C_ops.uniform_random('shape', out_var.shape, 'min',
-limit, 'max', limit, 'seed',
self._seed, 'dtype',
int(out_dtype))
else:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
std = gain / math.sqrt(float(fan_in))
if in_dygraph_mode():
place = _current_expected_place()
out_var = _C_ops.final_state_gaussian_random(
@@ -786,33 +799,33 @@ class MSRAInitializer(Initializer):
return None
else:
if self._uniform:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
limit = gain * math.sqrt(3.0 / float(fan_in))
op = block.append_op(type="uniform_random",
inputs={},
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": int(out_dtype),
"min": -limit,
"max": limit,
"seed": self._seed
},
stop_gradient=True)
else:
gain = calculate_gain(self._nonlinearity, self._negative_slope)
std = gain / math.sqrt(float(fan_in))
op = block.append_op(type="gaussian_random",
outputs={"Out": out_var},
attrs={
"shape": out_var.shape,
"dtype": int(out_dtype),
"mean": 0.0,
"std": std,
"seed": self._seed
},
stop_gradient=True)
if var.dtype == VarDesc.VarType.FP16 or (
var.dtype == VarDesc.VarType.BF16 and not self._uniform):
...
@@ -33,11 +33,14 @@ class KaimingNormal(MSRAInitializer):
.. math::
    \frac{gain}{\sqrt{fan\_in}}
Args:
fan_in (float32|None): fan_in (in_features) of the trainable Tensor. If None, it\
will be inferred automatically from the Tensor. If you do not want to use the\
Tensor's in_features, you can set 'fan_in' explicitly yourself. Default is None.
negative_slope (float, optional): the negative slope of the rectifier; only used when nonlinearity is 'leaky_relu'. Default is 0.0.
nonlinearity (str, optional): the non-linear activation function, e.g. 'relu' or 'leaky_relu'. Default is 'relu'.
Note:
It is recommended to set fan_in to None for most cases.
@@ -56,9 +59,12 @@ class KaimingNormal(MSRAInitializer):
""" """
def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
super(KaimingNormal, self).__init__(uniform=False,
fan_in=fan_in,
seed=0,
negative_slope=negative_slope,
nonlinearity=nonlinearity)
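For context, a hedged usage sketch of the extended KaimingNormal constructor; it assumes the initializer is exposed as paddle.nn.initializer.KaimingNormal, as in current releases, and that the new arguments behave as shown in this diff.

```python
import paddle

# Kaiming normal init tuned for a leaky_relu activation with slope 0.1;
# fan_in is left as None so it is inferred from the parameter's shape.
weight_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.KaimingNormal(
        negative_slope=0.1, nonlinearity='leaky_relu'))
linear = paddle.nn.Linear(in_features=4, out_features=8, weight_attr=weight_attr)
```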
class KaimingUniform(MSRAInitializer):
@@ -75,11 +81,14 @@ class KaimingUniform(MSRAInitializer):
.. math::
    x = gain \times \sqrt{\frac{3}{fan\_in}}
Args:
fan_in (float32|None): fan_in (in_features) of the trainable Tensor. If None, it\
will be inferred automatically from the Tensor. If you do not want to use the\
Tensor's in_features, you can set 'fan_in' explicitly yourself. Default is None.
negative_slope (float, optional): the negative slope of the rectifier; only used when nonlinearity is 'leaky_relu'. Default is 0.0.
nonlinearity (str, optional): the non-linear activation function, e.g. 'relu' or 'leaky_relu'. Default is 'relu'.
Note:
It is recommended to set fan_in to None for most cases.
@@ -98,6 +107,9 @@ class KaimingUniform(MSRAInitializer):
""" """
def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
super(KaimingUniform, self).__init__(uniform=True,
fan_in=fan_in,
seed=0,
negative_slope=negative_slope,
nonlinearity=nonlinearity)
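Similarly, a minimal usage sketch for KaimingUniform with the new arguments, under the same assumptions as the KaimingNormal example above.

```python
import paddle

# Default relu gain; the uniform bound becomes gain * sqrt(3 / fan_in).
weight_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.KaimingUniform(nonlinearity='relu'))
conv = paddle.nn.Conv2D(in_channels=3, out_channels=16, kernel_size=3,
                        weight_attr=weight_attr)
```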