[Cherry-pick]to Release/2.3, Improve MSRAInitializer (#43721)

* fix conflict * improve the doc

[Cherry-pick]to Release/2.3, Improve MSRAInitializer (#43721)
* fix conflict * improve the doc
1aafc31b · Jackwaterveg · GitHub · 4dcfc6df · 1aafc31b · 1aafc31b
显示空白变更内容
内联并排

Showing with 69 additions and 44 deletions

python/paddle/fluid/initializer.py python/paddle/fluid/initializer.py +45 -32

python/paddle/nn/initializer/kaiming.py python/paddle/nn/initializer/kaiming.py +24 -12

未找到文件。
--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -679,20 +679,23 @@ class MSRAInitializer(Initializer):

    .. math::

-        x = \sqrt{\\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}

    In case of Normal distribution, the mean is 0 and the standard deviation
    is

    .. math::

-        \sqrt{\\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{{fan\_in}}}

    Args:
        uniform (bool): whether to use uniform or normal distribution
-        fan_in (float32|None): fan_in for MSRAInitializer. If None, it is\
-        inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+        If None, it will be inferred automatically. If you don't want to use in_features of the Tensor,\
+        you can set the value of 'fan_in' explicitly. default is None.
        seed (int32): random seed
+        negative_slope (float, optional): negative slope of the rectifier (only used with leaky_relu). default is 0.0.
+        nonlinearity (str, optional): name of the non-linear function used to compute the gain. default is 'relu'.

    Note:
        It is recommended to set fan_in to None for most cases.
@@ -709,7 +712,12 @@ class MSRAInitializer(Initializer):

    """

-    def __init__(self, uniform=True, fan_in=None, seed=0):
+    def __init__(self,
+                 uniform=True,
+                 fan_in=None,
+                 seed=0,
+                 negative_slope=0,
+                 nonlinearity='relu'):
        """Constructor for MSRAInitializer
        """
        assert uniform is not None
@@ -718,6 +726,8 @@ class MSRAInitializer(Initializer):
        self._uniform = uniform
        self._fan_in = fan_in
        self._seed = seed
+        self._negative_slope = negative_slope
+        self._nonlinearity = nonlinearity

    def __call__(self, var, block=None):
        """Initialize the input tensor with MSRA initialization.
@@ -759,13 +769,16 @@ class MSRAInitializer(Initializer):

        if framework._non_static_mode():
            if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                limit = gain * math.sqrt(3.0 / float(fan_in))
+
                out_var = _C_ops.uniform_random('shape', out_var.shape, 'min',
                                                -limit, 'max', limit, 'seed',
                                                self._seed, 'dtype',
                                                int(out_dtype))
            else:
-                std = math.sqrt(2.0 / float(fan_in))
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                std = gain / math.sqrt(float(fan_in))
                if in_dygraph_mode():
                    place = _current_expected_place()
                    out_var = _C_ops.final_state_gaussian_random(
@@ -786,9 +799,9 @@ class MSRAInitializer(Initializer):
            return None
        else:
            if self._uniform:
-                limit = np.sqrt(6.0 / float(fan_in))
-                op = block.append_op(
-                    type="uniform_random",
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                limit = gain * math.sqrt(3.0 / float(fan_in))
+                op = block.append_op(type="uniform_random",
                                     inputs={},
                                     outputs={"Out": out_var},
                                     attrs={
@@ -801,9 +814,9 @@ class MSRAInitializer(Initializer):
                                     stop_gradient=True)

            else:
-                std = np.sqrt(2.0 / float(fan_in))
-                op = block.append_op(
-                    type="gaussian_random",
+                gain = calculate_gain(self._nonlinearity, self._negative_slope)
+                std = gain / math.sqrt(float(fan_in))
+                op = block.append_op(type="gaussian_random",
                                     outputs={"Out": out_var},
                                     attrs={
                                         "shape": out_var.shape,

--- a/python/paddle/nn/initializer/kaiming.py
+++ b/python/paddle/nn/initializer/kaiming.py
@@ -33,11 +33,14 @@ class KaimingNormal(MSRAInitializer):

    .. math::

-        \sqrt{\frac{2.0}{fan\_in}}
+        \frac{gain}{\sqrt{{fan\_in}}}

    Args:
-        fan_in (float32|None): fan_in for Kaiming normal Initializer. If None, it is\
-        inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+        If None, it will be inferred automatically. If you don't want to use in_features of the Tensor,\
+        you can set the value of 'fan_in' explicitly. default is None.
+        negative_slope (float, optional): negative slope of the rectifier (only used with leaky_relu). default is 0.0.
+        nonlinearity (str, optional): name of the non-linear function used to compute the gain. default is 'relu'.

    Note:
        It is recommended to set fan_in to None for most cases.
@@ -56,9 +59,12 @@ class KaimingNormal(MSRAInitializer):

    """

-    def __init__(self, fan_in=None):
-        super(KaimingNormal, self).__init__(
-            uniform=False, fan_in=fan_in, seed=0)
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+        super(KaimingNormal, self).__init__(uniform=False,
+                                            fan_in=fan_in,
+                                            seed=0,
+                                            negative_slope=negative_slope,
+                                            nonlinearity=nonlinearity)


 class KaimingUniform(MSRAInitializer):
@@ -75,11 +81,14 @@ class KaimingUniform(MSRAInitializer):

    .. math::

-        x = \sqrt{\frac{6.0}{fan\_in}}
+        x = gain \times \sqrt{\frac{3}{fan\_in}}

    Args:
-        fan_in (float32|None): fan_in for Kaiming uniform Initializer. If None, it is\
-        inferred from the variable. default is None.
+        fan_in (float32|None): fan_in (in_features) of the trainable Tensor.\
+        If None, it will be inferred automatically. If you don't want to use in_features of the Tensor,\
+        you can set the value of 'fan_in' explicitly. default is None.
+        negative_slope (float, optional): negative slope of the rectifier (only used with leaky_relu). default is 0.0.
+        nonlinearity (str, optional): name of the non-linear function used to compute the gain. default is 'relu'.

    Note:
        It is recommended to set fan_in to None for most cases.
@@ -98,6 +107,9 @@ class KaimingUniform(MSRAInitializer):

    """

-    def __init__(self, fan_in=None):
-        super(KaimingUniform, self).__init__(
-            uniform=True, fan_in=fan_in, seed=0)
+    def __init__(self, fan_in=None, negative_slope=0.0, nonlinearity='relu'):
+        super(KaimingUniform, self).__init__(uniform=True,
+                                             fan_in=fan_in,
+                                             seed=0,
+                                             negative_slope=negative_slope,
+                                             nonlinearity=nonlinearity)