Unverified · Commit 62ad3594 · authored by zhouweiwei2014, committed by GitHub

add new API paddle.nn.initializer.Orthogonal and calculate_gain (#37163)

* add new API paddle.nn.initializer.Orthogonal and calculate_gain

* fix comment

* fix comment
Parent af83e79a
@@ -14,6 +14,7 @@
from __future__ import print_function
import math
from . import framework
from . import core
from .framework import in_dygraph_mode, default_main_program
@@ -1033,6 +1034,52 @@ def _global_bias_initializer():
    return _global_bias_initializer_
def calculate_gain(nonlinearity, param=None):
"""
Get the recommended gain value for some nonlinearity functions.
Args:
    nonlinearity(str): name of the nonlinearity function.
    param(bool|int|float, optional): optional parameter for some nonlinearity functions. Currently it only applies to 'leaky_relu'.
        Default: None, in which case 0.01 is used in the formula.
Returns:
    The recommended gain value for the nonlinearity function.
Examples:
.. code-block:: python
import paddle
gain = paddle.nn.initializer.calculate_gain('tanh') # 5.0 / 3
gain = paddle.nn.initializer.calculate_gain('leaky_relu', param=1.0) # 1.0 = math.sqrt(2.0 / (1+param^2))
"""
if param is None:
param = 0.01
else:
assert isinstance(param, (bool, int, float))
param = float(param)
recommended_gain = {
'sigmoid': 1,
'linear': 1,
'conv1d': 1,
'conv2d': 1,
'conv3d': 1,
'conv_transpose1d': 1,
'conv_transpose2d': 1,
'conv_transpose3d': 1,
'tanh': 5.0 / 3,
'relu': math.sqrt(2.0),
'leaky_relu': math.sqrt(2.0 / (1 + param**2)),
'selu': 3.0 / 4
}
if nonlinearity in recommended_gain.keys():
return recommended_gain[nonlinearity]
else:
raise ValueError("nonlinearity function {} is not suppported now.".
format(nonlinearity))
# We short the class name, since users will use the initializer with the package
# name. The sample code:
#
...
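A minimal usage sketch combining the two new APIs (illustrative shapes; assumes a Paddle build that already contains this change):

import paddle

# the recommended gain for ReLU is sqrt(2.0)
gain = paddle.nn.initializer.calculate_gain('relu')
weight_attr = paddle.ParamAttr(
    initializer=paddle.nn.initializer.Orthogonal(gain=gain))
linear = paddle.nn.Linear(10, 15, weight_attr=weight_attr)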
@@ -15,6 +15,7 @@
from __future__ import print_function
import numpy as np
import math
import unittest
import paddle
@@ -41,6 +42,17 @@ def output_hist(out):
class TestConstantInitializer(unittest.TestCase):
def test_calculate_gain(self):
self.assertEqual(paddle.nn.initializer.calculate_gain('sigmoid'), 1)
self.assertEqual(paddle.nn.initializer.calculate_gain('linear'), 1)
self.assertEqual(paddle.nn.initializer.calculate_gain('conv2d'), 1)
self.assertEqual(paddle.nn.initializer.calculate_gain('tanh'), 5.0 / 3)
self.assertEqual(
paddle.nn.initializer.calculate_gain('relu'), math.sqrt(2.0))
self.assertEqual(
paddle.nn.initializer.calculate_gain('leaky_relu', 1), 1)
self.assertEqual(paddle.nn.initializer.calculate_gain('selu'), 3.0 / 4)
def test_constant_initializer_default_value(self, dtype="float32"):
    """Test the constant initializer with default value
    """
@@ -716,5 +728,192 @@ class TesetconsistencyOfDynamicAndStaticGraph(unittest.TestCase):
    self.assertTrue(np.array_equal(dynamic_res[1], static_res[1]))
# 2-D Parameter with shape: [10, 15]
class TestOrthogonalInitializer1(unittest.TestCase):
"""
case 1
"""
def config(self):
self.weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.Orthogonal(gain=3.0))
self.dtype = "float64"
self.in_features = 10
self.out_features = 15
self.num_ops = 9
def check_result(self, a, b):
self.assertTrue(np.array_equal(a, b))
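# with gain=3, the rows of the [10, 15] weight are orthonormal before scaling, so W @ W.T == gain**2 * I = 9 * I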
self.assertTrue(np.allclose(np.matmul(a, a.T), 9 * np.eye(10)))
def test_orthogonal(self):
self.config()
paddle.set_default_dtype(self.dtype)
paddle.disable_static()
paddle.seed(2021)
linear = paddle.nn.Linear(
self.in_features, self.out_features, weight_attr=self.weight_attr)
res_dygraph = linear.weight.numpy()
paddle.enable_static()
paddle.seed(2021)
start_prog = paddle.static.Program()
main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, start_prog):
linear = paddle.nn.Linear(
self.in_features,
self.out_features,
weight_attr=self.weight_attr)
block = start_prog.global_block()
self.assertEqual(len(block.ops), self.num_ops)
self.assertEqual(block.ops[0].type, 'gaussian_random')
self.assertEqual(block.ops[1].type, 'qr')
self.assertEqual(block.ops[2].type, 'diag_v2')
self.assertEqual(block.ops[3].type, 'sign')
self.assertEqual(block.ops[4].type, 'elementwise_mul')
self.assertEqual(block.ops[-3].type, 'reshape2')
self.assertEqual(block.ops[-2].type, 'scale')
exe = paddle.static.Executor()
res_static = exe.run(start_prog, fetch_list=[linear.weight])[0]
self.check_result(res_dygraph, res_static)
# 2-D Parameter with shape: [15, 10]
class TestOrthogonalInitializer2(TestOrthogonalInitializer1):
"""
case 2
"""
def config(self):
self.weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.Orthogonal(gain=2.0))
self.dtype = "float64"
self.in_features = 15
self.out_features = 10
self.num_ops = 8
def check_result(self, a, b):
self.assertTrue(np.array_equal(a, b))
self.assertTrue(np.allclose(np.matmul(a.T, a), 4 * np.eye(10)))
# 2-D Parameter with shape: [10, 10]
class TestOrthogonalInitializer3(TestOrthogonalInitializer1):
"""
case 3
"""
def config(self):
self.weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.Orthogonal())
self.dtype = "float32"
self.in_features = 10
self.out_features = 10
self.num_ops = 8
def check_result(self, a, b):
self.assertTrue(np.array_equal(a, b))
self.assertTrue(np.allclose(np.matmul(a.T, a), np.eye(10), atol=1.e-6))
self.assertTrue(np.allclose(np.matmul(a, a.T), np.eye(10), atol=1.e-6))
def test_error(self):
self.config()
with self.assertRaises(AssertionError):
paddle.nn.Linear(10, 10, bias_attr=self.weight_attr)
# 4-D Parameter with shape: [6, 4, 3, 3]
class TestOrthogonalInitializer4(unittest.TestCase):
"""
case 4
"""
def config(self):
self.weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.Orthogonal(gain=3.0))
self.dtype = "float64"
self.in_features = 4
self.out_features = 6
self.kernel_size = (3, 3)
def check_result(self, a, b):
self.assertTrue(np.array_equal(a, b))
a = a.reshape(6, -1)
self.assertTrue(np.allclose(np.matmul(a, a.T), 9 * np.eye(6)))
def test_orthogonal(self):
self.config()
paddle.set_default_dtype(self.dtype)
paddle.disable_static()
paddle.seed(2021)
conv2d = paddle.nn.Conv2D(
self.in_features,
self.out_features,
self.kernel_size,
weight_attr=self.weight_attr)
res_dygraph = conv2d.weight.numpy()
paddle.enable_static()
paddle.seed(2021)
start_prog = paddle.static.Program()
main_prog = paddle.static.Program()
with paddle.static.program_guard(main_prog, start_prog):
conv2d = paddle.nn.Conv2D(
self.in_features,
self.out_features,
self.kernel_size,
weight_attr=self.weight_attr)
exe = paddle.static.Executor()
res_static = exe.run(paddle.static.default_startup_program(),
fetch_list=[conv2d.weight])[0]
self.check_result(res_dygraph, res_static)
# 4-D Parameter with shape: [50, 4, 3, 3]
class TestOrthogonalInitializer5(TestOrthogonalInitializer4):
"""
case 5
"""
def config(self):
self.weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.Orthogonal(gain=2.0))
self.dtype = "float64"
self.in_features = 4
self.out_features = 50
self.kernel_size = (3, 3)
def check_result(self, a, b):
self.assertTrue(np.array_equal(a, b))
a = a.reshape(50, -1)
self.assertTrue(np.allclose(np.matmul(a.T, a), 4 * np.eye(36)))
# 4-D Parameter with shape: [36, 4, 3, 3]
class TestOrthogonalInitializer6(TestOrthogonalInitializer4):
"""
case 6
"""
def config(self):
self.weight_attr = paddle.ParamAttr(
initializer=paddle.nn.initializer.Orthogonal())
self.dtype = "float32"
self.in_features = 4
self.out_features = 36
self.kernel_size = (3, 3)
def check_result(self, a, b):
self.assertTrue(np.array_equal(a, b))
a = a.reshape(36, -1)
self.assertTrue(np.allclose(np.matmul(a.T, a), np.eye(36), atol=1.e-6))
self.assertTrue(np.allclose(np.matmul(a, a.T), np.eye(36), atol=1.e-6))
if __name__ == '__main__':
    unittest.main()
@@ -15,6 +15,7 @@
# TODO: define the initializers to create a Parameter in neural network
from ...fluid.initializer import Bilinear  # noqa: F401
from ...fluid.initializer import set_global_initializer  # noqa: F401
from ...fluid.initializer import calculate_gain  # noqa: F401
from .constant import Constant  # noqa: F401
@@ -31,6 +32,8 @@ from .normal import TruncatedNormal  # noqa: F401
from .uniform import Uniform  # noqa: F401
from .orthogonal import Orthogonal  # noqa: F401
__all__ = [  #noqa
    'Bilinear',
    'Constant',
@@ -42,5 +45,7 @@ __all__ = [  #noqa
    'Normal',
    'TruncatedNormal',
    'Uniform',
    'Orthogonal',
    'set_global_initializer',
    'calculate_gain'
]
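With these exports in place, both new symbols can be imported directly from paddle.nn.initializer (a quick sanity check, assuming a build that includes this change):

from paddle.nn.initializer import Orthogonal, calculate_gain

print(calculate_gain('tanh'))  # 5.0 / 3
init = Orthogonal(gain=calculate_gain('leaky_relu', 0.1))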
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...fluid.initializer import Initializer
from ...fluid.data_feeder import check_variable_and_dtype
from ...fluid.core import VarDesc
from ...fluid import unique_name, framework
from ...tensor import diag, transpose, sign, qr, reshape
__all__ = []
class Orthogonal(Initializer):
"""The orthogonal initializer. The initialized tensor is (semi) orthogonal.
Assume that the parameter to be initialized, 'weight', has shape [M, N].
.. code-block:: text
if M < N:
The rows are orthogonal vectors
elif M > N:
The columns are orthogonal vectors
else M = N:
Both rows and columns are orthogonal vectors
Only a Tensor with 2 or more dimensions can be initialized by Orthogonal.
Args:
gain(float, optional): The multiplication coefficient for the initialized tensor. Default: 1.0.
name(str, optional): The default value is None. Normally there is no need for the user to set this
    property. For more information, please refer to :ref:`api_guide_Name`.
Returns:
A parameter initialized by the orthogonal initializer.
Examples:
.. code-block:: python
import paddle
weight_attr = paddle.ParamAttr(initializer=paddle.nn.initializer.Orthogonal())
linear = paddle.nn.Linear(10, 15, weight_attr=weight_attr)
# linear.weight: X * X' = I
linear = paddle.nn.Linear(15, 10, weight_attr=weight_attr)
# linear.weight: X' * X = I
"""
def __init__(self, gain=1.0, name=None):
assert gain is not None, 'gain should not be None'
super(Orthogonal, self).__init__()
self._gain = gain
def __call__(self, var, block=None):
"""Initialize the input tensor with orthogonal initializer.
Args:
var(Tensor): Tensor that needs to be initialized.
block(Block, optional): The block in which initialization ops
should be added. Used in static graph only, default None.
Returns:
The last initialization op of the initializer's op chain (gaussian_random, qr, diag_v2, sign, elementwise_mul, an optional transpose2, reshape2 and scale).
"""
block = self._check_block(block)
assert isinstance(var, framework.Parameter)
assert isinstance(block, framework.Block)
# 'qr' op only supports float32/float64 now
check_variable_and_dtype(var, "Out", ["float32", "float64"],
"Orthogonal")
self._seed = block.program.random_seed
shape = var.shape
assert len(
shape
) >= 2, "Only Tensor with 2 or more dimensions can be initialized by Orthogonal"
row = shape[0]
col = 1
for i in shape[1:]:
col *= i
flatten_shape = [max(row, col), min(row, col)]
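# QR needs a tall matrix, so flatten to [max(row, col), min(row, col)]; if the parameter is wide (row < col), the result is transposed back below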
normal_var = block.create_var(
name=unique_name.generate('.'.join(['gaussian_random', 'tmp'])),
dtype=var.dtype,
persistable=False,
stop_gradient=True)
block.append_op(
type='gaussian_random',
inputs={},
outputs={'Out': normal_var},
attrs={
'mean': 0.0,
'std': 1.0,
'shape': flatten_shape,
'seed': self._seed,
'dtype': var.dtype
},
stop_gradient=True)
q = block.create_var(
name=unique_name.generate('.'.join(['qr', 'q', 'tmp'])),
dtype=normal_var.dtype,
persistable=False,
stop_gradient=True)
r = block.create_var(
name=unique_name.generate('.'.join(['qr', 'r', 'tmp'])),
dtype=normal_var.dtype,
persistable=False,
stop_gradient=True)
block.append_op(
type='qr',
inputs={'X': [normal_var]},
outputs={
'Q': q,
'R': r,
},
attrs={'mode': 'reduced'},
stop_gradient=True)
r_diag = block.create_var(
name=unique_name.generate('.'.join(['diag', 'tmp'])),
dtype=r.dtype,
persistable=False,
stop_gradient=True)
block.append_op(
type='diag_v2',
inputs={'X': r},
outputs={'Out': r_diag},
attrs={'offset': 0,
'padding_value': 0},
stop_gradient=True)
r_sign = r_diag
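# multiplying Q column-wise by sign(diag(R)) removes the sign ambiguity of the QR decomposition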
block.append_op(
type='sign',
inputs={'X': [r_diag]},
outputs={'Out': r_sign},
stop_gradient=True)
block.append_op(
type='elementwise_mul',
inputs={'X': q,
'Y': r_sign},
outputs={'Out': q},
attrs={},
stop_gradient=True)
x_shape = block.create_var(
name=unique_name.generate('.'.join(['transpose', 'shape', 'tmp'])),
dtype=q.dtype,
persistable=False,
stop_gradient=True)
if row < col:
q_transpose = block.create_var(
name=unique_name.generate('.'.join(['transpose', 'tmp'])),
dtype=q.dtype,
persistable=False,
stop_gradient=True)
block.append_op(
type='transpose2',
inputs={'X': q},
outputs={'Out': q_transpose,
'XShape': x_shape},
attrs={'axis': [1, 0]},
stop_gradient=True)
q = q_transpose
block.append_op(
type='reshape2',
inputs={'X': q},
outputs={'Out': q,
"XShape": x_shape},
attrs={'shape': var.shape},
stop_gradient=True)
op = block.append_op(
type='scale',
inputs={'X': q},
outputs={'Out': var},
attrs={'scale': self._gain,
'bias': 0.0})
return op
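For reference, a rough numpy sketch of the same recipe (an illustration only, not Paddle's op-by-op path above; the helper name is hypothetical):

import numpy as np

def orthogonal_like(shape, gain=1.0, seed=0):
    rng = np.random.default_rng(seed)
    rows, cols = shape[0], int(np.prod(shape[1:]))
    # QR of a tall Gaussian matrix gives orthonormal columns
    a = rng.standard_normal((max(rows, cols), min(rows, cols)))
    q, r = np.linalg.qr(a, mode='reduced')
    q = q * np.sign(np.diag(r))   # fix the sign ambiguity of QR
    if rows < cols:
        q = q.T                   # wide parameter: orthonormal rows instead
    return gain * q.reshape(shape)

w = orthogonal_like((10, 15), gain=3.0)
print(np.allclose(w @ w.T, 9 * np.eye(10)))  # True, matching TestOrthogonalInitializer1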
@@ -209,28 +209,28 @@ def norm(x, p='fro', axis=None, keepdim=False, name=None):
# [[ 0. 1. 2. 3.] [ 4. 5. 6. 7.] [ 8. 9. 10. 11.]]]
# compute frobenius norm along last two dimensions.
out_fro = paddle.linalg.norm(x, p='fro', axis=[0,1])
# out_fro.numpy() [17.435596 16.911535 16.7332 16.911535]
# compute 2-order vector norm along last dimension.
out_pnorm = paddle.linalg.norm(x, p=2, axis=-1)
#out_pnorm.numpy(): [[21.118711 13.190906 5.477226]
# [ 3.7416575 11.224972 19.131126]]
# compute 2-order norm along [0,1] dimension.
out_pnorm = paddle.linalg.norm(x, p=2, axis=[0,1])
#out_pnorm.numpy(): [17.435596 16.911535 16.7332 16.911535]
# compute inf-order norm
out_pnorm = paddle.linalg.norm(x, p=np.inf)
#out_pnorm.numpy() = [12.]
out_pnorm = paddle.linalg.norm(x, p=np.inf, axis=0)
#out_pnorm.numpy(): [[12. 11. 10. 9.] [8. 7. 6. 7.] [8. 9. 10. 11.]]
# compute -inf-order norm
out_pnorm = paddle.linalg.norm(x, p=-np.inf)
#out_pnorm.numpy(): [0.]
out_pnorm = paddle.linalg.norm(x, p=-np.inf, axis=0)
#out_pnorm.numpy(): [[0. 1. 2. 3.] [4. 5. 6. 5.] [4. 3. 2. 1.]]
"""
@@ -1084,7 +1084,7 @@ def cholesky(x, upper=False, name=None):
a_t = np.transpose(a, [1, 0])
x_data = np.matmul(a, a_t) + 1e-03
x = paddle.to_tensor(x_data)
out = paddle.linalg.cholesky(x, upper=False)
print(out)
# [[1.190523 0. 0. ]
# [0.9906703 0.27676893 0. ]
...