gradclip.py 3.2 KB
Newer Older
H
Hui Zhang 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import paddle
from paddle.fluid import core
16 17
from paddle.fluid import layers
from paddle.fluid.dygraph import base as imperative_base
H
Hui Zhang 已提交
18

19
from deepspeech.utils.log import Log
H
Hui Zhang 已提交
20

21
__all__ = ["ClipGradByGlobalNormWithLog"]
H
Hui Zhang 已提交
22

23 24 25 26
logger = Log(__name__).getlog()


class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
    """Global-norm gradient clipper that emits debug logs while clipping.

    Subclasses ``paddle.nn.ClipGradByGlobalNorm`` and overrides
    ``_dygraph_clip`` so that gradient norms are written to the module
    logger before and after the gradients are rescaled.
    """

    def __init__(self, clip_norm):
        """Forward ``clip_norm`` unchanged to the parent constructor."""
        super().__init__(clip_norm)

    def __repr__(self):
        """Return ``<ClassName>(global_clip_norm=<clip_norm>)``."""
        cls_name = self.__class__.__name__
        return f"{cls_name}(global_clip_norm={self.clip_norm})"

    @imperative_base.no_grad
    def _dygraph_clip(self, params_grads):
        """Rescale gradients by ``clip_norm / max(global_norm, clip_norm)``.

        Args:
            params_grads: iterable of ``(param, grad)`` pairs. It is
                traversed twice, so it must be re-iterable.

        Returns:
            A new list of ``(param, grad)`` pairs in which:

            * pairs whose grad is ``None`` are omitted;
            * pairs whose param has ``need_clip`` set to ``False`` are kept
              with their original grad;
            * every other grad is multiplied by the clip scale.

            When no gradient takes part in the norm computation, the
            ``params_grads`` argument itself is returned untouched.
        """
        squared_norms = []
        for idx, (param, grad) in enumerate(params_grads):
            # Entries without a gradient, or explicitly opted out through
            # ``need_clip = False``, do not contribute to the global norm.
            if grad is None or getattr(param, 'need_clip', True) is False:
                continue

            dense_grad = grad
            if grad.type == core.VarDesc.VarType.SELECTED_ROWS:
                # SELECTED_ROWS gradients go through merge_selected_rows and
                # get_tensor_from_selected_rows before being squared.
                dense_grad = layers.get_tensor_from_selected_rows(
                    layers.merge_selected_rows(grad))

            sq_sum = layers.reduce_sum(layers.square(dense_grad))
            squared_norms.append(sq_sum)

            # Debug log for the first 10 entries only; dumping every
            # gradient would slow down training.
            if idx < 10:
                logger.debug(
                    f"Grad Before Clip: {param.name}: {float(sq_sum.sqrt()) }")

        # All parameters have been filtered out: nothing to clip.
        if not squared_norms:
            return params_grads

        # Global norm = sqrt of the sum of all per-gradient squared sums.
        global_norm = layers.sqrt(
            layers.reduce_sum(layers.concat(squared_norms)))
        logger.debug(f"Grad Global Norm: {float(global_norm)}!!!!")

        clip_limit = layers.fill_constant(
            shape=[1], dtype=global_norm.dtype, value=self.clip_norm)
        # The denominator is never smaller than clip_limit, so the scale is
        # at most 1 and gradients are only ever shrunk.
        denominator = layers.elementwise_max(x=global_norm, y=clip_limit)
        scale = layers.elementwise_div(x=clip_limit, y=denominator)

        clipped = []
        for idx, (param, grad) in enumerate(params_grads):
            if grad is None:
                continue

            if getattr(param, 'need_clip', True) is False:
                # Opted-out parameters keep their gradient as-is.
                clipped.append((param, grad))
            else:
                scaled = layers.elementwise_mul(x=grad, y=scale)
                clipped.append((param, scaled))

                # Debug log for the first 10 entries only; dumping every
                # gradient would slow down training.
                if idx < 10:
                    logger.debug(
                        f"Grad After Clip: {param.name}: {float(scaled.square().sum().sqrt())}"
                    )

        return clipped