Fix 0-D tensor problem in the old grad clip by switching to paddle.nn.ClipGradByGlobalNorm (#3334)

a2ae6396 · Hui Zhang · GitHub · 5153ac83 · a2ae6396 · 5153ac83
4 changed files
--- a/paddlespeech/s2t/exps/deepspeech2/model.py
+++ b/paddlespeech/s2t/exps/deepspeech2/model.py
@@ -27,7 +27,6 @@ from paddlespeech.audio.text.text_featurizer import TextFeaturizer
 from paddlespeech.s2t.io.dataloader import BatchDataLoader
 from paddlespeech.s2t.models.ds2 import DeepSpeech2InferModel
 from paddlespeech.s2t.models.ds2 import DeepSpeech2Model
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.training.reporter import report
 from paddlespeech.s2t.training.timer import Timer
 from paddlespeech.s2t.training.trainer import Trainer
@@ -148,7 +147,7 @@ class DeepSpeech2Trainer(Trainer):
        if not self.train:
            return
-        grad_clip = ClipGradByGlobalNormWithLog(config.global_grad_clip)
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(config.global_grad_clip)
        lr_scheduler = paddle.optimizer.lr.ExponentialDecay(
            learning_rate=config.lr, gamma=config.lr_decay, verbose=True)
        optimizer = paddle.optimizer.Adam(

--- a/paddlespeech/s2t/training/gradclip.py
+++ b/paddlespeech/s2t/training/gradclip.py
-# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-import paddle
-from paddle.fluid import core
-from paddle.fluid import layers
-from paddle.fluid.dygraph import base as imperative_base
-from paddlespeech.s2t.utils.log import Log
-__all__ = ["ClipGradByGlobalNormWithLog"]
-logger = Log(__name__).getlog()
-class ClipGradByGlobalNormWithLog(paddle.nn.ClipGradByGlobalNorm):
-    def __init__(self, clip_norm):
-        super().__init__(clip_norm)
-    def __repr__(self):
-        return f"{self.__class__.__name__}(global_clip_norm={self.clip_norm})"
-    @imperative_base.no_grad
-    def _dygraph_clip(self, params_grads):
-        params_and_grads = []
-        sum_square_list = []
-        for i, (p, g) in enumerate(params_grads):
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                continue
-            merge_grad = g
-            if g.type == core.VarDesc.VarType.SELECTED_ROWS:
-                merge_grad = layers.merge_selected_rows(g)
-                merge_grad = layers.get_tensor_from_selected_rows(merge_grad)
-            square = paddle.square(merge_grad)
-            sum_square = paddle.sum(square)
-            sum_square_list.append(sum_square)
-            # debug log, not dump all since slow down train process
-            if i < 10:
-                logger.debug(
-                    f"Grad Before Clip: {p.name}: {float(sum_square.sqrt()) }")
-        # all parameters have been filterd out
-        if len(sum_square_list) == 0:
-            return params_grads
-        global_norm_var = paddle.concat(sum_square_list)
-        global_norm_var = paddle.sum(global_norm_var)
-        global_norm_var = paddle.sqrt(global_norm_var)
-        # debug log
-        logger.debug(f"Grad Global Norm: {float(global_norm_var)}!!!!")
-        max_global_norm = paddle.full(
-            shape=[1], dtype=global_norm_var.dtype, fill_value=self.clip_norm)
-        clip_var = paddle.divide(
-            x=max_global_norm,
-            y=paddle.maximum(x=global_norm_var, y=max_global_norm))
-        for i, (p, g) in enumerate(params_grads):
-            if g is None:
-                continue
-            if getattr(p, 'need_clip', True) is False:
-                params_and_grads.append((p, g))
-                continue
-            new_grad = paddle.multiply(x=g, y=clip_var)
-            params_and_grads.append((p, new_grad))
-            # debug log, not dump all since slow down train process
-            if i < 10:
-                logger.debug(
-                    f"Grad After Clip: {p.name}: {float(new_grad.square().sum().sqrt())}"
-                )
-        return params_and_grads
--- a/paddlespeech/s2t/training/optimizer/__init__.py
+++ b/paddlespeech/s2t/training/optimizer/__init__.py
@@ -19,7 +19,7 @@ from typing import Text
 import paddle
 from paddle.optimizer import Optimizer
 from paddle.regularizer import L2Decay
-from paddlespeech.s2t.training.gradclip import ClipGradByGlobalNormWithLog
 from paddlespeech.s2t.utils.dynamic_import import dynamic_import
 from paddlespeech.s2t.utils.dynamic_import import instance_class
 from paddlespeech.s2t.utils.log import Log
@@ -100,7 +100,7 @@ class OptimizerFactory():
        assert "parameters" in args, "parameters not in args."
        assert "learning_rate" in args, "learning_rate not in args."
-        grad_clip = ClipGradByGlobalNormWithLog(
+        grad_clip = paddle.nn.ClipGradByGlobalNorm(
            args['grad_clip']) if "grad_clip" in args else None
        weight_decay = L2Decay(
            args['weight_decay']) if "weight_decay" in args else None

--- a/tests/unit/tts/test_ssml.py
+++ b/tests/unit/tts/test_ssml.py
@@ -72,3 +72,12 @@ if __name__ == '__main__':
    for i, sub in enumerate(outs):
        print(i, sub)
    print()
+    import json
+    import xmltodict
+    text = "<speak>我们的声学模型使用了 Fast Speech Two。前浪<say-as pinyin='dao3'>倒</say-as>在沙滩上,沙滩上倒了一堆<say-as pinyin='tu3'>土</say-as>。 想象<say-as pinyin='gan1 gan1'>干干</say-as>的树干<say-as pinyin='dao3'>倒</say-as>了, 里面有个干尸，不知是被谁<say-as pinyin='gan4'>干</say-as>死的。</speak>"
+    ssml = xmltodict.parse(text)
+    print(json.dumps(ssml))
+    print(ssml['speak'].keys())
+    print(ssml['speak']['#text'])
+    print(ssml['speak']['say-as'])