diff --git a/example/bert_clue/utils.py b/example/bert_clue/utils.py
index 6bf0c87a67dbd0c2f4591836a6a2a7203a2c5809..1d05b957404d226ce6864be7634fd0ac7f862d50 100644
--- a/example/bert_clue/utils.py
+++ b/example/bert_clue/utils.py
@@ -30,7 +30,7 @@ from mindspore.train.parallel_utils import ParallelMode
 from mindspore.communication.management import get_group_size
 from mindspore import context
 from mindspore.model_zoo.Bert_NEZHA.bert_model import BertModel
-from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import ClipGradients
+from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import clip_grad
 from CRF import CRF
 
 GRADIENT_CLIP_TYPE = 1
@@ -66,7 +66,6 @@ class BertFinetuneCell(nn.Cell):
             degree = get_group_size()
             self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
         self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
-        self.clip_gradients = ClipGradients()
         self.cast = P.Cast()
         self.alloc_status = P.NPUAllocFloatStatus()
         self.get_status = P.NPUGetFloatStatus()
@@ -110,7 +109,7 @@ class BertFinetuneCell(nn.Cell):
         F.control_depend(loss, init)
         self.depend_parameter_use(clear_before_grad, scaling_sens)
         grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
-        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         if self.reducer_flag:
             grads = self.grad_reducer(grads)
         flag = self.get_status(init)
diff --git a/mindspore/model_zoo/Bert_NEZHA/bert_for_pre_training.py b/mindspore/model_zoo/Bert_NEZHA/bert_for_pre_training.py
index c324f10f6b5f61caa1da864f36a3fefbb5583e7c..a5507fa782b8979cb02f996c410e22fca51c55cc 100644
--- a/mindspore/model_zoo/Bert_NEZHA/bert_for_pre_training.py
+++ b/mindspore/model_zoo/Bert_NEZHA/bert_for_pre_training.py
@@ -32,44 +32,31 @@ from .bert_model import BertModel
 GRADIENT_CLIP_TYPE = 1
 GRADIENT_CLIP_VALUE = 1.0
 
+_nn_clip_by_norm = nn.ClipByNorm()
+clip_grad = C.MultitypeFuncGraph("clip_grad")
+@clip_grad.register("Number", "Number", "Tensor")
-class ClipGradients(nn.Cell):
+def _clip_grad(clip_type, clip_value, grad):
     """
     Clip gradients.
 
     Inputs:
-        grads (tuple[Tensor]): Gradients.
         clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
         clip_value (float): Specifies how much to clip.
+        grad (tuple[Tensor]): Gradients.
 
     Outputs:
         tuple[Tensor], clipped gradients.
     """
-    def __init__(self):
-        super(ClipGradients, self).__init__()
-        self.clip_by_norm = nn.ClipByNorm()
-        self.cast = P.Cast()
-        self.dtype = P.DType()
-
-    def construct(self,
-                  grads,
-                  clip_type,
-                  clip_value):
-        if clip_type != 0 and clip_type != 1:
-            return grads
-
-        new_grads = ()
-        for grad in grads:
-            dt = self.dtype(grad)
-            if clip_type == 0:
-                t = C.clip_by_value(grad, self.cast(F.tuple_to_array((-clip_value,)), dt),
-                                    self.cast(F.tuple_to_array((clip_value,)), dt))
-            else:
-                t = self.clip_by_norm(grad, self.cast(F.tuple_to_array((clip_value,)), dt))
-            new_grads = new_grads + (t,)
-
-        return new_grads
-
+    if clip_type != 0 and clip_type != 1:
+        return grad
+    dt = F.dtype(grad)
+    if clip_type == 0:
+        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
+                                   F.cast(F.tuple_to_array((clip_value,)), dt))
+    else:
+        new_grad = _nn_clip_by_norm(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
+    return new_grad
 
 
 class GetMaskedLMOutput(nn.Cell):
     """
@@ -294,8 +281,8 @@ class BertTrainOneStepCell(nn.Cell):
             degree = get_group_size()
             self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
 
-        self.clip_gradients = ClipGradients()
         self.cast = P.Cast()
+        self.hyper_map = C.HyperMap()
 
     def set_sens(self, value):
         self.sens = value
@@ -327,7 +314,7 @@ class BertTrainOneStepCell(nn.Cell):
                                                  masked_lm_weights,
                                                  self.cast(F.tuple_to_array((self.sens,)),
                                                            mstype.float32))
-        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         if self.reducer_flag:
             # apply grad reducer on grads
             grads = self.grad_reducer(grads)
@@ -376,7 +363,6 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
             degree = get_group_size()
             self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
         self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
-        self.clip_gradients = ClipGradients()
         self.cast = P.Cast()
         self.alloc_status = P.NPUAllocFloatStatus()
         self.get_status = P.NPUGetFloatStatus()
@@ -427,7 +413,7 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
                                                  self.cast(scaling_sens,
                                                            mstype.float32))
         grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
-        grads = self.clip_gradients(grads, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE)
+        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
         # apply grad reducer on grads
         grads = self.grad_reducer(grads)
         self.get_status(init)
diff --git a/tests/ut/python/model/test_bert_cell.py b/tests/ut/python/model/test_bert_cell.py
index 3de6073787dfc13e3c84ec1061933edb4bea3e00..817a75b6b49e40b52174814f45592ea6e5b3f288 100644
--- a/tests/ut/python/model/test_bert_cell.py
+++ b/tests/ut/python/model/test_bert_cell.py
@@ -19,11 +19,12 @@ import numpy as np
 import mindspore.common.dtype as mstype
 import mindspore.nn as nn
 import mindspore.ops.composite as C
+from mindspore.ops import functional as F
 from mindspore.common.initializer import TruncatedNormal
 from mindspore.common.parameter import ParameterTuple
 from mindspore.common.tensor import Tensor
 from mindspore.model_zoo.Bert_NEZHA import BertPretrainingLoss, GetNextSentenceOutput
-from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import ClipGradients
+from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import clip_grad
 from mindspore.model_zoo.Bert_NEZHA.bert_model import BertConfig, \
     EmbeddingLookup, EmbeddingPostprocessor, BertOutput, RelaPosMatrixGenerator, \
     RelaPosEmbeddingsGenerator, SaturateCast, BertAttention, BertSelfAttention, \
@@ -80,12 +81,12 @@ class TrainStepWrapForAdam(nn.Cell):
         self.network = network
         self.weights = ParameterTuple(network.get_parameters())
         self.optimizer = AdamWeightDecay(self.weights)
-        self.clip_gradients = ClipGradients()
+        self.hyper_map = C.HyperMap()
 
     def construct(self, x, sens):
         weights = self.weights
         grads = C.grad_by_list_with_sens(self.network, weights)(x, sens)
-        grads = self.clip_gradients(grads, 1, 1.0)
+        grads = self.hyper_map(F.partial(clip_grad, 1, 1.0), grads)
         return self.optimizer(grads)
 
 
@@ -111,9 +112,10 @@ class TempC2Wrap(nn.Cell):
         self.op = op
         self.c1 = c1
         self.c2 = c2
+        self.hyper_map = C.HyperMap()
 
     def construct(self, x1):
-        x = self.op(x1, self.c1, self.c2)
+        x = self.hyper_map(F.partial(self.op, self.c1, self.c2), x1)
         return x
 
 
@@ -405,7 +407,7 @@ test_case_cell_ops = [
         'desc_inputs': [[1, 64]],
         'skip': ['backward']}),
     ('ClipGradients', {
-        'block': TempC2Wrap(ClipGradients(), 1, 1.0),
+        'block': TempC2Wrap(clip_grad, 1, 1.0),
         'desc_inputs': [tuple(convert(shp) for shp in [[1], [1], [1]])],
         'skip': ['backward', 'exec']}),
 ]
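
Note (illustrative sketch, not part of the patch): the change replaces the ClipGradients Cell, which looped over the gradient tuple itself, with a per-gradient function registered on a MultitypeFuncGraph; the scalar clip settings are bound with F.partial and C.HyperMap applies clip_grad to each gradient Tensor. The following minimal wrapper cell shows how the pieces fit together. It uses only APIs already present in this diff (C.HyperMap, F.partial, C.grad_by_list_with_sens, clip_grad); the names TrainStepWithClip, network and optimizer are hypothetical.

    import mindspore.nn as nn
    import mindspore.ops.composite as C
    from mindspore.ops import functional as F
    from mindspore.common.parameter import ParameterTuple
    from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import clip_grad

    GRADIENT_CLIP_TYPE = 1      # 0: clip each gradient by value, 1: clip each gradient by norm
    GRADIENT_CLIP_VALUE = 1.0


    class TrainStepWithClip(nn.Cell):
        """Hypothetical train-step wrapper: clip every gradient, then apply the optimizer."""
        def __init__(self, network, optimizer):
            super(TrainStepWithClip, self).__init__(auto_prefix=False)
            self.network = network
            self.weights = ParameterTuple(network.trainable_params())
            self.optimizer = optimizer
            self.hyper_map = C.HyperMap()

        def construct(self, x, sens):
            weights = self.weights
            # Same pattern as TrainStepWrapForAdam in the test: per-parameter gradients with sens.
            grads = C.grad_by_list_with_sens(self.network, weights)(x, sens)
            # F.partial binds clip type and value; HyperMap maps clip_grad over the gradient tuple.
            grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
            return self.optimizer(grads)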