提交 b6acedde 编写于 作者: C chenhaozhe

update bert scripts according to rules of modelzoo

上级 02f33a17
...@@ -308,7 +308,7 @@ def get_bprop_softmax(self): ...@@ -308,7 +308,7 @@ def get_bprop_softmax(self):
axis = self.axis axis = self.axis
def bprop(x, out, dout): def bprop(x, out, dout):
dx = mul(sub(dout, sum_func(mul(dout, out), axis)), out) dx = mul(out, sub(dout, sum_func(mul(out, dout), axis)))
return (dx,) return (dx,)
return bprop return bprop
......
...@@ -16,12 +16,12 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base]( ...@@ -16,12 +16,12 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
- Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model. - Run `run_standalone_pretrain.sh` for non-distributed pre-training of BERT-base and BERT-NEZHA model.
``` bash ``` bash
sh run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR sh scripts/run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR
``` ```
- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model. - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
``` bash ``` bash
sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
``` ```
### Fine-Tuning ### Fine-Tuning
......
...@@ -19,8 +19,6 @@ Bert evaluation script. ...@@ -19,8 +19,6 @@ Bert evaluation script.
import os import os
import numpy as np import numpy as np
from evaluation_config import cfg, bert_net_cfg
from utils import BertNER, BertCLS
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
from mindspore import context from mindspore import context
from mindspore.common.tensor import Tensor from mindspore.common.tensor import Tensor
...@@ -28,9 +26,11 @@ import mindspore.dataset as de ...@@ -28,9 +26,11 @@ import mindspore.dataset as de
import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.c_transforms as C
from mindspore.train.model import Model from mindspore.train.model import Model
from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.serialization import load_checkpoint, load_param_into_net
from CRF import postprocess from src.evaluation_config import cfg, bert_net_cfg
from cluener_evaluation import submit from src.utils import BertNER, BertCLS
from finetune_config import tag_to_index from src.CRF import postprocess
from src.cluener_evaluation import submit
from src.finetune_config import tag_to_index
class Accuracy(): class Accuracy():
''' '''
......
...@@ -18,8 +18,8 @@ Bert finetune script. ...@@ -18,8 +18,8 @@ Bert finetune script.
''' '''
import os import os
from utils import BertFinetuneCell, BertCLS, BertNER from src.utils import BertFinetuneCell, BertCLS, BertNER
from finetune_config import cfg, bert_net_cfg, tag_to_index from src.finetune_config import cfg, bert_net_cfg, tag_to_index
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
import mindspore.communication.management as D import mindspore.communication.management as D
from mindspore import context from mindspore import context
......
...@@ -26,10 +26,10 @@ from mindspore.train.parallel_utils import ParallelMode ...@@ -26,10 +26,10 @@ from mindspore.train.parallel_utils import ParallelMode
from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell from mindspore.nn.wrap.loss_scale import DynamicLossScaleUpdateCell
from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig, TimeMonitor from mindspore.train.callback import Callback, ModelCheckpoint, CheckpointConfig, TimeMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.model_zoo.Bert_NEZHA import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecayDynamicLR from mindspore.nn.optim import Lamb, Momentum, AdamWeightDecayDynamicLR
from dataset import create_bert_dataset from src import BertNetworkWithLoss, BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
from config import cfg, bert_net_cfg from src.dataset import create_bert_dataset
from src.config import cfg, bert_net_cfg
_current_dir = os.path.dirname(os.path.realpath(__file__)) _current_dir = os.path.dirname(os.path.realpath(__file__))
class LossCallBack(Callback): class LossCallBack(Callback):
...@@ -48,10 +48,8 @@ class LossCallBack(Callback): ...@@ -48,10 +48,8 @@ class LossCallBack(Callback):
self._per_print_times = per_print_times self._per_print_times = per_print_times
def step_end(self, run_context): def step_end(self, run_context):
cb_params = run_context.original_args() cb_params = run_context.original_args()
with open("./loss.log", "a+") as f: print("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
f.write("epoch: {}, step: {}, outputs are {}".format(cb_params.cur_epoch_num, cb_params.cur_step_num,
str(cb_params.net_outputs))) str(cb_params.net_outputs)))
f.write('\n')
def run_pretrain(): def run_pretrain():
"""pre-train bert_clue""" """pre-train bert_clue"""
...@@ -81,6 +79,11 @@ def run_pretrain(): ...@@ -81,6 +79,11 @@ def run_pretrain():
context.reset_auto_parallel_context() context.reset_auto_parallel_context()
context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True, context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
device_num=device_num) device_num=device_num)
from mindspore.parallel._auto_parallel_context import auto_parallel_context
if bert_net_cfg.num_hidden_layers == 12:
auto_parallel_context().set_all_reduce_fusion_split_indices([28, 55, 82, 109, 136, 163, 190, 205])
elif bert_net_cfg.num_hidden_layers == 24:
auto_parallel_context().set_all_reduce_fusion_split_indices([38, 93, 148, 203, 258, 313, 368, 397])
D.init() D.init()
rank = args_opt.device_id % device_num rank = args_opt.device_id % device_num
else: else:
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
echo "==============================================================================================================" echo "=============================================================================================================="
echo "Please run the scipt as: " echo "Please run the scipt as: "
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH" echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
echo "for example: sh run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json" echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
echo "It is better to use absolute path." echo "It is better to use absolute path."
echo "==============================================================================================================" echo "=============================================================================================================="
...@@ -49,6 +49,10 @@ do ...@@ -49,6 +49,10 @@ do
cp *.py ./LOG$i cp *.py ./LOG$i
cd ./LOG$i || exit cd ./LOG$i || exit
echo "start training for rank $i, device $DEVICE_ID" echo "start training for rank $i, device $DEVICE_ID"
mkdir -p ms_log
CUR_DIR=`pwd`
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
env > env.log env > env.log
taskset -c $cmdopt python ../run_pretrain.py \ taskset -c $cmdopt python ../run_pretrain.py \
--distribute="true" \ --distribute="true" \
...@@ -59,7 +63,7 @@ do ...@@ -59,7 +63,7 @@ do
--enable_lossscale="true" \ --enable_lossscale="true" \
--do_shuffle="true" \ --do_shuffle="true" \
--enable_data_sink="true" \ --enable_data_sink="true" \
--data_sink_steps=1 \ --data_sink_steps=100 \
--checkpoint_path="" \ --checkpoint_path="" \
--save_checkpoint_steps=10000 \ --save_checkpoint_steps=10000 \
--save_checkpoint_num=1 \ --save_checkpoint_num=1 \
......
...@@ -16,8 +16,8 @@ ...@@ -16,8 +16,8 @@
echo "==============================================================================================================" echo "=============================================================================================================="
echo "Please run the scipt as: " echo "Please run the scipt as: "
echo "sh run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR" echo "bash run_standalone_pretrain.sh DEVICE_ID EPOCH_SIZE DATA_DIR SCHEMA_DIR"
echo "for example: sh run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json" echo "for example: bash run_standalone_pretrain.sh 0 40 /path/zh-wiki/ /path/Schema.json"
echo "==============================================================================================================" echo "=============================================================================================================="
DEVICE_ID=$1 DEVICE_ID=$1
...@@ -25,6 +25,10 @@ EPOCH_SIZE=$2 ...@@ -25,6 +25,10 @@ EPOCH_SIZE=$2
DATA_DIR=$3 DATA_DIR=$3
SCHEMA_DIR=$4 SCHEMA_DIR=$4
mkdir -p ms_log
CUR_DIR=`pwd`
export GLOG_log_dir=${CUR_DIR}/ms_log
export GLOG_logtostderr=0
python run_pretrain.py \ python run_pretrain.py \
--distribute="false" \ --distribute="false" \
--epoch_size=$EPOCH_SIZE \ --epoch_size=$EPOCH_SIZE \
...@@ -33,7 +37,7 @@ python run_pretrain.py \ ...@@ -33,7 +37,7 @@ python run_pretrain.py \
--enable_lossscale="true" \ --enable_lossscale="true" \
--do_shuffle="true" \ --do_shuffle="true" \
--enable_data_sink="true" \ --enable_data_sink="true" \
--data_sink_steps=1 \ --data_sink_steps=100 \
--checkpoint_path="" \ --checkpoint_path="" \
--save_checkpoint_steps=10000 \ --save_checkpoint_steps=10000 \
--save_checkpoint_num=1 \ --save_checkpoint_num=1 \
......
...@@ -357,10 +357,10 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell): ...@@ -357,10 +357,10 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]: if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
self.reducer_flag = True self.reducer_flag = True
self.grad_reducer = F.identity self.grad_reducer = F.identity
self.degree = 1
if self.reducer_flag: if self.reducer_flag:
mean = context.get_auto_parallel_context("mirror_mean") self.degree = get_group_size()
degree = get_group_size() self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE) self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
self.cast = P.Cast() self.cast = P.Cast()
self.alloc_status = P.NPUAllocFloatStatus() self.alloc_status = P.NPUAllocFloatStatus()
...@@ -411,10 +411,10 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell): ...@@ -411,10 +411,10 @@ class BertTrainOneStepWithLossScaleCell(nn.Cell):
masked_lm_weights, masked_lm_weights,
self.cast(scaling_sens, self.cast(scaling_sens,
mstype.float32)) mstype.float32))
grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
# apply grad reducer on grads # apply grad reducer on grads
grads = self.grad_reducer(grads) grads = self.grad_reducer(grads)
grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
self.get_status(init) self.get_status(init)
flag_sum = self.reduce_sum(init, (0,)) flag_sum = self.reduce_sum(init, (0,))
if self.is_distributed: if self.is_distributed:
......
...@@ -25,6 +25,7 @@ from mindspore.ops import operations as P ...@@ -25,6 +25,7 @@ from mindspore.ops import operations as P
from mindspore.ops import composite as C from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter from mindspore.common.parameter import Parameter
from .fused_layer_norm import FusedLayerNorm
class BertConfig: class BertConfig:
...@@ -77,7 +78,8 @@ class BertConfig: ...@@ -77,7 +78,8 @@ class BertConfig:
input_mask_from_dataset=True, input_mask_from_dataset=True,
token_type_ids_from_dataset=True, token_type_ids_from_dataset=True,
dtype=mstype.float32, dtype=mstype.float32,
compute_type=mstype.float32): compute_type=mstype.float32,
enable_fused_layernorm=False):
self.batch_size = batch_size self.batch_size = batch_size
self.seq_length = seq_length self.seq_length = seq_length
self.vocab_size = vocab_size self.vocab_size = vocab_size
...@@ -96,6 +98,7 @@ class BertConfig: ...@@ -96,6 +98,7 @@ class BertConfig:
self.use_relative_positions = use_relative_positions self.use_relative_positions = use_relative_positions
self.dtype = dtype self.dtype = dtype
self.compute_type = compute_type self.compute_type = compute_type
self.enable_fused_layernorm = enable_fused_layernorm
class EmbeddingLookup(nn.Cell): class EmbeddingLookup(nn.Cell):
...@@ -240,12 +243,18 @@ class BertOutput(nn.Cell): ...@@ -240,12 +243,18 @@ class BertOutput(nn.Cell):
out_channels, out_channels,
initializer_range=0.02, initializer_range=0.02,
dropout_prob=0.1, dropout_prob=0.1,
compute_type=mstype.float32): compute_type=mstype.float32,
enable_fused_layernorm=False):
super(BertOutput, self).__init__() super(BertOutput, self).__init__()
self.dense = nn.Dense(in_channels, out_channels, self.dense = nn.Dense(in_channels, out_channels,
weight_init=TruncatedNormal(initializer_range)).to_float(compute_type) weight_init=TruncatedNormal(initializer_range)).to_float(compute_type)
self.dropout = nn.Dropout(1 - dropout_prob) self.dropout = nn.Dropout(1 - dropout_prob)
self.dropout_prob = dropout_prob
self.add = P.TensorAdd() self.add = P.TensorAdd()
if compute_type == mstype.float16:
self.layernorm = FusedLayerNorm((out_channels,),
use_batch_norm=enable_fused_layernorm).to_float(compute_type)
else:
self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type) self.layernorm = nn.LayerNorm((out_channels,)).to_float(compute_type)
self.cast = P.Cast() self.cast = P.Cast()
...@@ -481,6 +490,7 @@ class BertAttention(nn.Cell): ...@@ -481,6 +490,7 @@ class BertAttention(nn.Cell):
self.shape_return = (batch_size, from_seq_length, num_attention_heads * size_per_head) self.shape_return = (batch_size, from_seq_length, num_attention_heads * size_per_head)
self.cast_compute_type = SaturateCast(dst_type=compute_type) self.cast_compute_type = SaturateCast(dst_type=compute_type)
if self.use_relative_positions:
self._generate_relative_positions_embeddings = \ self._generate_relative_positions_embeddings = \
RelaPosEmbeddingsGenerator(length=to_seq_length, RelaPosEmbeddingsGenerator(length=to_seq_length,
depth=size_per_head, depth=size_per_head,
...@@ -529,7 +539,7 @@ class BertAttention(nn.Cell): ...@@ -529,7 +539,7 @@ class BertAttention(nn.Cell):
self.trans_shape_position) self.trans_shape_position)
attention_scores = attention_scores + key_position_scores_r_t attention_scores = attention_scores + key_position_scores_r_t
attention_scores = self.multiply(attention_scores, self.scores_mul) attention_scores = self.multiply(self.scores_mul, attention_scores)
if self.has_attention_mask: if self.has_attention_mask:
attention_mask = self.expand_dims(attention_mask, 1) attention_mask = self.expand_dims(attention_mask, 1)
...@@ -606,7 +616,8 @@ class BertSelfAttention(nn.Cell): ...@@ -606,7 +616,8 @@ class BertSelfAttention(nn.Cell):
initializer_range=0.02, initializer_range=0.02,
hidden_dropout_prob=0.1, hidden_dropout_prob=0.1,
use_relative_positions=False, use_relative_positions=False,
compute_type=mstype.float32): compute_type=mstype.float32,
enable_fused_layernorm=False):
super(BertSelfAttention, self).__init__() super(BertSelfAttention, self).__init__()
if hidden_size % num_attention_heads != 0: if hidden_size % num_attention_heads != 0:
raise ValueError("The hidden size (%d) is not a multiple of the number " raise ValueError("The hidden size (%d) is not a multiple of the number "
...@@ -634,7 +645,8 @@ class BertSelfAttention(nn.Cell): ...@@ -634,7 +645,8 @@ class BertSelfAttention(nn.Cell):
out_channels=hidden_size, out_channels=hidden_size,
initializer_range=initializer_range, initializer_range=initializer_range,
dropout_prob=hidden_dropout_prob, dropout_prob=hidden_dropout_prob,
compute_type=compute_type) compute_type=compute_type,
enable_fused_layernorm=enable_fused_layernorm)
self.reshape = P.Reshape() self.reshape = P.Reshape()
self.shape = (-1, hidden_size) self.shape = (-1, hidden_size)
...@@ -676,7 +688,8 @@ class BertEncoderCell(nn.Cell): ...@@ -676,7 +688,8 @@ class BertEncoderCell(nn.Cell):
hidden_dropout_prob=0.1, hidden_dropout_prob=0.1,
use_relative_positions=False, use_relative_positions=False,
hidden_act="gelu", hidden_act="gelu",
compute_type=mstype.float32): compute_type=mstype.float32,
enable_fused_layernorm=False):
super(BertEncoderCell, self).__init__() super(BertEncoderCell, self).__init__()
self.attention = BertSelfAttention( self.attention = BertSelfAttention(
batch_size=batch_size, batch_size=batch_size,
...@@ -688,7 +701,8 @@ class BertEncoderCell(nn.Cell): ...@@ -688,7 +701,8 @@ class BertEncoderCell(nn.Cell):
initializer_range=initializer_range, initializer_range=initializer_range,
hidden_dropout_prob=hidden_dropout_prob, hidden_dropout_prob=hidden_dropout_prob,
use_relative_positions=use_relative_positions, use_relative_positions=use_relative_positions,
compute_type=compute_type) compute_type=compute_type,
enable_fused_layernorm=enable_fused_layernorm)
self.intermediate = nn.Dense(in_channels=hidden_size, self.intermediate = nn.Dense(in_channels=hidden_size,
out_channels=intermediate_size, out_channels=intermediate_size,
activation=hidden_act, activation=hidden_act,
...@@ -697,7 +711,8 @@ class BertEncoderCell(nn.Cell): ...@@ -697,7 +711,8 @@ class BertEncoderCell(nn.Cell):
out_channels=hidden_size, out_channels=hidden_size,
initializer_range=initializer_range, initializer_range=initializer_range,
dropout_prob=hidden_dropout_prob, dropout_prob=hidden_dropout_prob,
compute_type=compute_type) compute_type=compute_type,
enable_fused_layernorm=enable_fused_layernorm)
def construct(self, hidden_states, attention_mask): def construct(self, hidden_states, attention_mask):
# self-attention # self-attention
...@@ -744,7 +759,8 @@ class BertTransformer(nn.Cell): ...@@ -744,7 +759,8 @@ class BertTransformer(nn.Cell):
use_relative_positions=False, use_relative_positions=False,
hidden_act="gelu", hidden_act="gelu",
compute_type=mstype.float32, compute_type=mstype.float32,
return_all_encoders=False): return_all_encoders=False,
enable_fused_layernorm=False):
super(BertTransformer, self).__init__() super(BertTransformer, self).__init__()
self.return_all_encoders = return_all_encoders self.return_all_encoders = return_all_encoders
...@@ -761,7 +777,8 @@ class BertTransformer(nn.Cell): ...@@ -761,7 +777,8 @@ class BertTransformer(nn.Cell):
hidden_dropout_prob=hidden_dropout_prob, hidden_dropout_prob=hidden_dropout_prob,
use_relative_positions=use_relative_positions, use_relative_positions=use_relative_positions,
hidden_act=hidden_act, hidden_act=hidden_act,
compute_type=compute_type) compute_type=compute_type,
enable_fused_layernorm=enable_fused_layernorm)
layers.append(layer) layers.append(layer)
self.layers = nn.CellList(layers) self.layers = nn.CellList(layers)
...@@ -888,7 +905,8 @@ class BertModel(nn.Cell): ...@@ -888,7 +905,8 @@ class BertModel(nn.Cell):
use_relative_positions=config.use_relative_positions, use_relative_positions=config.use_relative_positions,
hidden_act=config.hidden_act, hidden_act=config.hidden_act,
compute_type=config.compute_type, compute_type=config.compute_type,
return_all_encoders=True) return_all_encoders=True,
enable_fused_layernorm=config.enable_fused_layernorm)
self.cast = P.Cast() self.cast = P.Cast()
self.dtype = config.dtype self.dtype = config.dtype
......
...@@ -17,12 +17,12 @@ ...@@ -17,12 +17,12 @@
import json import json
import numpy as np import numpy as np
from evaluation_config import cfg
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor from mindspore.common.tensor import Tensor
from CRF import postprocess
import tokenization import tokenization
from sample_process import label_generation, process_one_example_p from sample_process import label_generation, process_one_example_p
from .evaluation_config import cfg
from .CRF import postprocess
vocab_file = "./vocab.txt" vocab_file = "./vocab.txt"
tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file) tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
......
...@@ -17,16 +17,16 @@ network config setting, will be used in dataset.py, run_pretrain.py ...@@ -17,16 +17,16 @@ network config setting, will be used in dataset.py, run_pretrain.py
""" """
from easydict import EasyDict as edict from easydict import EasyDict as edict
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
from mindspore.model_zoo.Bert_NEZHA import BertConfig from .bert_model import BertConfig
cfg = edict({ cfg = edict({
'bert_network': 'base', 'bert_network': 'base',
'loss_scale_value': 2**32, 'loss_scale_value': 65536,
'scale_factor': 2, 'scale_factor': 2,
'scale_window': 1000, 'scale_window': 1000,
'optimizer': 'Lamb', 'optimizer': 'Lamb',
'AdamWeightDecayDynamicLR': edict({ 'AdamWeightDecayDynamicLR': edict({
'learning_rate': 3e-5, 'learning_rate': 3e-5,
'end_learning_rate': 1e-7, 'end_learning_rate': 1e-10,
'power': 5.0, 'power': 5.0,
'weight_decay': 1e-5, 'weight_decay': 1e-5,
'eps': 1e-6, 'eps': 1e-6,
...@@ -34,7 +34,7 @@ cfg = edict({ ...@@ -34,7 +34,7 @@ cfg = edict({
}), }),
'Lamb': edict({ 'Lamb': edict({
'start_learning_rate': 3e-5, 'start_learning_rate': 3e-5,
'end_learning_rate': 1e-7, 'end_learning_rate': 1e-10,
'power': 10.0, 'power': 10.0,
'warmup_steps': 10000, 'warmup_steps': 10000,
'weight_decay': 0.01, 'weight_decay': 0.01,
...@@ -56,7 +56,7 @@ if cfg.bert_network == 'base': ...@@ -56,7 +56,7 @@ if cfg.bert_network == 'base':
bert_net_cfg = BertConfig( bert_net_cfg = BertConfig(
batch_size=32, batch_size=32,
seq_length=128, seq_length=128,
vocab_size=21128, vocab_size=21136,
hidden_size=768, hidden_size=768,
num_hidden_layers=12, num_hidden_layers=12,
num_attention_heads=12, num_attention_heads=12,
...@@ -71,13 +71,13 @@ if cfg.bert_network == 'base': ...@@ -71,13 +71,13 @@ if cfg.bert_network == 'base':
input_mask_from_dataset=True, input_mask_from_dataset=True,
token_type_ids_from_dataset=True, token_type_ids_from_dataset=True,
dtype=mstype.float32, dtype=mstype.float32,
compute_type=mstype.float16, compute_type=mstype.float16
) )
if cfg.bert_network == 'nezha': if cfg.bert_network == 'nezha':
bert_net_cfg = BertConfig( bert_net_cfg = BertConfig(
batch_size=32, batch_size=32,
seq_length=128, seq_length=128,
vocab_size=21128, vocab_size=21136,
hidden_size=1024, hidden_size=1024,
num_hidden_layers=24, num_hidden_layers=24,
num_attention_heads=16, num_attention_heads=16,
...@@ -92,5 +92,27 @@ if cfg.bert_network == 'nezha': ...@@ -92,5 +92,27 @@ if cfg.bert_network == 'nezha':
input_mask_from_dataset=True, input_mask_from_dataset=True,
token_type_ids_from_dataset=True, token_type_ids_from_dataset=True,
dtype=mstype.float32, dtype=mstype.float32,
compute_type=mstype.float16
)
if cfg.bert_network == 'large':
bert_net_cfg = BertConfig(
batch_size=16,
seq_length=512,
vocab_size=30528,
hidden_size=1024,
num_hidden_layers=24,
num_attention_heads=16,
intermediate_size=4096,
hidden_act="gelu",
hidden_dropout_prob=0.1,
attention_probs_dropout_prob=0.1,
max_position_embeddings=512,
type_vocab_size=2,
initializer_range=0.02,
use_relative_positions=False,
input_mask_from_dataset=True,
token_type_ids_from_dataset=True,
dtype=mstype.float32,
compute_type=mstype.float16, compute_type=mstype.float16,
enable_fused_layernorm=True
) )
...@@ -20,7 +20,7 @@ import mindspore.common.dtype as mstype ...@@ -20,7 +20,7 @@ import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de import mindspore.dataset.engine.datasets as de
import mindspore.dataset.transforms.c_transforms as C import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger from mindspore import log as logger
from config import bert_net_cfg from .config import bert_net_cfg
def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", enable_data_sink="true", def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", enable_data_sink="true",
...@@ -31,8 +31,9 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", e ...@@ -31,8 +31,9 @@ def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", e
files = os.listdir(data_dir) files = os.listdir(data_dir)
data_files = [] data_files = []
for file_name in files: for file_name in files:
if "tfrecord" in file_name:
data_files.append(os.path.join(data_dir, file_name)) data_files.append(os.path.join(data_dir, file_name))
ds = de.TFRecordDataset(data_files, schema_dir, ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels", columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
"masked_lm_positions", "masked_lm_ids", "masked_lm_weights"], "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank, shuffle=(do_shuffle == "true"), num_shards=device_num, shard_id=rank,
......
...@@ -19,7 +19,7 @@ config settings, will be used in finetune.py ...@@ -19,7 +19,7 @@ config settings, will be used in finetune.py
from easydict import EasyDict as edict from easydict import EasyDict as edict
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
from mindspore.model_zoo.Bert_NEZHA import BertConfig from .bert_model import BertConfig
cfg = edict({ cfg = edict({
'task': 'NER', 'task': 'NER',
......
...@@ -19,7 +19,7 @@ config settings, will be used in finetune.py ...@@ -19,7 +19,7 @@ config settings, will be used in finetune.py
from easydict import EasyDict as edict from easydict import EasyDict as edict
import mindspore.common.dtype as mstype import mindspore.common.dtype as mstype
from mindspore.model_zoo.Bert_NEZHA import BertConfig from .bert_model import BertConfig
cfg = edict({ cfg = edict({
'task': 'NER', 'task': 'NER',
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""fused layernorm"""
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import initializer
from mindspore.ops.primitive import constexpr
import mindspore.common.dtype as mstype
from mindspore.nn.cell import Cell
import numpy as np
__all__ = ['FusedLayerNorm']
@constexpr
def get_shape_for_norm(x_shape, begin_norm_axis):
    """Return the 4-D shape used to feed a layer-norm input to BatchNorm.

    Args:
        x_shape (tuple[int]): Static shape of the tensor to normalize.
        begin_norm_axis (int): First axis of the normalized trailing
            dimensions; `x_shape[begin_norm_axis:]` is what gets normalized.

    Returns:
        tuple[int]: `(1, -1, 1, K)`, where `K` is the product of the
        normalized dimensions and `-1` is left for reshape to infer from the
        remaining elements.
    """
    # Collapse every normalized dimension into a single trailing axis.
    norm_shape = x_shape[begin_norm_axis:]
    # int(...) turns the numpy scalar into a plain Python int so the result
    # is an ordinary tuple of ints.
    return (1, -1, 1, int(np.prod(norm_shape)))
class FusedLayerNorm(Cell):
r"""
Applies Layer Normalization over a mini-batch of inputs.
Layer normalization is widely used in recurrent neural networks. It applies
normalization over a mini-batch of inputs for each single training case as described
in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
normalization, layer normalization performs exactly the same computation at training and
testing times. It can be described using the following formula. It is applied across all channels
and pixel but only one batch size.
.. math::
y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta
Args:
normalized_shape (Union(tuple[int], list[int]): The normalization is performed over axis
`begin_norm_axis ... R - 1`.
begin_norm_axis (int): It first normalization dimension: normalization will be performed along dimensions
`begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
begin_params_axis (int): The first parameter(beta, gamma)dimension: scale and centering parameters
will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
'he_uniform', etc. Default: 'ones'.
beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
'he_uniform', etc. Default: 'zeros'.
use_batch_nrom (bool): Whether use batchnorm to preocess.
Inputs:
- **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.
Outputs:
Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.
Examples:
>>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
>>> shape1 = x.shape()[1:]
>>> m = nn.LayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
>>> m(x)
"""
def __init__(self,
normalized_shape,
begin_norm_axis=-1,
begin_params_axis=-1,
gamma_init='ones',
beta_init='zeros',
use_batch_norm=False):
super(FusedLayerNorm, self).__init__()
if not isinstance(normalized_shape, (tuple, list)):
raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
.format(normalized_shape, type(normalized_shape)))
self.normalized_shape = normalized_shape
self.begin_norm_axis = begin_norm_axis
self.begin_params_axis = begin_params_axis
self.gamma = Parameter(initializer(
gamma_init, normalized_shape), name="gamma")
self.beta = Parameter(initializer(
beta_init, normalized_shape), name="beta")
self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
self.use_batch_norm = use_batch_norm
def construct(self, input_x):
if self.use_batch_norm and self.training:
ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
shape_x = F.shape(input_x)
norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
input_x = F.reshape(input_x, norm_shape)
output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
output = F.reshape(output, shape_x)
y = output * self.gamma + self.beta
else:
y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
return y
def extend_repr(self):
    """
    Display instance object as string.

    Returns:
        str, the configuration of the cell as comma separated `name=value` pairs.
    """
    # The original format string read 'gamma{}' (missing '='), which produced a malformed entry.
    s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
        self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
    return s
...@@ -30,8 +30,8 @@ from mindspore.train.parallel_utils import ParallelMode ...@@ -30,8 +30,8 @@ from mindspore.train.parallel_utils import ParallelMode
from mindspore.communication.management import get_group_size from mindspore.communication.management import get_group_size
from mindspore import context from mindspore import context
from mindspore.model_zoo.Bert_NEZHA.bert_model import BertModel from mindspore.model_zoo.Bert_NEZHA.bert_model import BertModel
from mindspore.model_zoo.Bert_NEZHA.bert_for_pre_training import clip_grad from .bert_for_pre_training import clip_grad
from CRF import CRF from .CRF import CRF
GRADIENT_CLIP_TYPE = 1 GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0 GRADIENT_CLIP_VALUE = 1.0
......
...@@ -25,7 +25,8 @@ import mindspore.dataset.transforms.c_transforms as C ...@@ -25,7 +25,8 @@ import mindspore.dataset.transforms.c_transforms as C
from mindspore import context from mindspore import context
from mindspore import log as logger from mindspore import log as logger
from mindspore.common.tensor import Tensor from mindspore.common.tensor import Tensor
from mindspore.model_zoo.Bert_NEZHA import BertConfig, BertNetworkWithLoss, BertTrainOneStepWithLossScaleCell from src.bert_model import BertConfig
from src.bert_for_pre_training import BertNetworkWithLoss, BertTrainOneStepWithLossScaleCell
from mindspore.nn.optim import Lamb from mindspore.nn.optim import Lamb
from mindspore.train.callback import Callback from mindspore.train.callback import Callback
from mindspore.train.loss_scale_manager import DynamicLossScaleManager from mindspore.train.loss_scale_manager import DynamicLossScaleManager
...@@ -77,7 +78,8 @@ def get_config(version='base', batch_size=1): ...@@ -77,7 +78,8 @@ def get_config(version='base', batch_size=1):
input_mask_from_dataset=True, input_mask_from_dataset=True,
token_type_ids_from_dataset=True, token_type_ids_from_dataset=True,
dtype=mstype.float32, dtype=mstype.float32,
compute_type=mstype.float16) compute_type=mstype.float16,
enable_fused_layernorm=False)
else: else:
bert_config = BertConfig(batch_size=batch_size) bert_config = BertConfig(batch_size=batch_size)
return bert_config return bert_config
......
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''
CRF script.
'''
import numpy as np
import mindspore.nn as nn
from mindspore.ops import operations as P
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter
import mindspore.common.dtype as mstype
class CRF(nn.Cell):
    '''
    Conditional Random Field

    Args:
        tag_to_index: The dict for tag to index mapping with extra "<START>" and "<STOP>" sign.
        batch_size: Batch size, i.e., the length of the first dimension.
        seq_length: Sequence length, i.e., the length of the second dimension.
        is_training: Specifies whether to use training mode.

    Returns:
        Training mode: Tensor, total loss.
        Evaluation mode: Tuple, the index for each step with the highest score; Tuple, the index for the last
        step with the highest score.
    '''
    def __init__(self, tag_to_index, batch_size=1, seq_length=128, is_training=True):

        super(CRF, self).__init__()
        self.target_size = len(tag_to_index)
        self.is_training = is_training
        self.tag_to_index = tag_to_index
        self.batch_size = batch_size
        self.seq_length = seq_length
        self.START_TAG = "<START>"
        self.STOP_TAG = "<STOP>"
        # NOTE(review): START/STOP are hard-coded to the last two indices here, while the transition
        # masking below looks them up through `tag_to_index` -- presumably callers always append
        # "<START>" and "<STOP>" last; confirm.
        self.START_VALUE = Tensor(self.target_size-2, dtype=mstype.int32)
        self.STOP_VALUE = Tensor(self.target_size-1, dtype=mstype.int32)
        transitions = np.random.normal(size=(self.target_size, self.target_size)).astype(np.float32)
        # Large negative scores on the START row and the STOP column effectively forbid those transitions.
        transitions[tag_to_index[self.START_TAG], :] = -10000
        transitions[:, tag_to_index[self.STOP_TAG]] = -10000
        self.transitions = Parameter(Tensor(transitions), name="transition_matrix")
        self.cat = P.Concat(axis=-1)
        self.argmax = P.ArgMaxWithValue(axis=-1)
        self.log = P.Log()
        self.exp = P.Exp()
        self.sum = P.ReduceSum()
        self.tile = P.Tile()
        self.reduce_sum = P.ReduceSum(keep_dims=True)
        self.reshape = P.Reshape()
        self.expand = P.ExpandDims()
        self.mean = P.ReduceMean()
        # Initial forward scores: every tag at -10000 except START at 0.
        init_alphas = np.ones(shape=(self.batch_size, self.target_size)) * -10000.0
        init_alphas[:, self.tag_to_index[self.START_TAG]] = 0.
        self.init_alphas = Tensor(init_alphas, dtype=mstype.float32)
        self.cast = P.Cast()
        self.reduce_max = P.ReduceMax(keep_dims=True)
        self.on_value = Tensor(1.0, dtype=mstype.float32)
        self.off_value = Tensor(0.0, dtype=mstype.float32)
        self.onehot = P.OneHot()

    def log_sum_exp(self, logits):
        '''
        Compute the log_sum_exp score for normalization factor.

        The maximum over the last axis is subtracted before `exp` and added back afterwards.
        '''
        max_score = self.reduce_max(logits, -1)  # author's shape note: 16 5 5
        score = self.log(self.reduce_sum(self.exp(logits - max_score), -1))
        score = max_score + score
        return score

    def _realpath_score(self, features, label):
        '''
        Compute the emission and transition score for the real path.

        Returns the score reshaped to (batch_size, -1).
        '''
        label = label * 1
        # Prepend the START index to every label sequence.
        concat_A = self.tile(self.reshape(self.START_VALUE, (1,)), (self.batch_size,))
        concat_A = self.reshape(concat_A, (self.batch_size, 1))
        labels = self.cat((concat_A, label))
        # Emission part: keep only the feature entries selected by the one-hot labels.
        onehot_label = self.onehot(label, self.target_size, self.on_value, self.off_value)
        emits = features * onehot_label
        labels = self.onehot(labels, self.target_size, self.on_value, self.off_value)
        # label1 is the sequence shifted by one step relative to label2; their product over the two
        # expanded axes marks, per step, which transition-matrix entry was used.
        label1 = labels[:, 1:, :]
        label2 = labels[:, :self.seq_length, :]
        label1 = self.expand(label1, 3)
        label2 = self.expand(label2, 2)
        label_trans = label1 * label2
        transitions = self.expand(self.expand(self.transitions, 0), 0)
        trans = transitions * label_trans
        score = self.sum(emits, (1, 2)) + self.sum(trans, (1, 2, 3))
        # Add the score taken from the last row of the transition matrix (the STOP row, given the
        # index assumption noted in __init__).
        stop_value_index = labels[:, (self.seq_length-1):self.seq_length, :]
        stop_value = self.transitions[(self.target_size-1):self.target_size, :]
        stop_score = stop_value * self.reshape(stop_value_index, (self.batch_size, self.target_size))
        score = score + self.sum(stop_score, 1)
        score = self.reshape(score, (self.batch_size, -1))
        return score

    def _normalization_factor(self, features):
        '''
        Compute the total score for all the paths.

        Returns the log-sum-exp score reshaped to (batch_size, -1).
        '''
        forward_var = self.init_alphas
        forward_var = self.expand(forward_var, 1)
        # The loop is a Python-level range over `seq_length`, one forward step per position.
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            emit_score = self.reshape(feat, (self.batch_size, self.target_size, 1))
            next_tag_var = emit_score + self.transitions + forward_var
            forward_var = self.log_sum_exp(next_tag_var)
            forward_var = self.reshape(forward_var, (self.batch_size, 1, self.target_size))
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        alpha = self.log_sum_exp(terminal_var)
        alpha = self.reshape(alpha, (self.batch_size, -1))
        return alpha

    def _decoder(self, features):
        '''
        Viterbi decode for evaluation.

        Returns:
            Tuple, one one-element tuple of argmax indices per step; Tensor, the argmax of the final scores.
        '''
        backpointers = ()
        forward_var = self.init_alphas
        for idx in range(self.seq_length):
            feat = features[:, idx:(idx+1), :]
            feat = self.reshape(feat, (self.batch_size, self.target_size))
            bptrs_t = ()

            next_tag_var = self.expand(forward_var, 1) + self.transitions
            best_tag_id, best_tag_value = self.argmax(next_tag_var)
            bptrs_t += (best_tag_id,)
            forward_var = best_tag_value + feat

            backpointers += (bptrs_t,)
        terminal_var = forward_var + self.reshape(self.transitions[(self.target_size-1):self.target_size, :], (1, -1))
        best_tag_id, _ = self.argmax(terminal_var)
        return backpointers, best_tag_id

    def construct(self, features, label):
        # Training returns the mean of (all-path score - real-path score); evaluation returns the
        # raw decoder output, which `postprocess` turns into tag paths.
        if self.is_training:
            forward_score = self._normalization_factor(features)
            gold_score = self._realpath_score(features, label)
            return_value = self.mean(forward_score - gold_score)
        else:
            path_list, tag = self._decoder(features)
            return_value = path_list, tag
        return return_value
def postprocess(backpointers, best_tag_id):
    '''
    Recover the best tag path of every sample from the decoder back-pointers.

    Args:
        backpointers: Sequence with one entry per time step. The first element of each entry must
            expose `asnumpy()` and is indexed as [sample][tag] to obtain the previous best tag.
        best_tag_id: Object exposing `asnumpy()` that holds the best final tag of every sample.

    Returns:
        list[list], one path of tag indices per sample, in forward time order.
    '''
    best_tag_id = best_tag_id.asnumpy()
    # Convert each step to numpy once, already in reverse time order. The original repeated this
    # conversion for every sample in the batch.
    reversed_steps = [step[0].asnumpy() for step in reversed(backpointers)]
    best_path = []
    for i in range(len(best_tag_id)):
        best_local_id = best_tag_id[i]
        path = [best_local_id]
        for step in reversed_steps:
            best_local_id = step[i][best_local_id]
            path.append(best_local_id)
        # Pop off the start tag (we dont want to return that to the caller)
        path.pop()
        path.reverse()
        best_path.append(path)
    return best_path
...@@ -12,41 +12,20 @@ ...@@ -12,41 +12,20 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# ============================================================================ # ============================================================================
""" test bert cell """ """Bert Init."""
import numpy as np from .bert_for_pre_training import BertNetworkWithLoss, BertPreTraining, \
import pytest BertPretrainingLoss, GetMaskedLMOutput, GetNextSentenceOutput, \
BertTrainOneStepCell, BertTrainOneStepWithLossScaleCell
from mindspore.model_zoo.Bert_NEZHA import BertConfig, BertModel from .bert_model import BertAttention, BertConfig, BertEncoderCell, BertModel, \
from ....dataset_mock import MindData BertOutput, BertSelfAttention, BertTransformer, EmbeddingLookup, \
EmbeddingPostprocessor, RelaPosEmbeddingsGenerator, RelaPosMatrixGenerator, \
SaturateCast, CreateAttentionMaskFromInputMask
def map_bert(record):
target_data = {'input_ids': None, 'input_mask': None, __all__ = [
'segment_ids': None, 'next_sentence_labels': None, "BertNetworkWithLoss", "BertPreTraining", "BertPretrainingLoss",
'masked_lm_positions': None, 'masked_lm_ids': None, "GetMaskedLMOutput", "GetNextSentenceOutput", "BertTrainOneStepCell", "BertTrainOneStepWithLossScaleCell",
'masked_lm_weights': None} "BertAttention", "BertConfig", "BertEncoderCell", "BertModel", "BertOutput",
"BertSelfAttention", "BertTransformer", "EmbeddingLookup",
sample = dt.parse_single_example(record, target_data) "EmbeddingPostprocessor", "RelaPosEmbeddingsGenerator",
"RelaPosMatrixGenerator", "SaturateCast", "CreateAttentionMaskFromInputMask"
return sample['input_ids'], sample['input_mask'], sample['segment_ids'], \ ]
sample['next_sentence_labels'], sample['masked_lm_positions'], \
sample['masked_lm_ids'], sample['masked_lm_weights']
def test_bert_model():
# test for config.hidden_size % config.num_attention_heads != 0
config_error = BertConfig(32, hidden_size=512, num_attention_heads=10)
with pytest.raises(ValueError):
BertModel(config_error, True)
def get_dataset(batch_size=1):
dataset_types = (np.int32, np.int32, np.int32, np.int32, np.int32, np.int32, np.int32)
dataset_shapes = ((batch_size, 128), (batch_size, 128), (batch_size, 128), (batch_size, 1),
(batch_size, 20), (batch_size, 20), (batch_size, 20))
dataset = MindData(size=2, batch_size=batch_size,
np_types=dataset_types,
output_shapes=dataset_shapes,
input_indexs=(0, 1))
return dataset
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Bert for pretraining."""
import numpy as np
import mindspore.nn as nn
from mindspore.common.initializer import initializer, TruncatedNormal
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.train.parallel_utils import ParallelMode
from mindspore.communication.management import get_group_size
from mindspore import context
from .bert_model import BertModel
# Clipping mode and threshold used by the training cells in this file (1 selects clip-by-norm).
GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0

_nn_clip_by_norm = nn.ClipByNorm()
clip_grad = C.MultitypeFuncGraph("clip_grad")
@clip_grad.register("Number", "Number", "Tensor")
def _clip_grad(clip_type, clip_value, grad):
    """
    Clip a single gradient tensor.

    Registered for one Tensor; the training cells map it over the gradient tuple with a HyperMap.

    Inputs:
        clip_type (int): The way to clip, 0 for 'value', 1 for 'norm'.
        clip_value (float): Specifies how much to clip.
        grad (Tensor): Gradient.

    Outputs:
        Tensor, the clipped gradient; returned unchanged when `clip_type` is neither 0 nor 1.
    """
    if clip_type != 0 and clip_type != 1:
        return grad
    dt = F.dtype(grad)
    if clip_type == 0:
        # Clip element-wise into [-clip_value, clip_value], with bounds cast to the gradient dtype.
        new_grad = C.clip_by_value(grad, F.cast(F.tuple_to_array((-clip_value,)), dt),
                                   F.cast(F.tuple_to_array((clip_value,)), dt))
    else:
        new_grad = _nn_clip_by_norm(grad, F.cast(F.tuple_to_array((clip_value,)), dt))
    return new_grad
class GetMaskedLMOutput(nn.Cell):
    """
    Get masked lm output.

    Args:
        config (BertConfig): The config of BertModel.

    Returns:
        Tensor, masked lm output.
    """
    def __init__(self, config):
        super(GetMaskedLMOutput, self).__init__()
        self.width = config.hidden_size
        self.reshape = P.Reshape()
        self.gather = P.GatherV2()

        weight_init = TruncatedNormal(config.initializer_range)
        self.dense = nn.Dense(self.width,
                              config.hidden_size,
                              weight_init=weight_init,
                              activation=config.hidden_act).to_float(config.compute_type)
        self.layernorm = nn.LayerNorm((config.hidden_size,)).to_float(config.compute_type)
        self.output_bias = Parameter(
            initializer(
                'zero',
                config.vocab_size),
            name='output_bias')
        self.matmul = P.MatMul(transpose_b=True)
        self.log_softmax = nn.LogSoftmax(axis=-1)
        self.shape_flat_offsets = (-1, 1)
        # One index per sample; multiplied by the sequence length in `construct` to get row offsets.
        self.rng = Tensor(np.array(range(0, config.batch_size)).astype(np.int32))
        self.last_idx = (-1,)
        self.shape_flat_sequence_tensor = (config.batch_size * config.seq_length, self.width)
        self.seq_length_tensor = Tensor(np.array((config.seq_length,)).astype(np.int32))
        self.cast = P.Cast()
        self.compute_type = config.compute_type
        self.dtype = config.dtype

    def construct(self,
                  input_tensor,
                  output_weights,
                  positions):
        """Gather the hidden states at `positions` and return log-probabilities computed against `output_weights`."""
        # Turn per-sample positions into row indices of the flattened (batch * seq_length, width) tensor.
        flat_offsets = self.reshape(
            self.rng * self.seq_length_tensor, self.shape_flat_offsets)
        flat_position = self.reshape(positions + flat_offsets, self.last_idx)
        flat_sequence_tensor = self.reshape(input_tensor, self.shape_flat_sequence_tensor)
        input_tensor = self.gather(flat_sequence_tensor, flat_position, 0)
        input_tensor = self.cast(input_tensor, self.compute_type)
        output_weights = self.cast(output_weights, self.compute_type)
        input_tensor = self.dense(input_tensor)
        input_tensor = self.layernorm(input_tensor)
        # MatMul was built with transpose_b=True, so `output_weights` is used transposed.
        logits = self.matmul(input_tensor, output_weights)
        # Cast back to `config.dtype` before adding the bias and taking the log-softmax.
        logits = self.cast(logits, self.dtype)
        logits = logits + self.output_bias
        log_probs = self.log_softmax(logits)
        return log_probs
class GetNextSentenceOutput(nn.Cell):
    """
    Get next sentence output.

    Args:
        config (BertConfig): The config of Bert.

    Returns:
        Tensor, next sentence output.
    """
    def __init__(self, config):
        super(GetNextSentenceOutput, self).__init__()
        self.log_softmax = P.LogSoftmax()
        self.weight_init = TruncatedNormal(config.initializer_range)
        # Two output units: the loss cell in this file one-hot encodes next-sentence labels with depth 2.
        self.dense = nn.Dense(config.hidden_size, 2,
                              weight_init=self.weight_init, has_bias=True).to_float(config.compute_type)
        self.dtype = config.dtype
        self.cast = P.Cast()

    def construct(self, input_tensor):
        """Project `input_tensor` to two logits and return their log-softmax in `config.dtype`."""
        logits = self.dense(input_tensor)
        logits = self.cast(logits, self.dtype)
        log_prob = self.log_softmax(logits)
        return log_prob
class BertPreTraining(nn.Cell):
    """
    Bert pretraining network.

    Args:
        config (BertConfig): The config of BertModel.
        is_training (bool): Specifies whether to use the training mode.
        use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings.

    Returns:
        Tensor, prediction_scores, seq_relationship_score.
    """
    def __init__(self, config, is_training, use_one_hot_embeddings):
        super(BertPreTraining, self).__init__()
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.cls1 = GetMaskedLMOutput(config)
        self.cls2 = GetNextSentenceOutput(config)

    def construct(self, input_ids, input_mask, token_type_id,
                  masked_lm_positions):
        """Run the backbone, then the masked-LM head and the next-sentence head."""
        # Note the argument order expected by the backbone: (input_ids, token_type_id, input_mask).
        sequence_output, pooled_output, embedding_table = \
            self.bert(input_ids, token_type_id, input_mask)
        # The embedding table returned by the backbone is passed to the masked-LM head as its output weights.
        prediction_scores = self.cls1(sequence_output,
                                      embedding_table,
                                      masked_lm_positions)
        seq_relationship_score = self.cls2(pooled_output)
        return prediction_scores, seq_relationship_score
class BertPretrainingLoss(nn.Cell):
    """
    Provide bert pre-training loss.

    Args:
        config (BertConfig): The config of BertModel.

    Returns:
        Tensor, total loss.
    """
    def __init__(self, config):
        super(BertPretrainingLoss, self).__init__()
        self.vocab_size = config.vocab_size
        self.onehot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.reduce_sum = P.ReduceSum()
        self.reduce_mean = P.ReduceMean()
        self.reshape = P.Reshape()
        self.last_idx = (-1,)
        self.neg = P.Neg()
        self.cast = P.Cast()

    def construct(self, prediction_scores, seq_relationship_score, masked_lm_ids,
                  masked_lm_weights, next_sentence_labels):
        """Return the sum of the weighted masked-LM loss and the mean next-sentence loss."""
        label_ids = self.reshape(masked_lm_ids, self.last_idx)
        label_weights = self.cast(self.reshape(masked_lm_weights, self.last_idx), mstype.float32)

        # Select the score of the labelled token for every masked position.
        one_hot_labels = self.onehot(label_ids, self.vocab_size, self.on_value, self.off_value)

        per_example_loss = self.neg(self.reduce_sum(prediction_scores * one_hot_labels, self.last_idx))
        numerator = self.reduce_sum(label_weights * per_example_loss, ())
        # The 1e-5 term keeps the division defined when all weights are zero.
        denominator = self.reduce_sum(label_weights, ()) + self.cast(F.tuple_to_array((1e-5,)), mstype.float32)
        masked_lm_loss = numerator / denominator

        # next_sentence_loss
        labels = self.reshape(next_sentence_labels, self.last_idx)
        one_hot_labels = self.onehot(labels, 2, self.on_value, self.off_value)
        per_example_loss = self.neg(self.reduce_sum(
            one_hot_labels * seq_relationship_score, self.last_idx))
        next_sentence_loss = self.reduce_mean(per_example_loss, self.last_idx)

        # total_loss
        total_loss = masked_lm_loss + next_sentence_loss

        return total_loss
class BertNetworkWithLoss(nn.Cell):
    """
    Provide bert pre-training loss through network.

    Args:
        config (BertConfig): The config of BertModel.
        is_training (bool): Specifies whether to use the training mode.
        use_one_hot_embeddings (bool): Specifies whether to use one-hot for embeddings. Default: False.

    Returns:
        Tensor, the loss of the network.
    """
    def __init__(self, config, is_training, use_one_hot_embeddings=False):
        super(BertNetworkWithLoss, self).__init__()
        self.bert = BertPreTraining(config, is_training, use_one_hot_embeddings)
        self.loss = BertPretrainingLoss(config)
        self.cast = P.Cast()

    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  next_sentence_labels,
                  masked_lm_positions,
                  masked_lm_ids,
                  masked_lm_weights):
        """Run the pre-training network and return its total loss cast to float32."""
        prediction_scores, seq_relationship_score = \
            self.bert(input_ids, input_mask, token_type_id, masked_lm_positions)
        total_loss = self.loss(prediction_scores, seq_relationship_score,
                               masked_lm_ids, masked_lm_weights, next_sentence_labels)
        return self.cast(total_loss, mstype.float32)
class BertTrainOneStepCell(nn.Cell):
    """
    Encapsulation class of bert network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        sens (Number): The adjust parameter. Default: 1.0.
    """
    def __init__(self, network, optimizer, sens=1.0):
        super(BertTrainOneStepCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad', get_by_list=True, sens_param=True)
        self.sens = sens
        # A gradient reducer is only created for data-parallel / hybrid-parallel runs.
        self.reducer_flag = False
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("mirror_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)

        self.cast = P.Cast()
        self.hyper_map = C.HyperMap()

    def set_sens(self, value):
        """Replace the sensitivity value fed to the gradient operation."""
        self.sens = value

    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  next_sentence_labels,
                  masked_lm_positions,
                  masked_lm_ids,
                  masked_lm_weights):
        """Compute the loss, clip (and optionally reduce) the gradients, and apply the optimizer."""
        weights = self.weights

        loss = self.network(input_ids,
                            input_mask,
                            token_type_id,
                            next_sentence_labels,
                            masked_lm_positions,
                            masked_lm_ids,
                            masked_lm_weights)
        # `self.sens` is passed as the extra sensitivity input (sens_param=True above).
        grads = self.grad(self.network, weights)(input_ids,
                                                 input_mask,
                                                 token_type_id,
                                                 next_sentence_labels,
                                                 masked_lm_positions,
                                                 masked_lm_ids,
                                                 masked_lm_weights,
                                                 self.cast(F.tuple_to_array((self.sens,)),
                                                           mstype.float32))
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        if self.reducer_flag:
            # apply grad reducer on grads
            grads = self.grad_reducer(grads)
        succ = self.optimizer(grads)
        # Tie the returned loss to the optimizer call through `F.depend`.
        return F.depend(loss, succ)
grad_scale = C.MultitypeFuncGraph("grad_scale")
reciprocal = P.Reciprocal()


@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    """Divide `grad` by `scale`, implemented as a multiplication by the reciprocal of `scale`."""
    return grad * reciprocal(scale)
class BertTrainOneStepWithLossScaleCell(nn.Cell):
    """
    Encapsulation class of bert network training.

    Append an optimizer to the training network after that the construct
    function can be called to create the backward graph.

    Args:
        network (Cell): The training network. Note that loss function should have been added.
        optimizer (Optimizer): Optimizer for updating the weights.
        scale_update_cell (Cell): Cell to do the loss scale. Default: None.
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertTrainOneStepWithLossScaleCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad',
                                    get_by_list=True,
                                    sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        # Identity reducer and degree 1 unless running data-parallel / hybrid-parallel.
        self.grad_reducer = F.identity
        self.degree = 1
        if self.reducer_flag:
            self.degree = get_group_size()
            # The reducer is created with mean=False; `construct` divides by `degree` itself.
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, False, self.degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        # `loss_scale` stays None when no update cell is given; `construct` then needs an explicit `sens`.
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
        self.add_flags(has_effect=True)

    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  next_sentence_labels,
                  masked_lm_positions,
                  masked_lm_ids,
                  masked_lm_weights,
                  sens=None):
        """
        Run one training step with loss scaling and overflow detection.

        Returns:
            Tuple of (loss, overflow condition, scaling value used for this step).
        """
        weights = self.weights
        loss = self.network(input_ids,
                            input_mask,
                            token_type_id,
                            next_sentence_labels,
                            masked_lm_positions,
                            masked_lm_ids,
                            masked_lm_weights)
        # An explicit `sens` overrides the managed loss scale.
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        # alloc status and clear should be right before gradoperation
        init = self.alloc_status()
        self.clear_before_grad(init)
        grads = self.grad(self.network, weights)(input_ids,
                                                 input_mask,
                                                 token_type_id,
                                                 next_sentence_labels,
                                                 masked_lm_positions,
                                                 masked_lm_ids,
                                                 masked_lm_weights,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))
        # apply grad reducer on grads
        grads = self.grad_reducer(grads)
        # Undo the loss scale and divide by the group size in one step, then clip.
        grads = self.hyper_map(F.partial(grad_scale, scaling_sens * self.degree), grads)
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        self.get_status(init)
        flag_sum = self.reduce_sum(init, (0,))
        if self.is_distributed:
            # sum overflow flag over devices
            flag_reduce = self.allreduce(flag_sum)
            cond = self.less_equal(self.base, flag_reduce)
        else:
            cond = self.less_equal(self.base, flag_sum)
        overflow = cond
        if sens is None:
            # Let the scale-update cell see the overflow flag; its result decides whether to skip the step.
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        if overflow:
            # Skip the optimizer update for this step.
            succ = False
        else:
            succ = self.optimizer(grads)
        ret = (loss, cond, scaling_sens)
        return F.depend(ret, succ)
此差异已折叠。
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''bert clue evaluation'''
import json
import numpy as np
import mindspore.common.dtype as mstype
from mindspore.common.tensor import Tensor
import tokenization
from sample_process import label_generation, process_one_example_p
from .evaluation_config import cfg
from .CRF import postprocess
# Vocabulary path is relative to the current working directory.
vocab_file = "./vocab.txt"
# Shared tokenizer, built when this module is imported (presumably reads `vocab_file` then -- confirm).
tokenizer_ = tokenization.FullTokenizer(vocab_file=vocab_file)
def process(model, text, sequence_length):
    """
    Predict the label ids of one text and convert them into the result of `label_generation`.

    Args:
        model: Object whose `predict` is called with the three input tensors and a dummy `Tensor(1)`.
        text: The text to label.
        sequence_length: Passed to `process_one_example_p` as `max_seq_len`.

    Returns:
        The value returned by `label_generation(text, ids)`.
    """
    # The original looped over a one-element list and kept an unused `features` list; both are dropped.
    input_ids, input_mask, token_type_id = process_one_example_p(tokenizer_, text, max_seq_len=sequence_length)
    input_ids = Tensor(np.array(input_ids), mstype.int32)
    input_mask = Tensor(np.array(input_mask), mstype.int32)
    token_type_id = Tensor(np.array(token_type_id), mstype.int32)
    if cfg.use_crf:
        # With CRF the model returns back-pointers, which are decoded into one path per sample and
        # then flattened into a single id list.
        backpointers, best_tag_id = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
        best_path = postprocess(backpointers, best_tag_id)
        ids = []
        for ele in best_path:
            ids.extend(ele)
    else:
        logits = model.predict(input_ids, input_mask, token_type_id, Tensor(1))
        ids = list(np.argmax(logits.asnumpy(), axis=-1))
    return label_generation(text, ids)
def submit(model, path, sequence_length):
    """
    Label every JSON line of `path` and write the predictions to "ner_predict.json".

    Args:
        model: Model object forwarded to `process`.
        path: File with one JSON object per line; each object must have a "text" key. Blank lines are skipped.
        sequence_length: Forwarded to `process`.
    """
    data = []
    # Context managers close both files even if prediction raises; the original leaked both handles.
    with open(path) as source:
        for line in source:
            if not line.strip():
                continue
            oneline = json.loads(line.strip())
            res = process(model, oneline["text"], sequence_length)
            print("text", oneline["text"])
            print("res:", res)
            data.append(json.dumps({"label": res}, ensure_ascii=False))
    with open("ner_predict.json", "w") as target:
        target.write("\n".join(data))
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
network config setting, will be used in dataset.py, run_pretrain.py
"""
from easydict import EasyDict as edict
import mindspore.common.dtype as mstype
from .bert_model import BertConfig
# Training options. `bert_network` selects which BertConfig below is built, and `optimizer`
# presumably names which of the per-optimizer sub-dicts is used by the training script -- confirm.
cfg = edict({
    'bert_network': 'base',
    'loss_scale_value': 65536,
    'scale_factor': 2,
    'scale_window': 1000,
    'optimizer': 'Lamb',
    'AdamWeightDecayDynamicLR': edict({
        'learning_rate': 3e-5,
        'end_learning_rate': 1e-10,
        'power': 5.0,
        'weight_decay': 1e-5,
        'eps': 1e-6,
        'warmup_steps': 10000,
    }),
    'Lamb': edict({
        'start_learning_rate': 3e-5,
        'end_learning_rate': 1e-10,
        'power': 10.0,
        'warmup_steps': 10000,
        'weight_decay': 0.01,
        'eps': 1e-6,
    }),
    'Momentum': edict({
        'learning_rate': 2e-5,
        'momentum': 0.9,
    }),
})

# NOTE(review): the description below lists only 'base' and 'large' and calls 'large' BERT-NEZHA,
# but the code below accepts three values: 'base', 'nezha' (the one with relative positions
# enabled) and 'large'. Any other value leaves `bert_net_cfg` undefined.
'''
Including two kinds of network: \
base: Goole BERT-base(the base version of BERT model).
large: BERT-NEZHA(a Chinese pretrained language model developed by Huawei, which introduced a improvement of \
Functional Relative Posetional Encoding as an effective positional encoding scheme).
'''
if cfg.bert_network == 'base':
    bert_net_cfg = BertConfig(
        batch_size=32,
        seq_length=128,
        vocab_size=21136,
        hidden_size=768,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        use_relative_positions=False,
        input_mask_from_dataset=True,
        token_type_ids_from_dataset=True,
        dtype=mstype.float32,
        compute_type=mstype.float16
    )
if cfg.bert_network == 'nezha':
    bert_net_cfg = BertConfig(
        batch_size=32,
        seq_length=128,
        vocab_size=21136,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        intermediate_size=4096,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        use_relative_positions=True,
        input_mask_from_dataset=True,
        token_type_ids_from_dataset=True,
        dtype=mstype.float32,
        compute_type=mstype.float16
    )
# Only this variant passes `enable_fused_layernorm` explicitly.
if cfg.bert_network == 'large':
    bert_net_cfg = BertConfig(
        batch_size=16,
        seq_length=512,
        vocab_size=30528,
        hidden_size=1024,
        num_hidden_layers=24,
        num_attention_heads=16,
        intermediate_size=4096,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        use_relative_positions=False,
        input_mask_from_dataset=True,
        token_type_ids_from_dataset=True,
        dtype=mstype.float32,
        compute_type=mstype.float16,
        enable_fused_layernorm=True
    )
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Data operations, will be used in run_pretrain.py
"""
import os
import mindspore.common.dtype as mstype
import mindspore.dataset.engine.datasets as de
import mindspore.dataset.transforms.c_transforms as C
from mindspore import log as logger
from .config import bert_net_cfg
def create_bert_dataset(epoch_size=1, device_num=1, rank=0, do_shuffle="true", enable_data_sink="true",
                        data_sink_steps=1, data_dir=None, schema_dir=None):
    """Build the pre-training dataset from the TFRecord files found in ``data_dir``.

    Args:
        epoch_size (int): Requested number of epochs; used as the base repeat count.
        device_num (int): Number of shards the dataset is split into.
        rank (int): Shard id read by this process.
        do_shuffle (str): Shuffling is enabled only when this equals ``"true"``.
        enable_data_sink (str): When ``"true"``, the dataset size is overridden with
            ``data_sink_steps * bert_net_cfg.batch_size``.
        data_sink_steps (int): Steps per sink; only used when data sink is enabled.
        data_dir (str): Directory scanned for files whose name contains ``"tfrecord"``.
        schema_dir (str): Schema path; an empty string is passed on as ``None``.

    Returns:
        tuple: ``(dataset, new_repeat_count)``.
    """
    tfrecord_files = [os.path.join(data_dir, entry)
                      for entry in os.listdir(data_dir)
                      if "tfrecord" in entry]
    schema = None if schema_dir == "" else schema_dir
    column_names = ["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
                    "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"]
    ds = de.TFRecordDataset(tfrecord_files, schema,
                            columns_list=column_names,
                            shuffle=(do_shuffle == "true"),
                            num_shards=device_num,
                            shard_id=rank,
                            shard_equal_rows=True)

    # The repeat count is rescaled so the total amount of data stays the same
    # after the dataset size has (possibly) been overridden for data sinking.
    ori_dataset_size = ds.get_dataset_size()
    if enable_data_sink == "true":
        target_size = data_sink_steps * bert_net_cfg.batch_size
    else:
        target_size = ori_dataset_size
    ds.set_dataset_size(target_size)
    new_repeat_count = int(epoch_size * ori_dataset_size // ds.get_dataset_size())

    # Cast the integer columns to int32, in the same order as before.
    to_int32 = C.TypeCast(mstype.int32)
    for column in ("masked_lm_ids", "masked_lm_positions", "next_sentence_labels",
                   "segment_ids", "input_mask", "input_ids"):
        ds = ds.map(input_columns=column, operations=to_int32)

    # Batch first, then repeat.
    ds = ds.batch(bert_net_cfg.batch_size, drop_remainder=True)
    ds = ds.repeat(new_repeat_count)
    logger.info("data size: {}".format(ds.get_dataset_size()))
    logger.info("repeatcount: {}".format(ds.get_repeat_count()))
    return ds, new_repeat_count
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
config settings, will be used in the BERT evaluation script
"""
from easydict import EasyDict as edict
import mindspore.common.dtype as mstype
from .bert_model import BertConfig
# Task-level settings for evaluation. The '/your/path/...' values are
# placeholders that must be replaced with real paths before running.
cfg = edict({
    'task': 'NER',
    'num_labels': 41,
    'data_file': '/your/path/evaluation.tfrecord',
    'schema_file': '/your/path/schema.json',
    'finetune_ckpt': '/your/path/your.ckpt',
    'use_crf': False,
    # Also controls the batch size chosen in bert_net_cfg below.
    'clue_benchmark': False,
})
# Network hyper-parameters for evaluation.
bert_net_cfg = BertConfig(
    # Batch size is 1 when cfg.clue_benchmark is set, otherwise 16.
    batch_size=16 if not cfg.clue_benchmark else 1,
    seq_length=128,
    vocab_size=21128,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    # Both dropout probabilities are zero in this evaluation config.
    hidden_dropout_prob=0.0,
    attention_probs_dropout_prob=0.0,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    input_mask_from_dataset=True,
    token_type_ids_from_dataset=True,
    dtype=mstype.float32,
    compute_type=mstype.float16,
)
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
config settings, will be used in finetune.py
"""
from easydict import EasyDict as edict
import mindspore.common.dtype as mstype
from .bert_model import BertConfig
# Task-level settings for fine-tuning. The '/your/path/...' values are
# placeholders that must be replaced with real paths before running.
cfg = edict({
    'task': 'NER',
    'num_labels': 41,
    'data_file': '/your/path/train.tfrecord',
    'schema_file': '/your/path/schema.json',
    'epoch_num': 5,
    'ckpt_prefix': 'bert',
    'ckpt_dir': None,
    'pre_training_ckpt': '/your/path/pre_training.ckpt',
    'use_crf': False,
    # NOTE(review): presumably names which of the optimizer sections below is used - confirm in the finetune script.
    'optimizer': 'Lamb',
    'AdamWeightDecayDynamicLR': edict({
        'learning_rate': 2e-5,
        'end_learning_rate': 1e-7,
        'power': 1.0,
        'weight_decay': 1e-5,
        'eps': 1e-6,
    }),
    'Lamb': edict({
        'start_learning_rate': 2e-5,
        'end_learning_rate': 1e-7,
        'power': 1.0,
        # This filter returns False for every argument.
        'decay_filter': lambda x: False,
    }),
    'Momentum': edict({
        'learning_rate': 2e-5,
        'momentum': 0.9,
    }),
})
# Network hyper-parameters for fine-tuning.
bert_net_cfg = BertConfig(
    batch_size=16,
    seq_length=128,
    vocab_size=21128,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    hidden_act="gelu",
    hidden_dropout_prob=0.1,
    attention_probs_dropout_prob=0.1,
    max_position_embeddings=512,
    type_vocab_size=2,
    initializer_range=0.02,
    use_relative_positions=False,
    input_mask_from_dataset=True,
    token_type_ids_from_dataset=True,
    dtype=mstype.float32,
    compute_type=mstype.float16,
)
# Tag -> index table. "O" is 0; every entity type then contributes four
# consecutive indices in the order S_, B_, M_, E_; "<START>" and "<STOP>"
# take the last two indices (41 and 42).
tag_to_index = {
    "O": 0,
    **{
        "{}_{}".format(prefix, entity): 4 * entity_pos + prefix_pos + 1
        for entity_pos, entity in enumerate((
            "address", "book", "company", "game", "government",
            "movie", "name", "organization", "position", "scene",
        ))
        for prefix_pos, prefix in enumerate("SBME")
    },
    "<START>": 41,
    "<STOP>": 42,
}
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""fused layernorm"""
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.common.parameter import Parameter
from mindspore.common.initializer import initializer
from mindspore.ops.primitive import constexpr
import mindspore.common.dtype as mstype
from mindspore.nn.cell import Cell
import numpy as np
__all__ = ['FusedLayerNorm']
@constexpr
def get_shape_for_norm(x_shape, begin_norm_axis):
    """Return the 4-element shape ``(1, -1, 1, K)`` used to reshape the input for BatchNorm.

    ``K`` is the product of ``x_shape[begin_norm_axis:]``. Both the incoming
    and the computed shape are printed.
    """
    print("input_shape: ", x_shape)
    normalized_size = int(np.prod(x_shape[begin_norm_axis:]))
    output_shape = (1, -1, 1, normalized_size)
    print("output_shape: ", output_shape)
    return output_shape
class FusedLayerNorm(Cell):
    r"""
    Applies Layer Normalization over a mini-batch of inputs.

    Layer normalization is widely used in recurrent neural networks. It applies
    normalization over a mini-batch of inputs for each single training case as described
    in the paper `Layer Normalization <https://arxiv.org/pdf/1607.06450.pdf>`_. Unlike batch
    normalization, layer normalization performs exactly the same computation at training and
    testing times. It can be described using the following formula. It is applied across all channels
    and pixel but only one batch size.

    .. math::
        y = \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} * \gamma + \beta

    Args:
        normalized_shape (Union[tuple[int], list[int]]): The normalization is performed over axis
            `begin_norm_axis ... R - 1`.
        begin_norm_axis (int): The first normalization dimension: normalization will be performed along dimensions
            `begin_norm_axis: rank(inputs)`, the value should be in [-1, rank(input)). Default: -1.
        begin_params_axis (int): The first parameter (beta, gamma) dimension: scale and centering parameters
            will have dimensions `begin_params_axis: rank(inputs)` and will be broadcast with
            the normalized inputs accordingly, the value should be in [-1, rank(input)). Default: -1.
        gamma_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the gamma weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'ones'.
        beta_init (Union[Tensor, str, Initializer, numbers.Number]): Initializer for the beta weight.
            The values of str refer to the function `initializer` including 'zeros', 'ones', 'xavier_uniform',
            'he_uniform', etc. Default: 'zeros'.
        use_batch_norm (bool): Whether to use the BatchNorm primitive to do the normalization
            while training. Default: False.

    Inputs:
        - **input_x** (Tensor) - The shape of 'input_x' is :math:`(x_1, x_2, ..., x_R)`,
          and `input_shape[begin_norm_axis:]` is equal to `normalized_shape`.

    Outputs:
        Tensor, the normalized and scaled offset tensor, has the same shape and data type as the `input_x`.

    Raises:
        TypeError: If `normalized_shape` is neither a tuple nor a list.

    Examples:
        >>> x = Tensor(np.ones([20, 5, 10, 10]), mindspore.float32)
        >>> shape1 = x.shape()[1:]
        >>> m = FusedLayerNorm(shape1, begin_norm_axis=1, begin_params_axis=1)
        >>> m(x)
    """

    def __init__(self,
                 normalized_shape,
                 begin_norm_axis=-1,
                 begin_params_axis=-1,
                 gamma_init='ones',
                 beta_init='zeros',
                 use_batch_norm=False):
        super(FusedLayerNorm, self).__init__()
        if not isinstance(normalized_shape, (tuple, list)):
            raise TypeError("The type of 'normalized_shape' should be tuple[int] or list[int], but '{}' type is {}."
                            .format(normalized_shape, type(normalized_shape)))
        self.normalized_shape = normalized_shape
        self.begin_norm_axis = begin_norm_axis
        self.begin_params_axis = begin_params_axis
        self.gamma = Parameter(initializer(
            gamma_init, normalized_shape), name="gamma")
        self.beta = Parameter(initializer(
            beta_init, normalized_shape), name="beta")
        self.layer_norm = P.LayerNorm(begin_norm_axis=self.begin_norm_axis, begin_params_axis=self.begin_params_axis)
        self.batch_norm = P.BatchNorm(is_training=True, epsilon=1e-5)
        self.use_batch_norm = use_batch_norm

    def construct(self, input_x):
        """Normalize `input_x` and apply the gamma/beta scale and offset."""
        if self.use_batch_norm and self.training:
            # BatchNorm path: reshape to (1, -1, 1, K), normalize with unit scale and
            # zero offset, restore the original shape, then apply gamma and beta.
            ones = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 1.0)
            zeros = P.Fill()(mstype.float32, F.shape(input_x)[:self.begin_norm_axis], 0.0)
            shape_x = F.shape(input_x)
            norm_shape = get_shape_for_norm(shape_x, self.begin_norm_axis)
            input_x = F.reshape(input_x, norm_shape)
            output, _, _, _, _, _ = self.batch_norm(input_x, ones, zeros, None, None)
            output = F.reshape(output, shape_x)
            y = output * self.gamma + self.beta
        else:
            y, _, _ = self.layer_norm(input_x, self.gamma, self.beta)
        return y

    def extend_repr(self):
        """Display instance object as string."""
        # Fixed: the gamma field used to be rendered as 'gamma{}' without the '='.
        s = 'normalized_shape={}, begin_norm_axis={}, begin_params_axis={}, gamma={}, beta={}'.format(
            self.normalized_shape, self.begin_norm_axis, self.begin_params_axis, self.gamma, self.beta)
        return s
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""process txt"""
import re
import json
def process_one_example_p(tokenizer, text, max_seq_len=128):
    """Convert one line of text into fixed-length model input features.

    The text is tokenized character by character, truncated so that the
    sequence including "[CLS]" and "[SEP]" fits in ``max_seq_len``, converted
    to ids and right-padded with zeros.

    Args:
        tokenizer: Object providing ``tokenize(str)`` (returns an iterable of
            tokens) and ``convert_tokens_to_ids(list)``.
        text (str): The raw text line.
        max_seq_len (int): Length of every returned list. Must be at least 2.

    Returns:
        tuple: ``(input_ids, input_mask, segment_ids)``, three lists of length
        ``max_seq_len``. ``input_mask`` is 1 for real tokens and 0 for padding;
        ``segment_ids`` is all zeros.

    Raises:
        ValueError: If ``max_seq_len`` is smaller than 2, or the tokenizer
            returns more ids than fit in ``max_seq_len``.
    """
    if max_seq_len < 2:
        raise ValueError("max_seq_len must be at least 2 to hold [CLS] and [SEP], got {}".format(max_seq_len))

    tokens = []
    for char in text:
        tokens.extend(tokenizer.tokenize(char))
    # Leave room for the two special tokens.
    tokens = tokens[:max_seq_len - 2]

    ntokens = ["[CLS]", *tokens, "[SEP]"]
    # Copy so that padding never mutates a list owned by the tokenizer.
    input_ids = list(tokenizer.convert_tokens_to_ids(ntokens))
    if len(input_ids) > max_seq_len:
        raise ValueError("tokenizer returned {} ids for {} tokens".format(len(input_ids), len(ntokens)))

    pad_len = max_seq_len - len(input_ids)
    input_mask = [1] * len(input_ids) + [0] * pad_len
    input_ids.extend([0] * pad_len)
    segment_ids = [0] * max_seq_len

    feature = (input_ids, input_mask, segment_ids)
    return feature
def label_generation(text, probs):
    """Turn per-token label ids into a dictionary of labelled text spans.

    The label vocabulary is read from ``./label2id.json`` (label name -> id).
    ``probs[0]`` is skipped and the following ``len(text)`` entries are taken
    as the label ids of the characters of ``text``. A span opens at a tag
    starting with "B" or "S" and is closed by the next "B"/"S"/"O" tag or by
    the end of the sequence.

    Args:
        text (str): The original text.
        probs: Sequence of predicted label ids, one per token, with one
            leading entry that is ignored.

    Returns:
        dict: ``{entity_type: {span_text: [[start, end]]}}`` where ``end`` is
        inclusive and ``entity_type`` is the tag name without its two-character
        prefix. A repeated ``span_text`` keeps only its last position.
    """
    # Read with a context manager so the file handle is always closed.
    with open("./label2id.json", encoding="utf-8") as label_file:
        label2id = json.load(label_file)
    # Map by the id stored in the file rather than by key order.
    id2label = {int(idx): name for name, idx in label2id.items()}
    tags = [id2label[int(v)] for v in probs[1:len(text) + 1]]

    labels = {}

    def _record(tag, begin, end):
        """Store text[begin:end] under the entity type taken from ``tag``."""
        labels.setdefault(tag[2:], {})[text[begin:end]] = [[begin, end - 1]]

    start = None
    for index, tag in enumerate(tags):
        if tag.startswith(("B", "S")):
            if start is not None:
                # The entity type comes from the tag just before the boundary.
                _record(tags[index - 1], start, index)
            start = index
        if tag.startswith("O"):
            if start is not None:
                _record(tags[index - 1], start, index)
            start = None
    if start is not None:
        # Span still open at the end of the sequence.
        _record(tags[start], start, len(tags))
    return labels
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
'''
Functional Cells used in Bert finetune and evaluation.
'''
import mindspore.nn as nn
from mindspore.common.initializer import TruncatedNormal
from mindspore.ops import operations as P
from mindspore.ops import functional as F
from mindspore.ops import composite as C
from mindspore.common.tensor import Tensor
from mindspore.common.parameter import Parameter, ParameterTuple
from mindspore.common import dtype as mstype
from mindspore.nn.wrap.grad_reducer import DistributedGradReducer
from mindspore.train.parallel_utils import ParallelMode
from mindspore.communication.management import get_group_size
from mindspore import context
from mindspore.model_zoo.Bert_NEZHA.bert_model import BertModel
from .bert_for_pre_training import clip_grad
from .CRF import CRF
# Arguments forwarded to clip_grad inside BertFinetuneCell.construct.
# NOTE(review): what clip type 1 means is decided by clip_grad in bert_for_pre_training - confirm there.
GRADIENT_CLIP_TYPE = 1
GRADIENT_CLIP_VALUE = 1.0
# Multitype function graph; tensor_grad_scale is registered on it for (Tensor, Tensor).
grad_scale = C.MultitypeFuncGraph("grad_scale")
# Shared Reciprocal primitive used by tensor_grad_scale.
reciprocal = P.Reciprocal()
@grad_scale.register("Tensor", "Tensor")
def tensor_grad_scale(scale, grad):
    """Return ``grad`` multiplied by the reciprocal of ``scale``."""
    inverse_scale = reciprocal(scale)
    return grad * inverse_scale
class BertFinetuneCell(nn.Cell):
    """
    Training cell specifically defined for finetuning, where only four input tensors are needed.

    It runs the wrapped network, computes gradients with a loss-scale
    sensitivity, un-scales and clips them, checks the float status for
    overflow and applies the optimizer only when no overflow was flagged.

    Args:
        network (Cell): The network to train; called with
            (input_ids, input_mask, token_type_id, label_ids).
        optimizer (Cell): Optimizer applied to the processed gradients.
        scale_update_cell (Cell): Optional loss-scale manager. When given, its
            `get_loss_scale()` value initializes the `loss_scale` parameter. Default: None.
    """
    def __init__(self, network, optimizer, scale_update_cell=None):
        super(BertFinetuneCell, self).__init__(auto_prefix=False)
        self.network = network
        self.weights = ParameterTuple(network.trainable_params())
        self.optimizer = optimizer
        self.grad = C.GradOperation('grad',
                                    get_by_list=True,
                                    sens_param=True)
        self.reducer_flag = False
        self.allreduce = P.AllReduce()
        self.parallel_mode = context.get_auto_parallel_context("parallel_mode")
        # Gradients are reduced across devices only in data-parallel or hybrid-parallel mode.
        if self.parallel_mode in [ParallelMode.DATA_PARALLEL, ParallelMode.HYBRID_PARALLEL]:
            self.reducer_flag = True
        self.grad_reducer = None
        if self.reducer_flag:
            mean = context.get_auto_parallel_context("mirror_mean")
            degree = get_group_size()
            self.grad_reducer = DistributedGradReducer(optimizer.parameters, mean, degree)
        self.is_distributed = (self.parallel_mode != ParallelMode.STAND_ALONE)
        self.cast = P.Cast()
        self.alloc_status = P.NPUAllocFloatStatus()
        self.get_status = P.NPUGetFloatStatus()
        self.clear_before_grad = P.NPUClearFloatStatus()
        self.reduce_sum = P.ReduceSum(keep_dims=False)
        self.depend_parameter_use = P.ControlDepend(depend_mode=1)
        # Threshold compared against the summed status to derive the overflow condition.
        self.base = Tensor(1, mstype.float32)
        self.less_equal = P.LessEqual()
        self.hyper_map = C.HyperMap()
        # loss_scale stays None unless a scale_update_cell is supplied.
        self.loss_scale = None
        self.loss_scaling_manager = scale_update_cell
        if scale_update_cell:
            self.loss_scale = Parameter(Tensor(scale_update_cell.get_loss_scale(), dtype=mstype.float32),
                                        name="loss_scale")
    def construct(self,
                  input_ids,
                  input_mask,
                  token_type_id,
                  label_ids,
                  sens=None):
        """Run one training step and return ``(loss, cond)``, where ``cond`` is the overflow condition."""
        weights = self.weights
        init = self.alloc_status()
        loss = self.network(input_ids,
                            input_mask,
                            token_type_id,
                            label_ids)
        # Use the internal loss scale unless the caller passes an explicit sens.
        if sens is None:
            scaling_sens = self.loss_scale
        else:
            scaling_sens = sens
        grads = self.grad(self.network, weights)(input_ids,
                                                 input_mask,
                                                 token_type_id,
                                                 label_ids,
                                                 self.cast(scaling_sens,
                                                           mstype.float32))
        clear_before_grad = self.clear_before_grad(init)
        # Control dependencies constraining the execution order of these graph nodes.
        F.control_depend(loss, init)
        self.depend_parameter_use(clear_before_grad, scaling_sens)
        # Un-scale the gradients (multiply by 1/scaling_sens), then clip them.
        grads = self.hyper_map(F.partial(grad_scale, scaling_sens), grads)
        grads = self.hyper_map(F.partial(clip_grad, GRADIENT_CLIP_TYPE, GRADIENT_CLIP_VALUE), grads)
        if self.reducer_flag:
            grads = self.grad_reducer(grads)
        flag = self.get_status(init)
        flag_sum = self.reduce_sum(init, (0,))
        # cond is True when the (all-reduced, if distributed) status sum is >= 1.
        if self.is_distributed:
            flag_reduce = self.allreduce(flag_sum)
            cond = self.less_equal(self.base, flag_reduce)
        else:
            cond = self.less_equal(self.base, flag_sum)
        F.control_depend(grads, flag)
        F.control_depend(flag, flag_sum)
        overflow = cond
        # With the internal loss scale, the scaling manager decides the final overflow flag.
        if sens is None:
            overflow = self.loss_scaling_manager(self.loss_scale, cond)
        # Skip the optimizer step when overflow is flagged.
        if overflow:
            succ = False
        else:
            succ = self.optimizer(grads)
        ret = (loss, cond)
        return F.depend(ret, succ)
class BertCLSModel(nn.Cell):
    """
    Classification head on top of BertModel, used for classification task evaluation,
    i.e. XNLI(num_labels=3), LCQMC(num_labels=2), Chnsenti(num_labels=2).

    The pooled output goes through dropout and a dense layer; the cell returns
    the log-softmax of the resulting logits, which is proportional to softmax.
    """
    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, use_one_hot_embeddings=False):
        super().__init__()
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        classifier = nn.Dense(config.hidden_size, self.num_labels,
                              weight_init=self.weight_init, has_bias=True)
        self.dense_1 = classifier.to_float(config.compute_type)
        # nn.Dropout is constructed from 1 - dropout_prob.
        self.dropout = nn.Dropout(1 - dropout_prob)

    def construct(self, input_ids, input_mask, token_type_id):
        """Return log-probabilities computed from the pooled BERT output."""
        _, pooled_output, _ = self.bert(input_ids, token_type_id, input_mask)
        features = self.cast(pooled_output, self.dtype)
        features = self.dropout(features)
        scores = self.dense_1(features)
        scores = self.cast(scores, self.dtype)
        return self.log_softmax(scores)
class BertNERModel(nn.Cell):
    """
    Sequence labeling head on top of BertModel, used for sequence labeling
    task evaluation, i.e. NER(num_labels=11).

    Without CRF the cell returns the log-softmax of the per-token logits, which
    is proportional to softmax. With CRF it returns the raw logits reshaped to
    (batch_size, seq_length, num_labels).
    """
    def __init__(self, config, is_training, num_labels=11, use_crf=False, dropout_prob=0.0,
                 use_one_hot_embeddings=False):
        super().__init__()
        self.bert = BertModel(config, is_training, use_one_hot_embeddings)
        self.cast = P.Cast()
        self.weight_init = TruncatedNormal(config.initializer_range)
        self.log_softmax = P.LogSoftmax(axis=-1)
        self.dtype = config.dtype
        self.num_labels = num_labels
        classifier = nn.Dense(config.hidden_size, self.num_labels,
                              weight_init=self.weight_init, has_bias=True)
        self.dense_1 = classifier.to_float(config.compute_type)
        # nn.Dropout is constructed from 1 - dropout_prob.
        self.dropout = nn.Dropout(1 - dropout_prob)
        self.reshape = P.Reshape()
        # Shape used to flatten the sequence output before the dense layer.
        self.shape = (-1, config.hidden_size)
        self.use_crf = use_crf
        self.origin_shape = (config.batch_size, config.seq_length, self.num_labels)

    def construct(self, input_ids, input_mask, token_type_id):
        """Return per-token log-probabilities, or reshaped logits when CRF is used."""
        sequence_output, _, _ = self.bert(input_ids, token_type_id, input_mask)
        hidden = self.dropout(sequence_output)
        hidden = self.reshape(hidden, self.shape)
        scores = self.dense_1(hidden)
        scores = self.cast(scores, self.dtype)
        if self.use_crf:
            return_value = self.reshape(scores, self.origin_shape)
        else:
            return_value = self.log_softmax(scores)
        return return_value
class CrossEntropyCalculation(nn.Cell):
    """
    Cross entropy loss.

    In training mode the cell returns the mean negative sum of
    ``one_hot(label_ids) * logits``; otherwise it returns ``logits * 1.0``.
    """
    def __init__(self, is_training=True):
        super().__init__()
        self.onehot = P.OneHot()
        self.on_value = Tensor(1.0, mstype.float32)
        self.off_value = Tensor(0.0, mstype.float32)
        self.reduce_sum = P.ReduceSum()
        self.reduce_mean = P.ReduceMean()
        self.reshape = P.Reshape()
        self.last_idx = (-1,)
        self.neg = P.Neg()
        self.cast = P.Cast()
        self.is_training = is_training

    def construct(self, logits, label_ids, num_labels):
        """Compute the loss when training, or pass the logits through otherwise."""
        if self.is_training:
            flat_labels = self.reshape(label_ids, self.last_idx)
            one_hot_labels = self.onehot(flat_labels, num_labels, self.on_value, self.off_value)
            picked = self.reduce_sum(one_hot_labels * logits, self.last_idx)
            per_example_loss = self.neg(picked)
            mean_loss = self.reduce_mean(per_example_loss, self.last_idx)
            return_value = self.cast(mean_loss, mstype.float32)
        else:
            return_value = logits * 1.0
        return return_value
class BertCLS(nn.Cell):
    """
    Train interface for classification finetuning task.

    Chains BertCLSModel with CrossEntropyCalculation.
    """
    def __init__(self, config, is_training, num_labels=2, dropout_prob=0.0, use_one_hot_embeddings=False):
        super().__init__()
        self.bert = BertCLSModel(config, is_training, num_labels, dropout_prob, use_one_hot_embeddings)
        self.loss = CrossEntropyCalculation(is_training)
        self.num_labels = num_labels

    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        """Return the output of the loss cell for the given batch."""
        log_probs = self.bert(input_ids, input_mask, token_type_id)
        return self.loss(log_probs, label_ids, self.num_labels)
class BertNER(nn.Cell):
    """
    Train interface for sequence labeling finetuning task.

    Chains BertNERModel with either a CRF loss (when ``use_crf`` is set, which
    requires ``tag_to_index``) or CrossEntropyCalculation.
    """
    def __init__(self, config, is_training, num_labels=11, use_crf=False, tag_to_index=None, dropout_prob=0.0,
                 use_one_hot_embeddings=False):
        super().__init__()
        self.bert = BertNERModel(config, is_training, num_labels, use_crf, dropout_prob, use_one_hot_embeddings)
        # The CRF loss cannot be built without the tag-index mapping.
        if use_crf and not tag_to_index:
            raise Exception("The dict for tag-index mapping should be provided for CRF.")
        if use_crf:
            self.loss = CRF(tag_to_index, config.batch_size, config.seq_length, is_training)
        else:
            self.loss = CrossEntropyCalculation(is_training)
        self.num_labels = num_labels
        self.use_crf = use_crf

    def construct(self, input_ids, input_mask, token_type_id, label_ids):
        """Return the output of the configured loss cell for the given batch."""
        scores = self.bert(input_ids, input_mask, token_type_id)
        if self.use_crf:
            result = self.loss(scores, label_ids)
        else:
            result = self.loss(scores, label_ids, self.num_labels)
        return result
此差异已折叠。
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
""" test_embedding """
import numpy as np
from mindspore import Tensor
from mindspore import dtype as mstype
from mindspore.model_zoo.Bert_NEZHA import EmbeddingLookup, EmbeddingPostprocessor
from ..ut_filter import non_graph_engine
@non_graph_engine
def test_check_embedding_lookup_1():
    """Call EmbeddingLookup without one-hot embeddings on a 128-element int32 tensor."""
    lookup = EmbeddingLookup(vocab_size=32000,
                             embedding_size=768,
                             embedding_shape=[1, 128, 768],
                             use_one_hot_embeddings=False)
    input_ids = Tensor(np.ones([128]), mstype.int32)
    lookup(input_ids)
@non_graph_engine
def test_check_embedding_lookup_2():
    """Call EmbeddingLookup with one-hot embeddings on a 128-element int32 tensor."""
    lookup = EmbeddingLookup(vocab_size=32000,
                             embedding_size=768,
                             embedding_shape=[1, 128, 768],
                             use_one_hot_embeddings=True)
    input_ids = Tensor(np.ones([128]), mstype.int32)
    lookup(input_ids)
@non_graph_engine
def test_check_embedding_lookup_3():
    """Call EmbeddingLookup with one-hot embeddings and an explicit initializer_range."""
    lookup = EmbeddingLookup(vocab_size=32000,
                             embedding_size=768,
                             embedding_shape=[1, 128, 768],
                             use_one_hot_embeddings=True,
                             initializer_range=0.01)
    input_ids = Tensor(np.ones([128]), mstype.int32)
    lookup(input_ids)
@non_graph_engine
def test_embedding_post_1():
    """Call EmbeddingPostprocessor with token types on an int32 and a float32 tensor."""
    postprocessor = EmbeddingPostprocessor(embedding_size=768,
                                           embedding_shape=[1, 128, 768],
                                           use_token_type=True)
    token_type_ids = Tensor(np.ones([128]), mstype.int32)
    word_embeddings = Tensor(np.ones([1, 128, 768]), mstype.float32)
    postprocessor(token_type_ids, word_embeddings)
@non_graph_engine
def test_embedding_post_2():
    """Call EmbeddingPostprocessor with token types and an explicit initializer_range."""
    postprocessor = EmbeddingPostprocessor(embedding_size=768,
                                           embedding_shape=[1, 128, 768],
                                           use_token_type=True,
                                           initializer_range=0.3)
    token_type_ids = Tensor(np.ones([128]), mstype.int32)
    word_embeddings = Tensor(np.ones([1, 128, 768]), mstype.float32)
    postprocessor(token_type_ids, word_embeddings)
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册