diff --git a/README.en.md b/README.en.md
index 9dd3c6e91d9423cacf9c247ca7053ff1c9bf2e68..e89704d73c1592145be5955d21e779940693bfa0 100644
--- a/README.en.md
+++ b/README.en.md
@@ -54,6 +54,7 @@ from ernie.modeling_ernie import ErnieModel
 D.guard().__enter__() # activate paddle `dygrpah` mode
 
 model = ErnieModel.from_pretrained('ernie-1.0')    # Try to get pretrained model from server, make sure you have network connection
+model.eval()
 tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
 
 ids, _ = tokenizer.encode('hello world')
diff --git a/README.zh.md b/README.zh.md
index 96c58cfc721964f0be48f26a2fec9b927376e0f3..25f29307fede3f0cde22c1be439422d883e35a54 100644
--- a/README.zh.md
+++ b/README.zh.md
@@ -51,6 +51,7 @@ from ernie.modeling_ernie import ErnieModel
 D.guard().__enter__() # activate paddle `dygrpah` mode
 
 model = ErnieModel.from_pretrained('ernie-1.0')    # Try to get pretrained model from server, make sure you have network connection
+model.eval()
 tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
 
 ids, _ = tokenizer.encode('hello world')
@@ -177,7 +178,7 @@ python3 -m paddle.distributed.launch \
 1. [情感分析](./demo/finetune_sentiment_analysis_dygraph.py)
 1. [语义匹配](./demo/finetune_classifier_dygraph.py)
 1. [命名实体识别(NER)](./demo/finetune_ner_dygraph.py)
-1. [机器阅读理解](./demo/finetune_mrc_dygraph.py)
+1. [机器阅读理解](./demo/finetune_mrc_dygraph.py) (需要多卡环境运行;参见上面"分布式 finetune"一节)
 1. [文本摘要生成](./experimental/seq2seq/README.md)
 
@@ -186,8 +187,7 @@ python3 -m paddle.distributed.launch \
 |任务|batch size|learning rate|
 |--|--|--|
 | CoLA | 32 / 64 (base) | 3e-5 |
-| SST-2
- | 64 / 256 (base) | 2e-5 |
+| SST-2 | 64 / 256 (base) | 2e-5 |
 | STS-B | 128 | 5e-5 |
 | QQP | 256 | 3e-5(base)/5e-5(large) |
 | MNLI | 256 / 512 (base)| 3e-5 |
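
Side note on the `model.eval()` line added to both READMEs: in this codebase dropout is only applied while `self.training` is set (see the `self.dropout` lambda in the modeling_ernie.py hunk further down), so calling `eval()` right after `from_pretrained()` makes feature extraction deterministic. A toy sketch of that train/eval toggle in plain Python/numpy (illustrative only, not code from the repo):

    import numpy as np

    class TinyModel(object):
        def __init__(self, prob=0.1):
            self.training = True
            # dropout only while training, upscaled by 1/(1-prob) like "upscale_in_train"
            self.dropout = lambda x: x * (np.random.rand(*x.shape) > prob) / (1. - prob) if self.training else x

        def eval(self):
            self.training = False

    m = TinyModel()
    x = np.ones((2, 4), dtype='float32')
    m.eval()
    assert (m.dropout(x) == x).all()   # deterministic once eval() is called
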
diff --git a/demo/finetune_ner_dygraph.py b/demo/finetune_ner_dygraph.py
index 4395d6ac046973d2dadca2d1aa56e9bd4dddd8ca..ad9376a7f1ea7140726d57defcca949da57012ba 100644
--- a/demo/finetune_ner_dygraph.py
+++ b/demo/finetune_ner_dygraph.py
@@ -50,11 +50,15 @@ if __name__ == '__main__':
     parser.add_argument('--max_seqlen', type=int, default=256)
     parser.add_argument('--bsz', type=int, default=32)
     parser.add_argument('--data_dir', type=str, required=True)
-    parser.add_argument('--epoch', type=int, default=3)
-    parser.add_argument('--warmup_steps', type=int, default=1000)
-    parser.add_argument('--max_steps', type=int, default=30000)
+    parser.add_argument('--epoch', type=int, default=6)
+    parser.add_argument('--warmup_proportion', type=float, default=0.1, help='if use_lr_decay is set, '
+            'learning rate will raise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. at `max_steps`')
+    parser.add_argument('--max_steps', type=int, required=True,
+            help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE, used in learning rate scheduler')
     parser.add_argument('--from_pretrained', type=str, required=True)
-    parser.add_argument('--lr', type=float, default=5e-5)
+    parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
+    parser.add_argument('--save_dir', type=str, default=None, help='model output directory')
+    parser.add_argument('--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
 
     args = parser.parse_args()
     tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
@@ -73,18 +77,22 @@ if __name__ == '__main__':
     def tokenizer_func_for_label(inputs):
         return inputs.split(b'\2')
-
-    feature_column = propeller.data.FeatureColumns([
-        propeller.data.TextColumn('text_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer_func),
-        propeller.data.TextColumn('label', unk_id=6, vocab_dict={
+
+    feature_map = {
         b"B-PER": 0,
         b"I-PER": 1,
         b"B-ORG": 2,
         b"I-ORG": 3,
         b"B-LOC": 4,
         b"I-LOC": 5,
-        },
-        tokenizer=tokenizer_func_for_label,)
+        b"O": 6,
+    }
+    other_tag_id = feature_map[b'O']
+
+    feature_column = propeller.data.FeatureColumns([
+        propeller.data.TextColumn('text_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer_func),
+        propeller.data.TextColumn('label', unk_id=other_tag_id, vocab_dict=feature_map,
+            tokenizer=tokenizer_func_for_label,)
     ])
 
     def before(seg, label):
@@ -95,23 +103,23 @@ if __name__ == '__main__':
         orig_pos, _ = tokenizer.truncate(orig_pos, [], args.max_seqlen)
         sentence, segments = tokenizer.build_for_ernie(seg) #utils.data.build_1_pair(seg, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
-        aligned_label = np.concatenate([[-100], aligned_label, [-100]], 0)
-        orig_pos = np.concatenate([[-100], orig_pos, [-100]])
+        aligned_label = np.concatenate([[0], aligned_label, [0]], 0)
+        orig_pos = np.concatenate([[0], orig_pos, [0]])
         assert len(aligned_label) == len(sentence) == len(orig_pos), (len(aligned_label), len(sentence), len(orig_pos)) # alinged
         return sentence, segments, aligned_label, label, orig_pos
 
     train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \
             .map(before) \
-            .padded_batch(args.bsz, (0, 0, -100, -100, -100)) \
+            .padded_batch(args.bsz, (0, 0, 0, other_tag_id + 1, 0)) \
 
     dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
             .map(before) \
-            .padded_batch(args.bsz, (0, 0, -100, -100, -100)) \
+            .padded_batch(args.bsz, (0, 0, 0, other_tag_id + 1, 0)) \
 
     test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \
             .map(before) \
-            .padded_batch(args.bsz, (0, 0, -100, -100, -100)) \
+            .padded_batch(args.bsz, (0, 0, 0, other_tag_id + 1, 0)) \
 
@@ -126,45 +134,59 @@ if __name__ == '__main__':
     test_ds.data_types = types
 
     place = F.CUDAPlace(0)
+
+    @FD.no_grad
+    def evaluate(model, dataset):
+        model.eval()
+        chunkf1 = propeller.metrics.ChunkF1(None, None, None, len(feature_map))
+        for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dataset.start(place))):
+            loss, logits = model(ids, sids)
+            #print('\n'.join(map(str, logits.numpy().tolist())))
+
+            assert orig_pos.shape[0] == logits.shape[0] == ids.shape[0] == label.shape[0]
+            for pos, lo, la, id in zip(orig_pos.numpy(), logits.numpy(), label.numpy(), ids.numpy()):
+                _dic = OrderedDict()
+                assert len(pos) == len(lo) == len(id)
+                for _pos, _lo, _id in zip(pos, lo, id):
+                    if _id > tokenizer.mask_id: # [MASK] is the largest special token
+                        _dic.setdefault(_pos, []).append(_lo)
+                merged_lo = np.array([np.array(l).mean(0) for _, l in six.iteritems(_dic)])
+                merged_preds = np.argmax(merged_lo, -1)
+                la = la[np.where(la != (other_tag_id + 1))] #remove pad
+                if len(la) > len(merged_preds):
+                    log.warn('accuracy loss due to truncation: label len:%d, truncate to %d' % (len(la), len(merged_preds)))
+                    merged_preds = np.pad(merged_preds, [0, len(la) - len(merged_preds)], mode='constant', constant_values=7)
+                else:
+                    assert len(la) == len(merged_preds), 'expect label == prediction, got %d vs %d' % (la.shape, merged_preds.shape)
+                chunkf1.update((merged_preds, la, np.array(len(la))))
+        #f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro')
+        f1 = chunkf1.eval()
+        model.train()
+        return f1
 
     with FD.guard(place):
-        model = ErnieModelForTokenClassification.from_pretrained(args.from_pretrained, num_labels=7, name='')
+        model = ErnieModelForTokenClassification.from_pretrained(args.from_pretrained, num_labels=len(feature_map), name='', has_pooler=False)
 
-        opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters(), weight_decay=0.01)
+        g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
+        opt = AdamW(
+                learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps),
+                parameter_list=model.parameters(),
+                weight_decay=args.wd, grad_clip=g_clip)
         #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters())
         for epoch in range(args.epoch):
             for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start(place))):
-                loss, _ = model(ids, sids, labels=aligned_label)
+                loss, logits = model(ids, sids, labels=aligned_label, loss_weights=L.cast(ids > tokenizer.mask_id, 'float32')) # [MASK] is the largest special token
                 loss.backward()
                 if step % 10 == 0 :
-                    log.debug('train loss %.5f' % loss.numpy())
+                    log.debug('train loss %.5f, lr %.3e' % (loss.numpy(), opt.current_step_lr()))
                 opt.minimize(loss)
                 model.clear_gradients()
                 if step % 100 == 0 :
-                    all_pred, all_label = [], []
-                    with FD.base._switch_tracer_mode_guard_(is_train=False):
-                        model.eval()
-                        for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dev_ds.start(place))):
-                            loss, logits = model(ids, sids, labels=aligned_label)
-                            #print('\n'.join(map(str, logits.numpy().tolist())))
-
-                            for pos, lo, la in zip(orig_pos.numpy(), logits.numpy(), label.numpy()):
-                                _dic = OrderedDict()
-                                for p, l in zip(pos, lo):
-                                    _dic.setdefault(p, []).append(l)
-                                del _dic[-100] # delete cls/sep/pad position
-                                merged_lo = np.array([np.array(l).mean(0) for _, l in six.iteritems(_dic)])
-                                merged_preds = np.argmax(merged_lo, -1)
-                                la = la[np.where(la!=-100)] #remove pad
-                                if len(la) > len(merged_preds):
-                                    log.warn('accuracy loss due to truncation: label len:%d, truncate to %d' % (len(la), len(merged_preds)))
-                                    merged_preds = np.pad(merged_preds, [0, len(la) - len(merged_preds)], mode='constant', constant_values=-100)
-                                all_label.append(la)
-                                all_pred.append(merged_preds)
-                        model.train()
-
-                    f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro')
+                    f1 = evaluate(model, dev_ds)
                     log.debug('eval f1: %.5f' % f1)
-        F.save_dygraph(model.state_dict(), './saved')
+        f1 = evaluate(model, dev_ds)
+        log.debug('final eval f1: %.5f' % f1)
+        if args.save_dir is not None:
+            F.save_dygraph(model.state_dict(), args.save_dir)
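
Note on the new `evaluate()` above: it folds word-piece logits back into word-level predictions by grouping each logit under its original token position, averaging the group, and taking the argmax; pad labels (`other_tag_id + 1`) are stripped before `ChunkF1` is updated. A self-contained numpy sketch of just that merging step (toy logits and positions, not part of the patch):

    import numpy as np
    from collections import OrderedDict

    # 4 word pieces belonging to 3 original tokens (positions 0, 1, 1, 2)
    orig_pos = np.array([0, 1, 1, 2])
    logits = np.array([[0.9, 0.1],    # piece of word 0
                       [0.2, 0.8],    # first piece of word 1
                       [0.4, 0.6],    # second piece of word 1
                       [0.7, 0.3]])   # piece of word 2

    # group logits by original position, average within each group, then argmax
    _dic = OrderedDict()
    for p, lo in zip(orig_pos, logits):
        _dic.setdefault(int(p), []).append(lo)
    merged_lo = np.array([np.mean(l, 0) for l in _dic.values()])
    merged_preds = np.argmax(merged_lo, -1)
    print(merged_preds)   # -> [0 1 0], one prediction per original word
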
diff --git a/demo/finetune_sentiment_analysis_dygraph.py b/demo/finetune_sentiment_analysis_dygraph.py
index 7a01899b1679d59e11eae40b5d67f80e0d7a51e8..894a55a16408a54962fb8e78244650fe708da305 100644
--- a/demo/finetune_sentiment_analysis_dygraph.py
+++ b/demo/finetune_sentiment_analysis_dygraph.py
@@ -116,7 +116,7 @@ if __name__ == '__main__':
             acc = []
             with FD.base._switch_tracer_mode_guard_(is_train=False):
                 model.eval()
-                for step, d in enumerate(tqdm(dev_ds.start(), desc='evaluating %d' % epoch)):
+                for step, d in enumerate(tqdm(dev_ds.start(place), desc='evaluating %d' % epoch)):
                     ids, sids, label = d
                     loss, logits = model(ids, sids, labels=label)
                     #print('\n'.join(map(str, logits.numpy().tolist())))
diff --git a/ernie/modeling_ernie.py b/ernie/modeling_ernie.py
index 25fe2fe5ef15d02e6122207f58c4beacb040d889..c7655c280184536e4b637a5f437da9fdc4c1e44b 100644
--- a/ernie/modeling_ernie.py
+++ b/ernie/modeling_ernie.py
@@ -407,7 +407,7 @@ class ErnieModelForTokenClassification(ErnieModel):
         self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i
 
     @add_docstring(ErnieModel.forward.__doc__)
-    def forward(self, *args, **kwargs):
+    def forward(self, *args, ignore_index=-100, labels=None, loss_weights=None, **kwargs):
         """
         Args:
             labels (optional, `Variable` of shape [batch_size, seq_len]):
@@ -420,7 +420,6 @@ class ErnieModelForTokenClassification(ErnieModel):
                 output logits of classifier
         """
 
-        labels = kwargs.pop('labels', None)
         pooled, encoded = super(ErnieModelForTokenClassification, self).forward(*args, **kwargs)
         hidden = self.dropout(encoded) # maybe not?
         logits = self.classifier(hidden)
@@ -428,7 +427,9 @@ class ErnieModelForTokenClassification(ErnieModel):
         if labels is not None:
             if len(labels.shape) == 2:
                 labels = L.unsqueeze(labels, axes=[-1])
-            loss = L.softmax_with_cross_entropy(logits, labels)
+            loss = L.softmax_with_cross_entropy(logits, labels, ignore_index=ignore_index)
+            if loss_weights is not None:
+                loss = L.squeeze(loss, [-1]) * loss_weights
             loss = L.reduce_mean(loss)
         else:
             loss = None
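
On the modeling_ernie.py change: `forward()` now takes an explicit `ignore_index` (positions labelled with it contribute zero loss) and an optional `loss_weights` tensor that is multiplied into the per-token cross-entropy before the mean is taken; the NER demo passes `L.cast(ids > tokenizer.mask_id, 'float32')` so that special tokens get zero weight. A rough numpy sketch of the intended per-token loss computation (illustrative only, not the paddle implementation):

    import numpy as np

    def token_cls_loss(logits, labels, loss_weights=None, ignore_index=-100):
        # logits: [seq_len, num_labels]; labels: [seq_len]
        probs = np.exp(logits - logits.max(-1, keepdims=True))
        probs /= probs.sum(-1, keepdims=True)
        per_tok = np.zeros(len(labels))
        valid = labels != ignore_index
        per_tok[valid] = -np.log(probs[valid, labels[valid]])  # ignored positions stay at 0 loss
        if loss_weights is not None:
            per_tok = per_tok * loss_weights                   # e.g. 0.0 for [CLS]/[SEP]/[PAD], 1.0 elsewhere
        return per_tok.mean()

    logits = np.array([[2.0, 0.1, 0.1], [0.1, 2.0, 0.1], [0.1, 0.1, 2.0]])
    labels = np.array([0, 1, -100])
    print(token_cls_loss(logits, labels, loss_weights=np.array([1.0, 1.0, 0.0])))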