Unverified commit 899dbfc2, authored by Meiyim, committed by GitHub

Fix 466 (#478)

* fix #466

* NER demo: avoid memory error (issue #466)
  very ugly; needs to be improved later

* chunkf1: evaluate NER with chunk-level F1 (propeller.metrics.ChunkF1) instead of token-level macro F1

* fix #476

* update readme

* fix readme
Parent: 0bb72513
@@ -54,6 +54,7 @@ from ernie.modeling_ernie import ErnieModel
D.guard().__enter__() # activate paddle `dygraph` mode
model = ErnieModel.from_pretrained('ernie-1.0')    # Try to get pretrained model from server, make sure you have network connection
+model.eval()
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
ids, _ = tokenizer.encode('hello world')
......
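The `model.eval()` line added above matters because ERNIE only applies dropout while `self.training` is true (the last hunk in this diff shows the `if self.training` guard around `L.dropout`), so evaluation mode makes the README's feature-extraction snippet deterministic. Below is a minimal sketch of the full flow, assuming the paddle 1.x dygraph API used throughout this commit; the tokenizer import path and the `expand_dims` / `to_variable` glue are assumptions for illustration, not lines from the README:

```python
import numpy as np
import paddle.fluid.dygraph as D
from ernie.modeling_ernie import ErnieModel
from ernie.tokenizing_ernie import ErnieTokenizer   # assumed import path

D.guard().__enter__()                      # activate paddle dygraph mode
model = ErnieModel.from_pretrained('ernie-1.0')
model.eval()                               # switch off dropout so repeated runs give identical features
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')

ids, _ = tokenizer.encode('hello world')
ids = D.to_variable(np.expand_dims(ids, 0))    # add a batch dimension: [1, seq_len]
pooled, encoded = model(ids)                   # pooled: [1, hidden]; encoded: [1, seq_len, hidden]
```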
@@ -51,6 +51,7 @@ from ernie.modeling_ernie import ErnieModel
D.guard().__enter__() # activate paddle `dygraph` mode
model = ErnieModel.from_pretrained('ernie-1.0')    # Try to get pretrained model from server, make sure you have network connection
+model.eval()
tokenizer = ErnieTokenizer.from_pretrained('ernie-1.0')
ids, _ = tokenizer.encode('hello world')
@@ -177,7 +178,7 @@ python3 -m paddle.distributed.launch \
1. [Sentiment Analysis](./demo/finetune_sentiment_analysis_dygraph.py)
1. [Semantic Matching](./demo/finetune_classifier_dygraph.py)
1. [Named Entity Recognition (NER)](./demo/finetune_ner_dygraph.py)
-1. [Machine Reading Comprehension](./demo/finetune_mrc_dygraph.py)
+1. [Machine Reading Comprehension](./demo/finetune_mrc_dygraph.py) (requires a multi-GPU environment; see the "distributed finetune" section above)
1. [Text Summarization](./experimental/seq2seq/README.md)
@@ -186,8 +187,7 @@ python3 -m paddle.distributed.launch \
|Task|batch size|learning rate|
|--|--|--|
| CoLA | 32 / 64 (base) | 3e-5 |
-| SST-2 |
-| 64 / 256 (base) | 2e-5 |
+| SST-2 | 64 / 256 (base) | 2e-5 |
| STS-B | 128 | 5e-5 |
| QQP | 256 | 3e-5(base)/5e-5(large) |
| MNLI | 256 / 512 (base)| 3e-5 |
......
@@ -50,11 +50,15 @@ if __name__ == '__main__':
    parser.add_argument('--max_seqlen', type=int, default=256)
    parser.add_argument('--bsz', type=int, default=32)
    parser.add_argument('--data_dir', type=str, required=True)
-   parser.add_argument('--epoch', type=int, default=3)
-   parser.add_argument('--warmup_steps', type=int, default=1000)
-   parser.add_argument('--max_steps', type=int, default=30000)
+   parser.add_argument('--epoch', type=int, default=6)
+   parser.add_argument('--warmup_proportion', type=float, default=0.1, help='if use_lr_decay is set, '
+           'learning rate will raise to `lr` at `warmup_proportion` * `max_steps` and decay to 0. at `max_steps`')
+   parser.add_argument('--max_steps', type=int, required=True,
+           help='max_train_steps, set this to EPOCH * NUM_SAMPLES / BATCH_SIZE, used in learning rate scheduler')
    parser.add_argument('--from_pretrained', type=str, required=True)
-   parser.add_argument('--lr', type=float, default=5e-5)
+   parser.add_argument('--lr', type=float, default=5e-5, help='learning rate')
+   parser.add_argument('--save_dir', type=str, default=None, help='model output directory')
+   parser.add_argument('--wd', type=float, default=0.01, help='weight decay, aka L2 regularizer')
    args = parser.parse_args()

    tokenizer = ErnieTokenizer.from_pretrained(args.from_pretrained)
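The reworked arguments above replace a fixed `--warmup_steps` with a `--warmup_proportion` of a user-supplied `--max_steps`, and the help text says to derive `--max_steps` from epochs, sample count, and batch size. A back-of-the-envelope sketch of that arithmetic (the sample count is a made-up placeholder, not a number from this repository):

```python
# hypothetical numbers purely for illustration
num_train_samples = 45000          # placeholder; use the real size of data_dir/train
epoch, bsz = 6, 32                 # the new defaults above
warmup_proportion = 0.1

max_steps = epoch * num_train_samples // bsz          # value to pass as --max_steps
warmup_steps = int(warmup_proportion * max_steps)     # what LinearDecay receives in the training hunk below
print(max_steps, warmup_steps)                        # 8437 843
```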
@@ -74,16 +78,20 @@ if __name__ == '__main__':
    def tokenizer_func_for_label(inputs):
        return inputs.split(b'\2')

-   feature_column = propeller.data.FeatureColumns([
-       propeller.data.TextColumn('text_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer_func),
-       propeller.data.TextColumn('label', unk_id=6, vocab_dict={
+   feature_map = {
        b"B-PER": 0,
        b"I-PER": 1,
        b"B-ORG": 2,
        b"I-ORG": 3,
        b"B-LOC": 4,
        b"I-LOC": 5,
-       },
+       b"O": 6,
+   }
+   other_tag_id = feature_map[b'O']
+
+   feature_column = propeller.data.FeatureColumns([
+       propeller.data.TextColumn('text_a', unk_id=tokenizer.unk_id, vocab_dict=tokenizer.vocab, tokenizer=tokenizer_func),
+       propeller.data.TextColumn('label', unk_id=other_tag_id, vocab_dict=feature_map,
            tokenizer=tokenizer_func_for_label,)
    ])
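With the change above, `O` becomes an explicit class (id 6) rather than just the fallback `unk_id`, and unseen tags now map to the same `O` id. A plain-Python sketch of the lookup this configures; it is not the propeller API, only the mapping behaviour it encodes:

```python
feature_map = {b"B-PER": 0, b"I-PER": 1, b"B-ORG": 2, b"I-ORG": 3,
               b"B-LOC": 4, b"I-LOC": 5, b"O": 6}
other_tag_id = feature_map[b"O"]

def encode_labels(line: bytes):
    # labels arrive '\2'-separated (see tokenizer_func_for_label); anything unseen falls back to 'O'
    return [feature_map.get(tok, other_tag_id) for tok in line.split(b"\2")]

print(encode_labels(b"B-PER\2I-PER\2O\2B-MISC"))   # -> [0, 1, 6, 6]
```

The next hunk then pads label rows with `other_tag_id + 1` (7), a value outside this map, which is why the evaluation code can strip padding with `la != (other_tag_id + 1)`.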
@@ -95,23 +103,23 @@ if __name__ == '__main__':
        orig_pos, _ = tokenizer.truncate(orig_pos, [], args.max_seqlen)
        sentence, segments = tokenizer.build_for_ernie(seg) #utils.data.build_1_pair(seg, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
-       aligned_label = np.concatenate([[-100], aligned_label, [-100]], 0)
-       orig_pos = np.concatenate([[-100], orig_pos, [-100]])
+       aligned_label = np.concatenate([[0], aligned_label, [0]], 0)
+       orig_pos = np.concatenate([[0], orig_pos, [0]])
        assert len(aligned_label) == len(sentence) == len(orig_pos), (len(aligned_label), len(sentence), len(orig_pos)) # aligned
        return sentence, segments, aligned_label, label, orig_pos

    train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=False, use_gz=False) \
            .map(before) \
-           .padded_batch(args.bsz, (0, 0, -100, -100, -100)) \
+           .padded_batch(args.bsz, (0, 0, 0, other_tag_id + 1, 0)) \

    dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
            .map(before) \
-           .padded_batch(args.bsz, (0, 0, -100, -100, -100)) \
+           .padded_batch(args.bsz, (0, 0, 0, other_tag_id + 1, 0)) \

    test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \
            .map(before) \
-           .padded_batch(args.bsz, (0, 0, -100, -100, -100)) \
+           .padded_batch(args.bsz, (0, 0, 0, other_tag_id + 1, 0)) \
@@ -126,45 +134,59 @@ if __name__ == '__main__':
    test_ds.data_types = types

    place = F.CUDAPlace(0)
-   with FD.guard(place):
-       model = ErnieModelForTokenClassification.from_pretrained(args.from_pretrained, num_labels=7, name='')
-
-       opt = AdamW(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters(), weight_decay=0.01)
-       #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters())
-       for epoch in range(args.epoch):
-           for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start(place))):
-               loss, _ = model(ids, sids, labels=aligned_label)
-               loss.backward()
-               if step % 10 == 0 :
-                   log.debug('train loss %.5f' % loss.numpy())
-               opt.minimize(loss)
-               model.clear_gradients()
-               if step % 100 == 0 :
-                   all_pred, all_label = [], []
-                   with FD.base._switch_tracer_mode_guard_(is_train=False):
-                       model.eval()
-                       for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dev_ds.start(place))):
-                           loss, logits = model(ids, sids, labels=aligned_label)
-                           #print('\n'.join(map(str, logits.numpy().tolist())))
-                           for pos, lo, la in zip(orig_pos.numpy(), logits.numpy(), label.numpy()):
-                               _dic = OrderedDict()
-                               for p, l in zip(pos, lo):
-                                   _dic.setdefault(p, []).append(l)
-                               del _dic[-100] # delete cls/sep/pad position
-                               merged_lo = np.array([np.array(l).mean(0) for _, l in six.iteritems(_dic)])
-                               merged_preds = np.argmax(merged_lo, -1)
-                               la = la[np.where(la!=-100)] #remove pad
-                               if len(la) > len(merged_preds):
-                                   log.warn('accuracy loss due to truncation: label len:%d, truncate to %d' % (len(la), len(merged_preds)))
-                                   merged_preds = np.pad(merged_preds, [0, len(la) - len(merged_preds)], mode='constant', constant_values=-100)
-                               all_label.append(la)
-                               all_pred.append(merged_preds)
-                       f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro')
-                       model.train()
-                   log.debug('eval f1: %.5f' % f1)
-       F.save_dygraph(model.state_dict(), './saved')
+
+   @FD.no_grad
+   def evaluate(model, dataset):
+       model.eval()
+       chunkf1 = propeller.metrics.ChunkF1(None, None, None, len(feature_map))
+       for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(dataset.start(place))):
+           loss, logits = model(ids, sids)
+           #print('\n'.join(map(str, logits.numpy().tolist())))
+           assert orig_pos.shape[0] == logits.shape[0] == ids.shape[0] == label.shape[0]
+           for pos, lo, la, id in zip(orig_pos.numpy(), logits.numpy(), label.numpy(), ids.numpy()):
+               _dic = OrderedDict()
+               assert len(pos) == len(lo) == len(id)
+               for _pos, _lo, _id in zip(pos, lo, id):
+                   if _id > tokenizer.mask_id: # [MASK] is the largest special token
+                       _dic.setdefault(_pos, []).append(_lo)
+               merged_lo = np.array([np.array(l).mean(0) for _, l in six.iteritems(_dic)])
+               merged_preds = np.argmax(merged_lo, -1)
+               la = la[np.where(la != (other_tag_id + 1))] #remove pad
+               if len(la) > len(merged_preds):
+                   log.warn('accuracy loss due to truncation: label len:%d, truncate to %d' % (len(la), len(merged_preds)))
+                   merged_preds = np.pad(merged_preds, [0, len(la) - len(merged_preds)], mode='constant', constant_values=7)
+               else:
+                   assert len(la) == len(merged_preds), 'expect label == prediction, got %d vs %d' % (la.shape, merged_preds.shape)
+               chunkf1.update((merged_preds, la, np.array(len(la))))
+       #f1 = f1_score(np.concatenate(all_label), np.concatenate(all_pred), average='macro')
+       f1 = chunkf1.eval()
+       model.train()
+       return f1
+
+   with FD.guard(place):
+       model = ErnieModelForTokenClassification.from_pretrained(args.from_pretrained, num_labels=len(feature_map), name='', has_pooler=False)
+
+       g_clip = F.clip.GradientClipByGlobalNorm(1.0) #experimental
+       opt = AdamW(
+               learning_rate=LinearDecay(args.lr, int(args.warmup_proportion * args.max_steps), args.max_steps),
+               parameter_list=model.parameters(),
+               weight_decay=args.wd, grad_clip=g_clip)
+       #opt = F.optimizer.AdamOptimizer(learning_rate=LinearDecay(args.lr, args.warmup_steps, args.max_steps), parameter_list=model.parameters())
+       for epoch in range(args.epoch):
+           for step, (ids, sids, aligned_label, label, orig_pos) in enumerate(tqdm(train_ds.start(place))):
+               loss, logits = model(ids, sids, labels=aligned_label, loss_weights=L.cast(ids > tokenizer.mask_id, 'float32')) # [MASK] is the largest special token
+               loss.backward()
+               if step % 10 == 0 :
+                   log.debug('train loss %.5f, lr %.3e' % (loss.numpy(), opt.current_step_lr()))
+               opt.minimize(loss)
+               model.clear_gradients()
+               if step % 100 == 0 :
+                   f1 = evaluate(model, dev_ds)
+                   log.debug('eval f1: %.5f' % f1)
+
+       f1 = evaluate(model, dev_ds)
+       log.debug('final eval f1: %.5f' % f1)
+       if args.save_dir is not None:
+           F.save_dygraph(model.state_dict(), args.save_dir)
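Two of the fixes above hinge on masking special tokens by id: training passes `loss_weights=L.cast(ids > tokenizer.mask_id, 'float32')`, and `evaluate` only keeps positions with `_id > tokenizer.mask_id`. A small numpy sketch of the resulting weight mask (the concrete ids below are made up for illustration; the code only relies on `[MASK]` having the largest special-token id):

```python
import numpy as np

mask_id = 3                                    # assumed id of [MASK]; the real value comes from tokenizer.mask_id
ids = np.array([[1, 4721, 8982, 335, 2, 0]])   # [CLS] tok tok tok [SEP] [PAD]; token ids made up
loss_weights = (ids > mask_id).astype('float32')
print(loss_weights)                            # [[0. 1. 1. 1. 0. 0.]] -> special tokens contribute zero loss
```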
@@ -116,7 +116,7 @@ if __name__ == '__main__':
                acc = []
                with FD.base._switch_tracer_mode_guard_(is_train=False):
                    model.eval()
-                   for step, d in enumerate(tqdm(dev_ds.start(), desc='evaluating %d' % epoch)):
+                   for step, d in enumerate(tqdm(dev_ds.start(place), desc='evaluating %d' % epoch)):
                        ids, sids, label = d
                        loss, logits = model(ids, sids, labels=label)
                        #print('\n'.join(map(str, logits.numpy().tolist())))
......
@@ -407,7 +407,7 @@ class ErnieModelForTokenClassification(ErnieModel):
        self.dropout = lambda i: L.dropout(i, dropout_prob=prob, dropout_implementation="upscale_in_train",) if self.training else i

    @add_docstring(ErnieModel.forward.__doc__)
-   def forward(self, *args, **kwargs):
+   def forward(self, *args, ignore_index=-100, labels=None, loss_weights=None, **kwargs, ):
        """
        Args:
            labels (optional, `Variable` of shape [batch_size, seq_len]):
@@ -420,7 +420,6 @@ class ErnieModelForTokenClassification(ErnieModel):
                output logits of classifier
        """
-       labels = kwargs.pop('labels', None)
        pooled, encoded = super(ErnieModelForTokenClassification, self).forward(*args, **kwargs)
        hidden = self.dropout(encoded) # maybe not?
        logits = self.classifier(hidden)
@@ -428,7 +427,9 @@ class ErnieModelForTokenClassification(ErnieModel):
        if labels is not None:
            if len(labels.shape) == 2:
                labels = L.unsqueeze(labels, axes=[-1])
-           loss = L.softmax_with_cross_entropy(logits, labels)
+           loss = L.softmax_with_cross_entropy(logits, labels, ignore_index=ignore_index)
+           if loss_weights is not None:
+               loss = L.squeeze(loss, [-1]) * loss_weights
            loss = L.reduce_mean(loss)
        else:
            loss = None
......
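On the `forward` changes in the hunks above: `ignore_index` zeroes the cross-entropy at positions whose label equals it, and `loss_weights` rescales the per-token loss before the mean. A rough plain-numpy sketch of that behaviour; this is not Paddle's `softmax_with_cross_entropy`, just the effect the two new arguments have on the reduction:

```python
import numpy as np

def token_ce(logits, labels, ignore_index=-100, loss_weights=None):
    # per-token softmax cross-entropy with ignored positions zeroed, then optional re-weighting
    probs = np.exp(logits - logits.max(-1, keepdims=True))
    probs /= probs.sum(-1, keepdims=True)
    picked = np.take_along_axis(probs, np.maximum(labels, 0)[..., None], axis=-1)[..., 0]
    tok_loss = np.where(labels == ignore_index, 0.0, -np.log(picked))
    if loss_weights is not None:
        tok_loss = tok_loss * loss_weights
    return tok_loss.mean()

logits = np.random.rand(1, 4, 7)            # [batch, seq_len, num_labels]
labels = np.array([[-100, 2, 5, -100]])     # CLS/SEP positions carry the ignore_index
print(token_ce(logits, labels))
```

Note that the diff keeps `L.reduce_mean` over all positions, so ignored tokens are averaged in as zeros rather than excluded from the denominator.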