From efdd16d44c1c34b44fad9efbbb3853db8f080b10 Mon Sep 17 00:00:00 2001 From: chenxuyi Date: Mon, 28 Oct 2019 14:42:47 +0800 Subject: [PATCH] + more propeller examples --- example/finetune_classifier.py | 222 ++++++++++++ example/finetune_ner.py | 396 ++++++++++++++++++++++ example/finetune_ranker.py | 265 +++++++++++++++ example/propeller_xnli_demo.ipynb | 541 ++++++++++++++++++++++++++++++ 4 files changed, 1424 insertions(+) create mode 100644 example/finetune_classifier.py create mode 100644 example/finetune_ner.py create mode 100644 example/finetune_ranker.py create mode 100644 example/propeller_xnli_demo.ipynb diff --git a/example/finetune_classifier.py b/example/finetune_classifier.py new file mode 100644 index 0000000..77a68ad --- /dev/null +++ b/example/finetune_classifier.py @@ -0,0 +1,222 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import os +import re +import time +import logging +from random import random +from functools import reduce, partial + +import numpy as np +import multiprocessing + +import paddle +import paddle.fluid as F +import paddle.fluid.layers as L + +from model.ernie import ErnieModel +from optimization import optimization +import utils.data + +from propeller import log +import propeller.paddle as propeller +log.setLevel(logging.DEBUG) + +class ClassificationErnieModel(propeller.train.Model): + """propeller Model wraper for paddle-ERNIE """ + def __init__(self, hparam, mode, run_config): + self.hparam = hparam + self.mode = mode + self.run_config = run_config + + def forward(self, features): + src_ids, sent_ids = features + zero = L.fill_constant([1], dtype='int64', value=0) + input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32') # assume pad id == 0 + #input_mask = L.unsqueeze(input_mask, axes=[2]) + d_shape = L.shape(src_ids) + seqlen = d_shape[1] + batch_size = d_shape[0] + pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0]) + pos_ids = L.expand(pos_ids, [batch_size, 1]) + pos_ids = L.unsqueeze(pos_ids, axes=[2]) + pos_ids = L.cast(pos_ids, 'int64') + pos_ids.stop_gradient = True + input_mask.stop_gradient = True + task_ids = L.zeros_like(src_ids) + self.hparam.task_id #this shit wont use at the moment + task_ids.stop_gradient = True + + ernie = ErnieModel( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + task_ids=task_ids, + input_mask=input_mask, + config=self.hparam, + use_fp16=self.hparam['use_fp16'] + ) + + cls_feats = ernie.get_pooled_output() + + cls_feats = L.dropout( + x=cls_feats, + dropout_prob=0.1, + dropout_implementation="upscale_in_train" + ) + + logits = L.fc( + input=cls_feats, + size=self.hparam['num_label'], + param_attr=F.ParamAttr( + name="cls_out_w", + initializer=F.initializer.TruncatedNormal(scale=0.02)), + bias_attr=F.ParamAttr( + name="cls_out_b", initializer=F.initializer.Constant(0.)) + 
) + + propeller.summary.histogram('pred', logits) + + if self.mode is propeller.RunMode.PREDICT: + probs = L.softmax(logits) + return probs + else: + return logits + + def loss(self, predictions, labels): + ce_loss, probs = L.softmax_with_cross_entropy( + logits=predictions, label=labels, return_softmax=True) + #L.Print(ce_loss, message='per_example_loss') + loss = L.mean(x=ce_loss) + return loss + + def backward(self, loss): + scheduled_lr, _ = optimization( + loss=loss, + warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']), + num_train_steps=self.run_config.max_steps, + learning_rate=self.hparam['learning_rate'], + train_program=F.default_main_program(), + startup_prog=F.default_startup_program(), + weight_decay=self.hparam['weight_decay'], + scheduler="linear_warmup_decay",) + propeller.summary.scalar('lr', scheduled_lr) + + def metrics(self, predictions, label): + predictions = L.argmax(predictions, axis=1) + predictions = L.unsqueeze(predictions, axes=[1]) + acc = propeller.metrics.Acc(label, predictions) + #auc = propeller.metrics.Auc(label, predictions) + return {'acc': acc} + + +if __name__ == '__main__': + parser = propeller.ArgumentParser('classify model with ERNIE') + parser.add_argument('--max_seqlen', type=int, default=128) + parser.add_argument('--data_dir', type=str, required=True) + parser.add_argument('--vocab_file', type=str, required=True) + parser.add_argument('--do_predict', action='store_true') + parser.add_argument('--warm_start_from', type=str) + args = parser.parse_args() + run_config = propeller.parse_runconfig(args) + hparams = propeller.parse_hparam(args) + + + vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.vocab_file, 'rb'))} + sep_id = vocab['[SEP]'] + cls_id = vocab['[CLS]'] + unk_id = vocab['[UNK]'] + + tokenizer = utils.data.CharTokenizer(vocab.keys()) + + def tokenizer_func(inputs): + '''avoid pickle error''' + ret = tokenizer(inputs) + return ret + + if not 
args.do_predict: + feature_column = propeller.data.FeatureColumns([ + propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func), + propeller.data.LabelColumn('label'), + ]) + + def before(seg_a, label): + sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id) + return sentence, segments, label + + def after(sentence, segments, label): + sentence, segments, label = utils.data.expand_dims(sentence, segments, label) + return sentence, segments, label + + log.debug(os.path.join(args.data_dir, 'train')) + train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \ + .map(before) \ + .padded_batch(hparams.batch_size, (0, 0, 0)) \ + .map(after) + + dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ + .map(before) \ + .padded_batch(hparams.batch_size, (0, 0, 0)) \ + .map(after) + + + shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1]) + types = ('int64', 'int64', 'int64') + + train_ds.data_shapes = shapes + train_ds.data_types = types + dev_ds.data_shapes = shapes + dev_ds.data_types = types + + varname_to_warmstart = re.compile('encoder.*|pooled.*|.*embedding|pre_encoder_.*') + warm_start_dir = args.warm_start_from + ws = propeller.WarmStartSetting( + predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)), + from_dir=warm_start_dir + ) + + best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc']) + propeller.train.train_and_eval( + model_class_or_model_fn=ClassificationErnieModel, + params=hparams, + run_config=run_config, + train_dataset=train_ds, + eval_dataset=dev_ds, + warm_start_setting=ws, + exporters=[best_exporter]) + 
print('dev_acc\t%.5f' % (best_exporter._best['eval']['acc'])) + else: + feature_column = propeller.data.FeatureColumns([ + propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func), + propeller.data.LabelColumn('label'), + ]) + def before(seg_a): + sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id) + return sentence, segments + def after(sentence, segments): + sentence, segments = utils.data.expand_dims(sentence, segments) + return sentence, segments + predict_ds = feature_column.build_dataset_from_stdin('predict') \ + .map(before) \ + .padded_batch(hparams.batch_size, (0, 0)) \ + .map(after) + shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1]) + types = ('int64', 'int64') + + predict_ds.data_shapes = shapes + predict_ds.data_types = types + finetuned_model = propeller.Learner(ClassificationErnieModel, run_config, hparams) + for logits, in finetuned_model.predict(predict_ds, ckpt=-1): # ckpt=-1 means last step + print(np.argmax(logits)) + diff --git a/example/finetune_ner.py b/example/finetune_ner.py new file mode 100644 index 0000000..89a9e22 --- /dev/null +++ b/example/finetune_ner.py @@ -0,0 +1,396 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import sys +import os +import re +import time +from random import random +from functools import reduce, partial + +import numpy as np +import multiprocessing +import logging +import six +import re + +import paddle +import paddle.fluid as F +import paddle.fluid.layers as L + + +from model.ernie import ErnieModel +from optimization import optimization +import tokenization +import utils.data + +from propeller import log +log.setLevel(logging.DEBUG) +import propeller.paddle as propeller + +class SequenceLabelErnieModel(propeller.train.Model): + """propeller Model wrapper for paddle-ERNIE """ + def __init__(self, hparam, mode, run_config): + self.hparam = hparam + self.mode = mode + self.run_config = run_config + self.num_label = len(hparam['label_list']) + + def forward(self, features): + src_ids, sent_ids, input_seqlen = features + zero = L.fill_constant([1], dtype='int64', value=0) + input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32') # assume pad id == 0; mask is 1 for real tokens, 0 for pads (matches classifier/ranker) + #input_mask = L.unsqueeze(input_mask, axes=[2]) + d_shape = L.shape(src_ids) + seqlen = d_shape[1] + batch_size = d_shape[0] + pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0]) + pos_ids = L.expand(pos_ids, [batch_size, 1]) + pos_ids = L.unsqueeze(pos_ids, axes=[2]) + pos_ids = L.cast(pos_ids, 'int64') + pos_ids.stop_gradient = True + input_mask.stop_gradient = True + task_ids = L.zeros_like(src_ids) + self.hparam.task_id # task_id input is not used at the moment + task_ids.stop_gradient = True + + model = ErnieModel( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + task_ids=task_ids, + input_mask=input_mask, + config=self.hparam, + use_fp16=self.hparam['use_fp16'] + ) + + enc_out = model.get_sequence_output() + logits = L.fc( + input=enc_out, + size=self.num_label, + num_flatten_dims=2, + param_attr= F.ParamAttr( + name="cls_seq_label_out_w", + initializer= F.initializer.TruncatedNormal(scale=0.02)), + bias_attr=F.ParamAttr( + name="cls_seq_label_out_b", + 
initializer=F.initializer.Constant(0.))) + + propeller.summary.histogram('pred', logits) + + return logits, input_seqlen + + def loss(self, predictions, labels): + logits, input_seqlen = predictions + logits = L.flatten(logits, axis=2) + labels = L.flatten(labels, axis=2) + ce_loss, probs = L.softmax_with_cross_entropy( + logits=logits, label=labels, return_softmax=True) + loss = L.mean(x=ce_loss) + return loss + + def backward(self, loss): + scheduled_lr, _ = optimization( + loss=loss, + warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']), + num_train_steps=self.run_config.max_steps, + learning_rate=self.hparam['learning_rate'], + train_program=F.default_main_program(), + startup_prog=F.default_startup_program(), + weight_decay=self.hparam['weight_decay'], + scheduler="linear_warmup_decay",) + propeller.summary.scalar('lr', scheduled_lr) + + def metrics(self, predictions, label): + pred, seqlen = predictions + pred = L.argmax(pred, axis=-1) + pred = L.unsqueeze(pred, axes=[-1]) + f1 = propeller.metrics.ChunkF1(label, pred, seqlen, self.num_label) + return {'f1': f1} + +def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_size, max_seqlen, is_train): + label_map = {v: i for i, v in enumerate(label_list)} + no_entity_id = label_map['O'] + delimiter = '' + + def read_bio_data(filename): + ds = propeller.data.Dataset.from_file(filename) + iterable = iter(ds) + def gen(): + buf, size = [], 0 + iterator = iter(ds) + while 1: + line = next(iterator) + cols = line.rstrip(b'\n').split(b'\t') + if len(cols) != 2: + continue + tokens = tokenization.convert_to_unicode(cols[0]).split(delimiter) + labels = tokenization.convert_to_unicode(cols[1]).split(delimiter) + if len(tokens) != len(labels) or len(tokens) == 0: + continue + yield [tokens, labels] + + return propeller.data.Dataset.from_generator_func(gen) + + def reseg_token_label(dataset): + def gen(): + iterator = iter(dataset) + while True: + tokens, labels = 
next(iterator) + assert len(tokens) == len(labels) + ret_tokens = [] + ret_labels = [] + for token, label in zip(tokens, labels): + sub_token = tokenizer.tokenize(token) + if len(sub_token) == 0: + continue + ret_tokens.extend(sub_token) + ret_labels.append(label) + if len(sub_token) < 2: + continue + sub_label = label + if label.startswith("B-"): + sub_label = "I-" + label[2:] + ret_labels.extend([sub_label] * (len(sub_token) - 1)) + + assert len(ret_tokens) == len(ret_labels) + yield ret_tokens, ret_labels + + ds = propeller.data.Dataset.from_generator_func(gen) + return ds + + def convert_to_ids(dataset): + def gen(): + iterator = iter(dataset) + while True: + tokens, labels = next(iterator) + if len(tokens) > max_seqlen - 2: + tokens = tokens[: max_seqlen - 2] + labels = labels[: max_seqlen - 2] + + tokens = ['[CLS]'] + tokens + ['[SEP]'] + token_ids = tokenizer.convert_tokens_to_ids(tokens) + label_ids = [no_entity_id] + [label_map[x] for x in labels] + [no_entity_id] + token_type_ids = [0] * len(token_ids) + input_seqlen = len(token_ids) + + token_ids = np.array(token_ids, dtype=np.int64) + label_ids = np.array(label_ids, dtype=np.int64) + token_type_ids = np.array(token_type_ids, dtype=np.int64) + input_seqlen = np.array(input_seqlen, dtype=np.int64) + + yield token_ids, token_type_ids, input_seqlen, label_ids + + ds = propeller.data.Dataset.from_generator_func(gen) + return ds + + def after(*features): + return utils.data.expand_dims(*features) + + dataset = propeller.data.Dataset.from_list(input_files) + if is_train: + dataset = dataset.repeat().shuffle(buffer_size=len(input_files)) + dataset = dataset.interleave(map_fn=read_bio_data, cycle_length=len(input_files), block_length=1) + if is_train: + dataset = dataset.shuffle(buffer_size=100) + dataset = reseg_token_label(dataset) + dataset = convert_to_ids(dataset) + dataset = dataset.padded_batch(batch_size).map(after) + dataset.name = name + return dataset + + +def 
make_sequence_label_dataset_from_stdin(name, tokenizer, batch_size, max_seqlen): + delimiter = '' + + def stdin_gen(): + if six.PY3: + source = sys.stdin.buffer + else: + source = sys.stdin + while True: + line = source.readline() + if len(line) == 0: + break + yield line, + + def read_bio_data(ds): + iterable = iter(ds) + def gen(): + buf, size = [], 0 + iterator = iter(ds) + while 1: + line, = next(iterator) + cols = line.rstrip(b'\n').split(b'\t') + if len(cols) != 1: + continue + tokens = tokenization.convert_to_unicode(cols[0]).split(delimiter) + if len(tokens) == 0: + continue + yield tokens, + return propeller.data.Dataset.from_generator_func(gen) + + def reseg_token_label(dataset): + def gen(): + iterator = iter(dataset) + while True: + tokens, = next(iterator) + ret_tokens = [] + for token in tokens: + sub_token = tokenizer.tokenize(token) + if len(sub_token) == 0: + continue + ret_tokens.extend(sub_token) + if len(sub_token) < 2: + continue + yield ret_tokens, + ds = propeller.data.Dataset.from_generator_func(gen) + return ds + + def convert_to_ids(dataset): + def gen(): + iterator = iter(dataset) + while True: + tokens, = next(iterator) + if len(tokens) > max_seqlen - 2: + tokens = tokens[: max_seqlen - 2] + + tokens = ['[CLS]'] + tokens + ['[SEP]'] + token_ids = tokenizer.convert_tokens_to_ids(tokens) + token_type_ids = [0] * len(token_ids) + input_seqlen = len(token_ids) + + token_ids = np.array(token_ids, dtype=np.int64) + token_type_ids = np.array(token_type_ids, dtype=np.int64) + input_seqlen = np.array(input_seqlen, dtype=np.int64) + yield token_ids, token_type_ids, input_seqlen + + ds = propeller.data.Dataset.from_generator_func(gen) + return ds + + def after(*features): + return utils.data.expand_dims(*features) + + dataset = propeller.data.Dataset.from_generator_func(stdin_gen) + dataset = read_bio_data(dataset) + dataset = reseg_token_label(dataset) + dataset = convert_to_ids(dataset) + dataset = dataset.padded_batch(batch_size).map(after) + 
dataset.name = name + return dataset + + +if __name__ == '__main__': + parser = propeller.ArgumentParser('NER model with ERNIE') + parser.add_argument('--max_seqlen', type=int, default=128) + parser.add_argument('--data_dir', type=str, required=True) + parser.add_argument('--vocab_file', type=str, required=True) + parser.add_argument('--do_predict', action='store_true') + parser.add_argument('--warm_start_from', type=str) + args = parser.parse_args() + run_config = propeller.parse_runconfig(args) + hparams = propeller.parse_hparam(args) + + tokenizer = tokenization.FullTokenizer(args.vocab_file) + vocab = tokenizer.vocab + sep_id = vocab['[SEP]'] + cls_id = vocab['[CLS]'] + unk_id = vocab['[UNK]'] + pad_id = vocab['[PAD]'] + + label_list = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O'] + hparams['label_list'] = label_list + + if not args.do_predict: + train_data_dir = os.path.join(args.data_dir, 'train') + train_input_files = [os.path.join(train_data_dir, filename) for filename in os.listdir(train_data_dir)] + dev_data_dir = os.path.join(args.data_dir, 'dev') + dev_input_files = [os.path.join(dev_data_dir, filename) for filename in os.listdir(dev_data_dir)] + test_data_dir = os.path.join(args.data_dir, 'test') + test_input_files = [os.path.join(test_data_dir, filename) for filename in os.listdir(test_data_dir)] + + train_ds = make_sequence_label_dataset(name='train', + input_files=train_input_files, + label_list=label_list, + tokenizer=tokenizer, + batch_size=hparams.batch_size, + max_seqlen=args.max_seqlen, + is_train=True) + dev_ds = make_sequence_label_dataset(name='dev', + input_files=dev_input_files, + label_list=label_list, + tokenizer=tokenizer, + batch_size=hparams.batch_size, + max_seqlen=args.max_seqlen, + is_train=False) + test_ds = make_sequence_label_dataset(name='test', + input_files=test_input_files, + label_list=label_list, + tokenizer=tokenizer, + batch_size=hparams.batch_size, + max_seqlen=args.max_seqlen, + is_train=False) + + shapes 
= ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1], [-1, args.max_seqlen, 1]) + types = ('int64', 'int64', 'int64', 'int64') + + train_ds.data_shapes = shapes + train_ds.data_types = types + dev_ds.data_shapes = shapes + dev_ds.data_types = types + test_ds.data_shapes = shapes + test_ds.data_types = types + + varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$') + warm_start_dir = args.warm_start_from + ws = propeller.WarmStartSetting( + predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)), + from_dir=warm_start_dir + ) + + best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1']) + propeller.train.train_and_eval( + model_class_or_model_fn=SequenceLabelErnieModel, + params=hparams, + run_config=run_config, + train_dataset=train_ds, + eval_dataset={'dev': dev_ds, 'test': test_ds}, + warm_start_setting=ws, + exporters=[best_exporter]) + + for k in best_exporter._best['dev'].keys(): + if 'loss' in k: + continue + dev_v = best_exporter._best['dev'][k] + test_v = best_exporter._best['test'][k] + print('dev_%s\t%.5f\ntest_%s\t%.5f' % (k, dev_v, k, test_v)) + else: + predict_ds = make_sequence_label_dataset_from_stdin(name='pred', + tokenizer=tokenizer, + batch_size=hparams.batch_size, + max_seqlen=args.max_seqlen) + + shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1]) + types = ('int64', 'int64', 'int64') + + predict_ds.data_shapes = shapes + predict_ds.data_types = types + + rev_label_map = {i: v for i, v in enumerate(label_list)} + best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1']) + learner = propeller.Learner(SequenceLabelErnieModel, run_config, hparams) + for pred, _ in learner.predict(predict_ds, ckpt=-1): + 
pred_str = ' '.join([rev_label_map[idx] for idx in np.argmax(pred, 1).tolist()]) + print(pred_str) + + diff --git a/example/finetune_ranker.py b/example/finetune_ranker.py new file mode 100644 index 0000000..db40b26 --- /dev/null +++ b/example/finetune_ranker.py @@ -0,0 +1,265 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import re +import time +import logging +import six +import sys +import io + +from random import random +from functools import reduce, partial, wraps + +import numpy as np +import multiprocessing +import re + +import paddle +import paddle.fluid as F +import paddle.fluid.layers as L + + +from model.ernie import ErnieModel +from optimization import optimization +import utils.data + +from propeller import log +import propeller.paddle as propeller +log.setLevel(logging.DEBUG) + +class RankingErnieModel(propeller.train.Model): + """propeller Model wraper for paddle-ERNIE """ + def __init__(self, hparam, mode, run_config): + self.hparam = hparam + self.mode = mode + self.run_config = run_config + + def forward(self, features): + src_ids, sent_ids, qid = features + + zero = L.fill_constant([1], dtype='int64', value=0) + input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32') # assume pad id == 0 + #input_mask = L.unsqueeze(input_mask, axes=[2]) + d_shape = L.shape(src_ids) + seqlen = d_shape[1] + batch_size = d_shape[0] + pos_ids = L.unsqueeze(L.range(0, seqlen, 
1, dtype='int32'), axes=[0]) + pos_ids = L.expand(pos_ids, [batch_size, 1]) + pos_ids = L.unsqueeze(pos_ids, axes=[2]) + pos_ids = L.cast(pos_ids, 'int64') + pos_ids.stop_gradient = True + input_mask.stop_gradient = True + task_ids = L.zeros_like(src_ids) + self.hparam.task_id #this shit wont use at the moment + task_ids.stop_gradient = True + + ernie = ErnieModel( + src_ids=src_ids, + position_ids=pos_ids, + sentence_ids=sent_ids, + task_ids=task_ids, + input_mask=input_mask, + config=self.hparam, + use_fp16=self.hparam['use_fp16'] + ) + + cls_feats = ernie.get_pooled_output() + + cls_feats = L.dropout( + x=cls_feats, + dropout_prob=0.1, + dropout_implementation="upscale_in_train" + ) + + logits = L.fc( + input=cls_feats, + size=self.hparam['num_label'], + param_attr=F.ParamAttr( + name="cls_out_w", + initializer=F.initializer.TruncatedNormal(scale=0.02)), + bias_attr=F.ParamAttr( + name="cls_out_b", initializer=F.initializer.Constant(0.)) + ) + + propeller.summary.histogram('pred', logits) + + if self.mode is propeller.RunMode.PREDICT: + probs = L.softmax(logits) + return qid, probs + else: + return qid, logits + + def loss(self, predictions, labels): + qid, predictions = predictions + ce_loss, probs = L.softmax_with_cross_entropy( + logits=predictions, label=labels, return_softmax=True) + #L.Print(ce_loss, message='per_example_loss') + loss = L.mean(x=ce_loss) + return loss + + def metrics(self, predictions, label): + qid, logits = predictions + + positive_class_logits = L.slice(logits, axes=[1], starts=[1], ends=[2]) + mrr = propeller.metrics.Mrr(qid, label, positive_class_logits) + + predictions = L.argmax(logits, axis=1) + predictions = L.unsqueeze(predictions, axes=[1]) + f1 = propeller.metrics.F1(label, predictions) + acc = propeller.metrics.Acc(label, predictions) + #auc = propeller.metrics.Auc(label, predictions) + + return {'acc': acc, 'f1': f1, 'mrr': mrr} + + def backward(self, loss): + scheduled_lr, _ = optimization( + loss=loss, + 
warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']), + num_train_steps=self.run_config.max_steps, + learning_rate=self.hparam['learning_rate'], + train_program=F.default_main_program(), + startup_prog=F.default_startup_program(), + weight_decay=self.hparam['weight_decay'], + scheduler="linear_warmup_decay",) + propeller.summary.scalar('lr', scheduled_lr) + + + +if __name__ == '__main__': + parser = propeller.ArgumentParser('ranker model with ERNIE') + parser.add_argument('--do_predict', action='store_true') + parser.add_argument('--predict_model', type=str, default=None) + parser.add_argument('--max_seqlen', type=int, default=128) + parser.add_argument('--vocab_file', type=str, required=True) + parser.add_argument('--data_dir', type=str, required=True) + parser.add_argument('--warm_start_from', type=str) + parser.add_argument('--sentence_piece_model', type=str, default=None) + args = parser.parse_args() + run_config = propeller.parse_runconfig(args) + hparams = propeller.parse_hparam(args) + + + vocab = {j.strip().split(b'\t')[0].decode('utf8') : i for i, j in enumerate(open(args.vocab_file, 'rb'))} + sep_id = vocab['[SEP]'] + cls_id = vocab['[CLS]'] + unk_id = vocab['[UNK]'] + + if args.sentence_piece_model is not None: + tokenizer = utils.data.JBSPTokenizer(args.sentence_piece_model, jb=True, lower=True) + else: + tokenizer = utils.data.CharTokenizer(vocab.keys()) + + def tokenizer_func(inputs): + '''avoid pickle error''' + ret = tokenizer(inputs) + return ret + + + shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1], [-1, 1]) + types = ('int64', 'int64', 'int64', 'int64') + + + if not args.do_predict: + feature_column = propeller.data.FeatureColumns([ + propeller.data.LabelColumn('qid'), + propeller.data.TextColumn('title', vocab_dict=vocab, tokenizer=tokenizer_func, unk_id=unk_id), + propeller.data.TextColumn('comment', vocab_dict=vocab, tokenizer=tokenizer_func, unk_id=unk_id), + propeller.data.LabelColumn('label'), 
+ ]) + + def before(qid, seg_a, seg_b, label): + sentence, segments = utils.data.build_2_pair(seg_a, seg_b, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id) + return sentence, segments, qid, label + + def after(sentence, segments, qid, label): + sentence, segments, qid, label = utils.data.expand_dims(sentence, segments, qid, label) + return sentence, segments, qid, label + + + + train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \ + .map(before) \ + .padded_batch(hparams.batch_size, (0, 0, 0, 0)) \ + .map(after) + + dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \ + .map(before) \ + .padded_batch(hparams.batch_size, (0, 0, 0, 0)) \ + .map(after) + + test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \ + .map(before) \ + .padded_batch(hparams.batch_size, (0, 0, 0, 0)) \ + .map(after) + + train_ds.data_shapes = shapes + train_ds.data_types = types + dev_ds.data_shapes = shapes + dev_ds.data_types = types + test_ds.data_shapes = shapes + test_ds.data_types = types + + varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$') + warm_start_dir = args.warm_start_from + ws = propeller.WarmStartSetting( + predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)), + from_dir=warm_start_dir + ) + + best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1']) + propeller.train_and_eval( + model_class_or_model_fn=RankingErnieModel, + params=hparams, + run_config=run_config, + train_dataset=train_ds, + eval_dataset={'dev': dev_ds, 'test': test_ds}, + warm_start_setting=ws, + exporters=[best_exporter]) + + 
print('dev_mrr\t%.5f\ntest_mrr\t%.5f\ndev_f1\t%.5f\ntest_f1\t%.5f' % ( + best_exporter._best['dev']['mrr'], best_exporter._best['test']['mrr'], + best_exporter._best['dev']['f1'], best_exporter._best['test']['f1'], + )) + else: + feature_column = propeller.data.FeatureColumns([ + propeller.data.LabelColumn('qid'), + propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func), + propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func), + ]) + + def before(qid, seg_a, seg_b): + sentence, segments = utils.data.build_2_pair(seg_a, seg_b, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id) + return sentence, segments, qid + + def after(sentence, segments, qid): + sentence, segments, qid = utils.data.expand_dims(sentence, segments, qid) + return sentence, segments, qid + + predict_ds = feature_column.build_dataset_from_stdin('predict') \ + .map(before) \ + .padded_batch(hparams.batch_size, (0, 0, 0)) \ + .map(after) + + predict_ds.data_shapes = shapes[: -1] + predict_ds.data_types = types[: -1] + + est = propeller.Learner(RankingErnieModel, run_config, hparams) + for qid, res in est.predict(predict_ds, ckpt=-1): + print('%d\t%d\t%.5f\t%.5f' % (qid[0], np.argmax(res), res[0], res[1])) + #for i in predict_ds: + # sen = i[0] + # for ss in np.squeeze(sen): + # print(' '.join(map(str, ss))) + diff --git a/example/propeller_xnli_demo.ipynb b/example/propeller_xnli_demo.ipynb new file mode 100644 index 0000000..c4f6150 --- /dev/null +++ b/example/propeller_xnli_demo.ipynb @@ -0,0 +1,541 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "import numpy as np\n", + "import re\n", + "import logging\n", + "import json" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sys.path.append('../ernie')\n", + "sys.path.append('../')\n", + 
"%env CUDA_VISIBLE_DEVICES=7\n", + "# if CUDA_VISIBLE_DEVICES is changed, relaunch jupyter kernel to inform paddle" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import propeller.paddle as propeller\n", + "import paddle\n", + "import paddle.fluid as F\n", + "import paddle.fluid.layers as L\n", + "#import model definition from original ERNIE\n", + "from model.ernie import ErnieModel\n", + "from tokenization import FullTokenizer\n", + "from optimization import optimization\n", + "from propeller import log\n", + "log.setLevel(logging.DEBUG)\n", + "\n", + "if paddle.__version__ not in ['1.5.1', '1.5.2']:\n", + " raise RuntimeError('propeller works in paddle1.5.1')\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "%%bash\n", + "# download pretrained model&config(ernie1.0) and xnli data\n", + "mkdir ernie1.0_pretrained\n", + "if [ ! -f ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz ]\n", + "then\n", + " echo \"download model\"\n", + " wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz -P ernie1.0_pretrained\n", + "fi\n", + "\n", + "if [ ! 
-f task_data_zh.tgz ]\n", + "then\n", + " echo \"download data\"\n", + " wget --no-check-certificate https://ernie.bj.bcebos.com/task_data_zh.tgz\n", + "fi\n", + "\n", + "tar xzf ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz -C ernie1.0_pretrained\n", + "tar xzf task_data_zh.tgz" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#define basic training settings\n", + "EPOCH=3\n", + "BATCH=16\n", + "LR=5e-3\n", + "MAX_SEQLEN=128\n", + "TASK_DATA='./task_data/'\n", + "MODEL='./ernie1.0_pretrained/'\n", + "OUTPUT_DIR='./output'" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!rm -rf {OUTPUT_DIR}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "#skip header, and reorganize train data into ./xnli_data \n", + "!mkdir xnli_data\n", + "!mkdir xnli_data/train\n", + "!mkdir xnli_data/test\n", + "!mkdir xnli_data/dev\n", + "\n", + "def remove_header_and_save(fname_in, fname_out):\n", + " with open(fname_out, 'w') as fout:\n", + " buf = open(fname_in).readlines()[1:]\n", + " for i in buf:\n", + " fout.write(i)\n", + " return len(buf)\n", + "train_data_size = remove_header_and_save(TASK_DATA + '/xnli/train.tsv', './xnli_data/train/part.0') \n", + "dev_data_size = remove_header_and_save(TASK_DATA + '/xnli/dev.tsv', './xnli_data/dev/part.0') \n", + "test_data_size = remove_header_and_save(TASK_DATA + '/xnli/test.tsv', './xnli_data/test/part.0') \n", + "print(train_data_size)\n", + "print(dev_data_size)\n", + "print(test_data_size)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "tokenizer = FullTokenizer(MODEL + 'vocab.txt')\n", + "vocab = {j.strip().split('\\t')[0]: i for i, j in enumerate(open(MODEL + 'vocab.txt', encoding='utf8'))}\n", + "\n", + "print(tokenizer.tokenize('今天很热'))\n", + 
"print(tokenizer.tokenize('coding in paddle is cool'))\n", + "print(tokenizer.tokenize('[CLS]i have an pen')) # note: special token like [CLS], will be segmented, so please add these id after tokenization.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`propeller.data.FeatureColumns` defines the data schema in every data file.\n", + "\n", + "our data consist of 3 columns: seg_a, seg_b, label. with \"\\t\" as delemeter.\n", + "\n", + "`TextColumn` will do 3 things for you: \n", + "\n", + "1. tokenize input sentence with user-defined `tokenizer_func`\n", + "2. vocab lookup\n", + "3. serialize to protobuf bin file (optional)\n", + "\n", + "data file is organized into following patten:\n", + "\n", + "```script\n", + "./xnli_data\n", + "|-- dev\n", + "| `-- part.0\n", + "|-- test\n", + "| `-- part.0\n", + "|-- train\n", + " `-- part.0\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "sep_id = vocab['[SEP]']\n", + "cls_id = vocab['[CLS]']\n", + "unk_id = vocab['[UNK]']\n", + "\n", + "label_map = {\n", + " b\"contradictory\": 0,\n", + " b\"contradiction\": 0,\n", + " b\"entailment\": 1,\n", + " b\"neutral\": 2,\n", + "}\n", + "def tokenizer_func(inputs):\n", + " ret = tokenizer.tokenize(inputs) #`tokenize` will conver bytes to str, so we use a str vocab\n", + " return ret\n", + "\n", + "feature_column = propeller.data.FeatureColumns([\n", + " propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),\n", + " propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),\n", + " propeller.data.LabelColumn('label', vocab_dict=label_map), #be careful, Columns deal with python3 bytes directly.\n", + "])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## trian model in propeller can be defined in 2 ways:\n", + "1. 
subclass of `propeller.train.Model` which implements:\n",
+ " 1. `__init__` (hyper_param, mode, run_config)\n",
+ " 2. `forward` (features) => (prediction)\n",
+ " 3. `backward` (loss) => None\n",
+ " 4. `loss` (prediction) => (loss)\n",
+ " 5. `metrics` (optional) (prediction) => (dict of propeller.Metrics)\n",
+ " \n",
+ "2. a callable takes following args:\n",
+ " 1. features\n",
+ " 2. param\n",
+ " 3. mode\n",
+ " 4. run_config(optional)\n",
+ " \n",
+ " and returns a propeller.ModelSpec\n",
+ " \n",
+ "we use the subclass approach here"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class ClassificationErnieModel(propeller.train.Model):\n",
+ " def __init__(self, hparam, mode, run_config):\n",
+ " self.hparam = hparam\n",
+ " self.mode = mode\n",
+ " self.run_config = run_config\n",
+ "\n",
+ " def forward(self, features):\n",
+ " src_ids, sent_ids = features\n",
+ " dtype = 'float16' if self.hparam['use_fp16'] else 'float32'\n",
+ " zero = L.fill_constant([1], dtype='int64', value=0)\n",
+ " input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype) # assume pad id == 0; mask is 1 for real tokens, 0 for padding\n",
+ " #input_mask = L.unsqueeze(input_mask, axes=[2])\n",
+ " d_shape = L.shape(src_ids)\n",
+ " seqlen = d_shape[1]\n",
+ " batch_size = d_shape[0]\n",
+ " pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])\n",
+ " pos_ids = L.expand(pos_ids, [batch_size, 1])\n",
+ " pos_ids = L.unsqueeze(pos_ids, axes=[2])\n",
+ " pos_ids = L.cast(pos_ids, 'int64')\n",
+ " pos_ids.stop_gradient = True\n",
+ " input_mask.stop_gradient = True\n",
+ " task_ids = L.zeros_like(src_ids) + self.hparam.task_id #task_id is not used at the moment\n",
+ " task_ids.stop_gradient = True\n",
+ "\n",
+ " ernie = ErnieModel(\n",
+ " src_ids=src_ids,\n",
+ " position_ids=pos_ids,\n",
+ " sentence_ids=sent_ids,\n",
+ " task_ids=task_ids,\n",
+ " input_mask=input_mask,\n",
+ " config=self.hparam,\n",
+ " use_fp16=self.hparam['use_fp16']\n",
+ " 
)\n", + "\n", + " cls_feats = ernie.get_pooled_output()\n", + "\n", + " cls_feats = L.dropout(\n", + " x=cls_feats,\n", + " dropout_prob=0.1,\n", + " dropout_implementation=\"upscale_in_train\"\n", + " )\n", + "\n", + " logits = L.fc(\n", + " input=cls_feats,\n", + " size=self.hparam['num_label'],\n", + " param_attr=F.ParamAttr(\n", + " name=\"cls_out_w\",\n", + " initializer=F.initializer.TruncatedNormal(scale=0.02)),\n", + " bias_attr=F.ParamAttr(\n", + " name=\"cls_out_b\", initializer=F.initializer.Constant(0.))\n", + " )\n", + "\n", + " propeller.summary.histogram('pred', logits)\n", + "\n", + " if self.mode is propeller.RunMode.PREDICT:\n", + " probs = L.softmax(logits)\n", + " return probs\n", + " else:\n", + " return logits\n", + "\n", + " def loss(self, predictions, labels):\n", + " ce_loss, probs = L.softmax_with_cross_entropy(\n", + " logits=predictions, label=labels, return_softmax=True)\n", + " #L.Print(ce_loss, message='per_example_loss')\n", + " loss = L.mean(x=ce_loss)\n", + " return loss\n", + "\n", + " def backward(self, loss):\n", + " scheduled_lr, loss_scale = optimization(\n", + " loss=loss,\n", + " warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),\n", + " num_train_steps=self.run_config.max_steps,\n", + " learning_rate=self.hparam['learning_rate'],\n", + " train_program=F.default_main_program(),\n", + " startup_prog=F.default_startup_program(),\n", + " weight_decay=self.hparam['weight_decay'],\n", + " scheduler=\"linear_warmup_decay\",)\n", + " propeller.summary.scalar('lr', scheduled_lr)\n", + "\n", + " def metrics(self, predictions, label):\n", + " predictions = L.argmax(predictions, axis=1)\n", + " predictions = L.unsqueeze(predictions, axes=[1])\n", + " acc = propeller.metrics.Acc(label, predictions)\n", + " #auc = propeller.metrics.Auc(label, predictions)\n", + " return {'acc': acc}\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# 
define some utility function.\n", + "\n", + "def build_2_pair(seg_a, seg_b):\n", + " token_type_a = np.ones_like(seg_a, dtype=np.int64) * 0\n", + " token_type_b = np.ones_like(seg_b, dtype=np.int64) * 1\n", + " sen_emb = np.concatenate([[cls_id], seg_a, [sep_id], seg_b, [sep_id]], 0)\n", + " token_type_emb = np.concatenate([[0], token_type_a, [0], token_type_b, [1]], 0)\n", + " #seqlen = sen_emb.shape[0]\n", + " #deteministic truncate\n", + " sen_emb = sen_emb[0: MAX_SEQLEN]\n", + " token_type_emb = token_type_emb[0: MAX_SEQLEN]\n", + " return sen_emb, token_type_emb\n", + "\n", + "def expand_dims(*args):\n", + " func = lambda i: np.expand_dims(i, -1)\n", + " ret = [func(i) for i in args]\n", + " return ret\n", + "\n", + "def before_pad(seg_a, seg_b, label):\n", + " sentence, segments = build_2_pair(seg_a, seg_b)\n", + " return sentence, segments, label\n", + "\n", + "def after_pad(sentence, segments, label):\n", + " sentence, segments, label = expand_dims(sentence, segments, label)\n", + " return sentence, segments, label" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# a `propeller.paddle.data.Dataset` is built from FeatureColumns\n", + "\n", + "train_ds = feature_column.build_dataset('train', use_gz=False, data_dir='./xnli_data/train', shuffle=True, repeat=True) \\\n", + " .map(before_pad) \\\n", + " .padded_batch(BATCH, (0, 0, 0)) \\\n", + " .map(after_pad)\n", + "\n", + "dev_ds = feature_column.build_dataset('dev', use_gz=False, data_dir='./xnli_data/dev', shuffle=False, repeat=False) \\\n", + " .map(before_pad) \\\n", + " .padded_batch(BATCH, (0, 0, 0)) \\\n", + " .map(after_pad)\n", + "\n", + "shapes = ([-1, MAX_SEQLEN, 1], [-1, MAX_SEQLEN, 1], [-1, 1])\n", + "types = ('int64', 'int64', 'int64')\n", + "train_ds.data_shapes = shapes\n", + "train_ds.data_types = types\n", + "dev_ds.data_shapes = shapes\n", + "dev_ds.data_types = types\n", + "\n", + "warm_start_dir = MODEL + '/params'\n", + "# 
only the encoder and embedding are loaded from pretrained model\n",
+ "varname_to_warmstart = re.compile('^encoder.*w_0$|^encoder.*b_0$|^.*embedding$|^.*bias$|^.*scale$')\n",
+ "ws = propeller.WarmStartSetting(\n",
+ " predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),\n",
+ " from_dir=warm_start_dir\n",
+ " )\n",
+ "\n",
+ "# propeller will export model of highest performance, the criteria is up to you. \n",
+ "# here we pick the model with maximum evaluation accuracy.\n",
+ "#`BestInferenceModelExporter` is used to export servable models\n",
+ "best_inference_exporter = propeller.train.exporter.BestInferenceModelExporter(\n",
+ " os.path.join(OUTPUT_DIR, 'best'), \n",
+ " cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])\n",
+ "#`BestExporter` is used to export restartable checkpoint, so that we can restore from it and check test-set accuracy.\n",
+ "best_exporter = propeller.train.exporter.BestExporter(\n",
+ " os.path.join(OUTPUT_DIR, 'best_model'), \n",
+ " cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#ERNIE1.0 config \n",
+ "ernie_config = propeller.HParams(**json.loads(open(MODEL + '/ernie_config.json').read()))\n",
+ "\n",
+ "# default term in official config\n",
+ "ernie_v2_config = propeller.HParams(**{\n",
+ " \"sent_type_vocab_size\": None, \n",
+ " \"use_task_id\": False,\n",
+ " \"task_id\": 0,\n",
+ "})\n",
+ "\n",
+ "# train schema\n",
+ "train_config = propeller.HParams(**{ \n",
+ " \"warmup_proportion\": 0.1,\n",
+ " \"weight_decay\": 0.01,\n",
+ " \"use_fp16\": 0,\n",
+ " \"learning_rate\": 0.00005,\n",
+ " \"num_label\": 3,\n",
+ " \"batch_size\": 32\n",
+ "})\n",
+ "\n",
+ "config = ernie_config.join(ernie_v2_config).join(train_config)\n",
+ "\n",
+ "run_config = propeller.RunConfig(\n",
+ " model_dir=OUTPUT_DIR,\n",
+ " 
max_steps=EPOCH * train_data_size // BATCH,\n",
+ " skip_steps=10,\n",
+ " eval_steps=1000,\n",
+ " save_steps=1000,\n",
+ " log_steps=10,\n",
+ " max_ckpt=3\n",
+ ")\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Finetune and Eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# `train_and_eval` takes key-word args only\n",
+ "# we are now ready to train\n",
+ "hooks = [propeller.train.TqdmNotebookProgressBarHook(run_config.max_steps)] # to show the progress bar, you need to `pip install tqdm ipywidgets`\n",
+ "propeller.train_and_eval(\n",
+ " model_class_or_model_fn=ClassificationErnieModel, #**careful**, you should pass a Class to `train_and_eval`, propeller will try to instantiate it.\n",
+ " params=config, \n",
+ " run_config=run_config, \n",
+ " train_dataset=train_ds, \n",
+ " eval_dataset=dev_ds, \n",
+ " warm_start_setting=ws, \n",
+ " exporters=[best_exporter, best_inference_exporter],\n",
+ " train_hooks=hooks,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Predict"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# after training you might want to check your model performance on test-set\n",
+ "# let's do this via `propeller.predict`\n",
+ "# keep in mind that model of best performance has been exported during the `train_and_eval` phase\n",
+ "\n",
+ "best_filename = [file for file in os.listdir(os.path.join(OUTPUT_DIR, 'best_model')) if 'model' in file][0]\n",
+ "best_model_path = os.path.join(os.path.join(OUTPUT_DIR, 'best_model'), best_filename)\n",
+ "true_label = [label_map[(line.strip().split(b'\\t')[-1])]for line in open('./xnli_data/test/part.0', 'rb')]\n",
+ "\n",
+ "def drop_label(sentence, segments, label): #we drop the label column here\n",
+ " return sentence, segments\n",
+ "\n",
+ "test_ds = 
feature_column.build_dataset('test', use_gz=False, data_dir='./xnli_data/test', shuffle=False, repeat=False) \\\n", + " .map(before_pad) \\\n", + " .padded_batch(BATCH, (0, 0, 0)) \\\n", + " .map(after_pad) \\\n", + " .map(drop_label)\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result = []\n", + "learner = propeller.Learner(ClassificationErnieModel, run_config, params=config, )\n", + "for pred in learner.predict(test_ds, ckpt=-1):\n", + " result.append(np.argmax(pred))\n", + " \n", + "result, true_label = np.array(result), np.array(true_label)\n", + "\n", + "test_acc = (result == true_label).sum() / len(true_label)\n", + "print('test accuracy:%.5f' % test_acc)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Serving\n", + "your model is now ready to serve! \n", + "you can open up a server by propeller with \n", + "```script\n", + "python -m propeller.tools.start_server -m /path/to/saved/model -p 8888\n", + "```\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.5" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} -- GitLab