提交 efdd16d4 编写于 作者: C chenxuyi

+ more propeller examples

上级 dd26cf3a
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
import logging
from random import random
from functools import reduce, partial
import numpy as np
import multiprocessing
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from model.ernie import ErnieModel
from optimization import optimization
import utils.data
from propeller import log
import propeller.paddle as propeller
log.setLevel(logging.DEBUG)
class ClassificationErnieModel(propeller.train.Model):
    """Propeller Model wrapper around paddle-ERNIE for text classification.

    `forward` builds the static graph for one classification pass; `loss`,
    `backward` and `metrics` attach the training objective, optimizer and
    evaluation ops respectively.
    """
    def __init__(self, hparam, mode, run_config):
        self.hparam = hparam
        self.mode = mode
        self.run_config = run_config

    def forward(self, features):
        """Build the graph; returns probabilities in PREDICT mode, else logits."""
        src_ids, sent_ids = features
        pad_value = L.fill_constant([1], dtype='int64', value=0)
        # 1.0 on real tokens, 0.0 on padding (pad id assumed to be 0).
        attn_mask = L.cast(L.logical_not(L.equal(src_ids, pad_value)), 'float32')
        dyn_shape = L.shape(src_ids)
        n_steps = dyn_shape[1]
        n_rows = dyn_shape[0]
        # Position ids [0..seqlen) replicated across the batch, shaped [b, s, 1].
        positions = L.unsqueeze(L.range(0, n_steps, 1, dtype='int32'), axes=[0])
        positions = L.expand(positions, [n_rows, 1])
        positions = L.unsqueeze(positions, axes=[2])
        positions = L.cast(positions, 'int64')
        positions.stop_gradient = True
        attn_mask.stop_gradient = True
        # Constant task-id tensor; not meaningfully used by the model yet.
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id
        task_ids.stop_gradient = True
        ernie = ErnieModel(
            src_ids=src_ids,
            position_ids=positions,
            sentence_ids=sent_ids,
            task_ids=task_ids,
            input_mask=attn_mask,
            config=self.hparam,
            use_fp16=self.hparam['use_fp16'])
        pooled = ernie.get_pooled_output()
        pooled = L.dropout(
            x=pooled,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        logits = L.fc(
            input=pooled,
            size=self.hparam['num_label'],
            param_attr=F.ParamAttr(
                name="cls_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(
                name="cls_out_b", initializer=F.initializer.Constant(0.)))
        propeller.summary.histogram('pred', logits)
        if self.mode is propeller.RunMode.PREDICT:
            return L.softmax(logits)
        return logits

    def loss(self, predictions, labels):
        """Mean softmax cross-entropy over the batch."""
        per_example_loss, _ = L.softmax_with_cross_entropy(
            logits=predictions, label=labels, return_softmax=True)
        return L.mean(x=per_example_loss)

    def backward(self, loss):
        """Attach linear-warmup-decay optimization and log the scheduled LR."""
        lr, _ = optimization(
            loss=loss,
            warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
            num_train_steps=self.run_config.max_steps,
            learning_rate=self.hparam['learning_rate'],
            train_program=F.default_main_program(),
            startup_prog=F.default_startup_program(),
            weight_decay=self.hparam['weight_decay'],
            scheduler="linear_warmup_decay",)
        propeller.summary.scalar('lr', lr)

    def metrics(self, predictions, label):
        """Accuracy over hard (argmax) class predictions."""
        hard_pred = L.unsqueeze(L.argmax(predictions, axis=1), axes=[1])
        acc = propeller.metrics.Acc(label, hard_pred)
        return {'acc': acc}
if __name__ == '__main__':
    # --- command line / run configuration ------------------------------------
    parser = propeller.ArgumentParser('classify model with ERNIE')
    parser.add_argument('--max_seqlen', type=int, default=128)
    parser.add_argument('--data_dir', type=str, required=True)
    parser.add_argument('--vocab_file', type=str, required=True)
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--warm_start_from', type=str)
    args = parser.parse_args()
    run_config = propeller.parse_runconfig(args)
    hparams = propeller.parse_hparam(args)
    # Vocab file: token id == line number; first tab-separated field is the token.
    vocab = {j.strip().split(b'\t')[0].decode('utf8'): i for i, j in enumerate(open(args.vocab_file, 'rb'))}
    sep_id = vocab['[SEP]']
    cls_id = vocab['[CLS]']
    unk_id = vocab['[UNK]']
    tokenizer = utils.data.CharTokenizer(vocab.keys())
    def tokenizer_func(inputs):
        '''Module-level wrapper around `tokenizer` so the data pipeline can pickle it.'''
        ret = tokenizer(inputs)
        return ret
    if not args.do_predict:
        # --- training / evaluation path --------------------------------------
        # Input files are TSV with columns: title, label.
        feature_column = propeller.data.FeatureColumns([
            propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
            propeller.data.LabelColumn('label'),
        ])
        def before(seg_a, label):
            # Pre-batch map: add [CLS]/[SEP] and build segment ids for one text.
            sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
            return sentence, segments, label
        def after(sentence, segments, label):
            # Post-batch map: add a trailing dim so batches match [-1, seqlen, 1].
            sentence, segments, label = utils.data.expand_dims(sentence, segments, label)
            return sentence, segments, label
        log.debug(os.path.join(args.data_dir, 'train'))
        train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
                .map(before) \
                .padded_batch(hparams.batch_size, (0, 0, 0)) \
                .map(after)
        dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                .map(before) \
                .padded_batch(hparams.batch_size, (0, 0, 0)) \
                .map(after)
        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
        types = ('int64', 'int64', 'int64')
        train_ds.data_shapes = shapes
        train_ds.data_types = types
        dev_ds.data_shapes = shapes
        dev_ds.data_types = types
        # Warm start only encoder/pooler/embedding parameters whose files exist.
        # NOTE(review): this regex differs from the raw-string pattern used by the
        # other scripts in this repo -- confirm both select the intended weights.
        varname_to_warmstart = re.compile('encoder.*|pooled.*|.*embedding|pre_encoder_.*')
        warm_start_dir = args.warm_start_from
        ws = propeller.WarmStartSetting(
                predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
                from_dir=warm_start_dir
        )
        # NOTE(review): results are keyed by 'eval' here; presumably that is the
        # name propeller gives a single (non-dict) eval dataset -- confirm.
        best_exporter = propeller.train.exporter.BestInferenceModelExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])
        propeller.train.train_and_eval(
                model_class_or_model_fn=ClassificationErnieModel,
                params=hparams,
                run_config=run_config,
                train_dataset=train_ds,
                eval_dataset=dev_ds,
                warm_start_setting=ws,
                exporters=[best_exporter])
        print('dev_acc\t%.5f' % (best_exporter._best['eval']['acc']))
    else:
        # --- prediction path: read text from stdin, print the argmax label ---
        feature_column = propeller.data.FeatureColumns([
            propeller.data.TextColumn('title',unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
            propeller.data.LabelColumn('label'),
        ])
        def before(seg_a):
            # Pre-batch map: add [CLS]/[SEP] and build segment ids (no label).
            sentence, segments = utils.data.build_1_pair(seg_a, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
            return sentence, segments
        def after(sentence, segments):
            # Post-batch map: add a trailing dim so batches match [-1, seqlen, 1].
            sentence, segments = utils.data.expand_dims(sentence, segments)
            return sentence, segments
        predict_ds = feature_column.build_dataset_from_stdin('predict') \
                .map(before) \
                .padded_batch(hparams.batch_size, (0, 0)) \
                .map(after)
        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1])
        types = ('int64', 'int64')
        predict_ds.data_shapes = shapes
        predict_ds.data_types = types
        finetuned_model = propeller.Learner(ClassificationErnieModel, run_config, hparams)
        for logits, in finetuned_model.predict(predict_ds, ckpt=-1):  # ckpt=-1 means last step
            print(np.argmax(logits))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import os
import re
import time
from random import random
from functools import reduce, partial
import numpy as np
import multiprocessing
import logging
import six
import re
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from model.ernie import ErnieModel
from optimization import optimization
import tokenization
import utils.data
from propeller import log
log.setLevel(logging.DEBUG)
import propeller.paddle as propeller
class SequenceLabelErnieModel(propeller.train.Model):
    """Propeller Model wrapper around paddle-ERNIE for sequence labeling (NER).

    `forward` returns per-token logits plus the unpadded sequence length so
    that chunk-level F1 can ignore padding positions.
    """
    def __init__(self, hparam, mode, run_config):
        self.hparam = hparam
        self.mode = mode
        self.run_config = run_config
        # Number of output tags, derived from the configured label list.
        self.num_label = len(hparam['label_list'])

    def forward(self, features):
        """Build the tagging graph; returns (per-token logits, input_seqlen)."""
        src_ids, sent_ids, input_seqlen = features
        zero = L.fill_constant([1], dtype='int64', value=0)
        # BUGFIX: the attention mask must be 1.0 on real tokens and 0.0 on
        # padding (pad id == 0). The previous plain `L.equal` produced the
        # inverted mask, unlike ClassificationErnieModel / RankingErnieModel
        # in this repo, which both wrap it in L.logical_not.
        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), 'float32')
        d_shape = L.shape(src_ids)
        seqlen = d_shape[1]
        batch_size = d_shape[0]
        # Position ids [0..seqlen) replicated across the batch, shaped [b, s, 1].
        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
        pos_ids = L.expand(pos_ids, [batch_size, 1])
        pos_ids = L.unsqueeze(pos_ids, axes=[2])
        pos_ids = L.cast(pos_ids, 'int64')
        pos_ids.stop_gradient = True
        input_mask.stop_gradient = True
        # Constant task-id tensor; not meaningfully used by the model yet.
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id
        task_ids.stop_gradient = True
        model = ErnieModel(
            src_ids=src_ids,
            position_ids=pos_ids,
            sentence_ids=sent_ids,
            task_ids=task_ids,
            input_mask=input_mask,
            config=self.hparam,
            use_fp16=self.hparam['use_fp16']
        )
        enc_out = model.get_sequence_output()
        logits = L.fc(
            input=enc_out,
            size=self.num_label,
            num_flatten_dims=2,
            param_attr=F.ParamAttr(
                name="cls_seq_label_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(
                name="cls_seq_label_out_b",
                initializer=F.initializer.Constant(0.)))
        propeller.summary.histogram('pred', logits)
        return logits, input_seqlen

    def loss(self, predictions, labels):
        """Mean per-token softmax cross-entropy (padding positions included)."""
        logits, input_seqlen = predictions
        logits = L.flatten(logits, axis=2)
        labels = L.flatten(labels, axis=2)
        ce_loss, probs = L.softmax_with_cross_entropy(
            logits=logits, label=labels, return_softmax=True)
        loss = L.mean(x=ce_loss)
        return loss

    def backward(self, loss):
        """Attach linear-warmup-decay optimization and log the scheduled LR."""
        scheduled_lr, _ = optimization(
            loss=loss,
            warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
            num_train_steps=self.run_config.max_steps,
            learning_rate=self.hparam['learning_rate'],
            train_program=F.default_main_program(),
            startup_prog=F.default_startup_program(),
            weight_decay=self.hparam['weight_decay'],
            scheduler="linear_warmup_decay",)
        propeller.summary.scalar('lr', scheduled_lr)

    def metrics(self, predictions, label):
        """Chunk-level F1 computed over the unpadded prefix of each sequence."""
        pred, seqlen = predictions
        pred = L.argmax(pred, axis=-1)
        pred = L.unsqueeze(pred, axes=[-1])
        f1 = propeller.metrics.ChunkF1(label, pred, seqlen, self.num_label)
        return {'f1': f1}
def make_sequence_label_dataset(name, input_files, label_list, tokenizer, batch_size, max_seqlen, is_train):
    """Build a propeller dataset of (token_ids, token_type_ids, input_seqlen,
    label_ids) batches from BIO-format sequence-labeling files.

    Each input line is `text<TAB>tags` with characters and tags joined by
    `delimiter`. Training mode repeats and shuffles; labels are re-aligned
    when the wordpiece tokenizer splits a token into several sub-tokens.
    Cleanup: removed the unused `iterable = iter(ds)` (which eagerly created
    an extra iterator) and the unused `buf, size` locals.
    """
    label_map = {v: i for i, v in enumerate(label_list)}
    no_entity_id = label_map['O']  # 'O' doubles as the label for [CLS]/[SEP]
    # NOTE(review): the delimiter looks like it contains a non-printable
    # separator byte that renders as an empty string here; str.split('') would
    # raise ValueError, so confirm the original byte (ERNIE data commonly
    # uses '\x02').
    delimiter = ''

    def read_bio_data(filename):
        """One BIO file -> stream of [tokens, labels]; malformed lines skipped."""
        ds = propeller.data.Dataset.from_file(filename)
        def gen():
            iterator = iter(ds)
            while 1:
                line = next(iterator)
                cols = line.rstrip(b'\n').split(b'\t')
                if len(cols) != 2:
                    continue
                tokens = tokenization.convert_to_unicode(cols[0]).split(delimiter)
                labels = tokenization.convert_to_unicode(cols[1]).split(delimiter)
                if len(tokens) != len(labels) or len(tokens) == 0:
                    continue
                yield [tokens, labels]
        return propeller.data.Dataset.from_generator_func(gen)

    def reseg_token_label(dataset):
        """Re-tokenize with the wordpiece tokenizer, replicating each label
        onto its sub-tokens (a leading B-X continues as I-X)."""
        def gen():
            iterator = iter(dataset)
            while True:
                tokens, labels = next(iterator)
                assert len(tokens) == len(labels)
                ret_tokens = []
                ret_labels = []
                for token, label in zip(tokens, labels):
                    sub_token = tokenizer.tokenize(token)
                    if len(sub_token) == 0:
                        continue
                    ret_tokens.extend(sub_token)
                    ret_labels.append(label)
                    if len(sub_token) < 2:
                        continue
                    sub_label = label
                    if label.startswith("B-"):
                        sub_label = "I-" + label[2:]
                    ret_labels.extend([sub_label] * (len(sub_token) - 1))
                assert len(ret_tokens) == len(ret_labels)
                yield ret_tokens, ret_labels
        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def convert_to_ids(dataset):
        """Truncate to max_seqlen, add [CLS]/[SEP], map tokens/labels to ids."""
        def gen():
            iterator = iter(dataset)
            while True:
                tokens, labels = next(iterator)
                if len(tokens) > max_seqlen - 2:  # reserve room for [CLS]/[SEP]
                    tokens = tokens[: max_seqlen - 2]
                    labels = labels[: max_seqlen - 2]
                tokens = ['[CLS]'] + tokens + ['[SEP]']
                token_ids = tokenizer.convert_tokens_to_ids(tokens)
                label_ids = [no_entity_id] + [label_map[x] for x in labels] + [no_entity_id]
                token_type_ids = [0] * len(token_ids)
                input_seqlen = len(token_ids)
                token_ids = np.array(token_ids, dtype=np.int64)
                label_ids = np.array(label_ids, dtype=np.int64)
                token_type_ids = np.array(token_type_ids, dtype=np.int64)
                input_seqlen = np.array(input_seqlen, dtype=np.int64)
                yield token_ids, token_type_ids, input_seqlen, label_ids
        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(*features):
        # Add a trailing dim so batches match the [-1, seqlen, 1] placeholders.
        return utils.data.expand_dims(*features)

    dataset = propeller.data.Dataset.from_list(input_files)
    if is_train:
        dataset = dataset.repeat().shuffle(buffer_size=len(input_files))
    dataset = dataset.interleave(map_fn=read_bio_data, cycle_length=len(input_files), block_length=1)
    if is_train:
        dataset = dataset.shuffle(buffer_size=100)
    dataset = reseg_token_label(dataset)
    dataset = convert_to_ids(dataset)
    dataset = dataset.padded_batch(batch_size).map(after)
    dataset.name = name
    return dataset
def make_sequence_label_dataset_from_stdin(name, tokenizer, batch_size, max_seqlen):
    """Build a prediction dataset of (token_ids, token_type_ids, input_seqlen)
    batches from lines read on stdin (no labels).

    Cleanup: removed the unused `iterable = iter(ds)` and `buf, size` locals,
    and a trailing no-op `if len(sub_token) < 2: continue` left over from the
    labeled variant of this pipeline.
    """
    # NOTE(review): the delimiter looks like it contains a non-printable
    # separator byte that renders as an empty string here; str.split('') would
    # raise ValueError, so confirm the original byte (ERNIE data commonly
    # uses '\x02').
    delimiter = ''

    def stdin_gen():
        # Read raw bytes on Python 3 so downstream code can treat lines as bytes.
        if six.PY3:
            source = sys.stdin.buffer
        else:
            source = sys.stdin
        while True:
            line = source.readline()
            if len(line) == 0:  # EOF
                break
            yield line,

    def read_bio_data(ds):
        """Split raw lines into token lists; malformed/empty lines are skipped."""
        def gen():
            iterator = iter(ds)
            while 1:
                line, = next(iterator)
                cols = line.rstrip(b'\n').split(b'\t')
                if len(cols) != 1:
                    continue
                tokens = tokenization.convert_to_unicode(cols[0]).split(delimiter)
                if len(tokens) == 0:
                    continue
                yield tokens,
        return propeller.data.Dataset.from_generator_func(gen)

    def reseg_token_label(dataset):
        """Re-tokenize each token with the wordpiece tokenizer."""
        def gen():
            iterator = iter(dataset)
            while True:
                tokens, = next(iterator)
                ret_tokens = []
                for token in tokens:
                    sub_token = tokenizer.tokenize(token)
                    if len(sub_token) == 0:
                        continue
                    ret_tokens.extend(sub_token)
                yield ret_tokens,
        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def convert_to_ids(dataset):
        """Truncate to max_seqlen, add [CLS]/[SEP], map tokens to ids."""
        def gen():
            iterator = iter(dataset)
            while True:
                tokens, = next(iterator)
                if len(tokens) > max_seqlen - 2:  # reserve room for [CLS]/[SEP]
                    tokens = tokens[: max_seqlen - 2]
                tokens = ['[CLS]'] + tokens + ['[SEP]']
                token_ids = tokenizer.convert_tokens_to_ids(tokens)
                token_type_ids = [0] * len(token_ids)
                input_seqlen = len(token_ids)
                token_ids = np.array(token_ids, dtype=np.int64)
                token_type_ids = np.array(token_type_ids, dtype=np.int64)
                input_seqlen = np.array(input_seqlen, dtype=np.int64)
                yield token_ids, token_type_ids, input_seqlen
        ds = propeller.data.Dataset.from_generator_func(gen)
        return ds

    def after(*features):
        # Add a trailing dim so batches match the [-1, seqlen, 1] placeholders.
        return utils.data.expand_dims(*features)

    dataset = propeller.data.Dataset.from_generator_func(stdin_gen)
    dataset = read_bio_data(dataset)
    dataset = reseg_token_label(dataset)
    dataset = convert_to_ids(dataset)
    dataset = dataset.padded_batch(batch_size).map(after)
    dataset.name = name
    return dataset
if __name__ == '__main__':
    # --- command line / run configuration ------------------------------------
    parser = propeller.ArgumentParser('NER model with ERNIE')
    parser.add_argument('--max_seqlen', type=int, default=128)
    parser.add_argument('--data_dir', type=str, required=True)
    parser.add_argument('--vocab_file', type=str, required=True)
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--warm_start_from', type=str)
    args = parser.parse_args()
    run_config = propeller.parse_runconfig(args)
    hparams = propeller.parse_hparam(args)
    tokenizer = tokenization.FullTokenizer(args.vocab_file)
    vocab = tokenizer.vocab
    sep_id = vocab['[SEP]']
    cls_id = vocab['[CLS]']
    unk_id = vocab['[UNK]']
    pad_id = vocab['[PAD]']
    # PER/ORG/LOC BIO tag set; 'O' is also used for the [CLS]/[SEP] positions.
    label_list = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG', 'B-LOC', 'I-LOC', 'O']
    hparams['label_list'] = label_list
    if not args.do_predict:
        # --- training / evaluation path --------------------------------------
        # Every file under train/dev/test is treated as one input shard.
        train_data_dir = os.path.join(args.data_dir, 'train')
        train_input_files = [os.path.join(train_data_dir, filename) for filename in os.listdir(train_data_dir)]
        dev_data_dir = os.path.join(args.data_dir, 'dev')
        dev_input_files = [os.path.join(dev_data_dir, filename) for filename in os.listdir(dev_data_dir)]
        test_data_dir = os.path.join(args.data_dir, 'test')
        test_input_files = [os.path.join(test_data_dir, filename) for filename in os.listdir(test_data_dir)]
        train_ds = make_sequence_label_dataset(name='train',
                                               input_files=train_input_files,
                                               label_list=label_list,
                                               tokenizer=tokenizer,
                                               batch_size=hparams.batch_size,
                                               max_seqlen=args.max_seqlen,
                                               is_train=True)
        dev_ds = make_sequence_label_dataset(name='dev',
                                             input_files=dev_input_files,
                                             label_list=label_list,
                                             tokenizer=tokenizer,
                                             batch_size=hparams.batch_size,
                                             max_seqlen=args.max_seqlen,
                                             is_train=False)
        test_ds = make_sequence_label_dataset(name='test',
                                              input_files=test_input_files,
                                              label_list=label_list,
                                              tokenizer=tokenizer,
                                              batch_size=hparams.batch_size,
                                              max_seqlen=args.max_seqlen,
                                              is_train=False)
        # Placeholder shapes/dtypes: token ids, segment ids, seqlen, label ids.
        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1], [-1, args.max_seqlen, 1])
        types = ('int64', 'int64', 'int64', 'int64')
        train_ds.data_shapes = shapes
        train_ds.data_types = types
        dev_ds.data_shapes = shapes
        dev_ds.data_types = types
        test_ds.data_shapes = shapes
        test_ds.data_types = types
        # Warm start only parameters whose names match and whose files exist.
        varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
        warm_start_dir = args.warm_start_from
        ws = propeller.WarmStartSetting(
                predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
                from_dir=warm_start_dir
        )
        # Keep the checkpoint with the best dev F1.
        best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
        propeller.train.train_and_eval(
                model_class_or_model_fn=SequenceLabelErnieModel,
                params=hparams,
                run_config=run_config,
                train_dataset=train_ds,
                eval_dataset={'dev': dev_ds, 'test': test_ds},
                warm_start_setting=ws,
                exporters=[best_exporter])
        # Report best-dev metrics and the matching test metrics, skipping loss.
        for k in best_exporter._best['dev'].keys():
            if 'loss' in k:
                continue
            dev_v = best_exporter._best['dev'][k]
            test_v = best_exporter._best['test'][k]
            print('dev_%s\t%.5f\ntest_%s\t%.5f' % (k, dev_v, k, test_v))
    else:
        # --- prediction path: sentences from stdin, tag sequences to stdout ---
        predict_ds = make_sequence_label_dataset_from_stdin(name='pred',
                                                            tokenizer=tokenizer,
                                                            batch_size=hparams.batch_size,
                                                            max_seqlen=args.max_seqlen)
        shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1])
        types = ('int64', 'int64', 'int64')
        predict_ds.data_shapes = shapes
        predict_ds.data_types = types
        rev_label_map = {i: v for i, v in enumerate(label_list)}
        # NOTE(review): best_exporter is constructed but never used on this path.
        best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
        learner = propeller.Learner(SequenceLabelErnieModel, run_config, hparams)
        for pred, _ in learner.predict(predict_ds, ckpt=-1):  # ckpt=-1: last checkpoint
            pred_str = ' '.join([rev_label_map[idx] for idx in np.argmax(pred, 1).tolist()])
            print(pred_str)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import re
import time
import logging
import six
import sys
import io
from random import random
from functools import reduce, partial, wraps
import numpy as np
import multiprocessing
import re
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
from model.ernie import ErnieModel
from optimization import optimization
import utils.data
from propeller import log
import propeller.paddle as propeller
log.setLevel(logging.DEBUG)
class RankingErnieModel(propeller.train.Model):
    """Propeller Model wrapper around paddle-ERNIE for pairwise ranking.

    `forward` threads the query id (qid) through alongside the logits so the
    ranking metric (MRR) can group rows belonging to the same query.
    """
    def __init__(self, hparam, mode, run_config):
        self.hparam = hparam
        self.mode = mode
        self.run_config = run_config

    def forward(self, features):
        """Build the graph; returns (qid, probs) in PREDICT mode, else (qid, logits)."""
        src_ids, sent_ids, qid = features
        pad_value = L.fill_constant([1], dtype='int64', value=0)
        # 1.0 on real tokens, 0.0 on padding (pad id assumed to be 0).
        attn_mask = L.cast(L.logical_not(L.equal(src_ids, pad_value)), 'float32')
        dyn_shape = L.shape(src_ids)
        n_steps = dyn_shape[1]
        n_rows = dyn_shape[0]
        # Position ids [0..seqlen) replicated across the batch, shaped [b, s, 1].
        positions = L.unsqueeze(L.range(0, n_steps, 1, dtype='int32'), axes=[0])
        positions = L.expand(positions, [n_rows, 1])
        positions = L.unsqueeze(positions, axes=[2])
        positions = L.cast(positions, 'int64')
        positions.stop_gradient = True
        attn_mask.stop_gradient = True
        # Constant task-id tensor; not meaningfully used by the model yet.
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id
        task_ids.stop_gradient = True
        ernie = ErnieModel(
            src_ids=src_ids,
            position_ids=positions,
            sentence_ids=sent_ids,
            task_ids=task_ids,
            input_mask=attn_mask,
            config=self.hparam,
            use_fp16=self.hparam['use_fp16'])
        pooled = ernie.get_pooled_output()
        pooled = L.dropout(
            x=pooled,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train")
        logits = L.fc(
            input=pooled,
            size=self.hparam['num_label'],
            param_attr=F.ParamAttr(
                name="cls_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(
                name="cls_out_b", initializer=F.initializer.Constant(0.)))
        propeller.summary.histogram('pred', logits)
        if self.mode is propeller.RunMode.PREDICT:
            return qid, L.softmax(logits)
        return qid, logits

    def loss(self, predictions, labels):
        """Mean softmax cross-entropy; the qid part of predictions is ignored."""
        _, logits = predictions
        per_example_loss, _ = L.softmax_with_cross_entropy(
            logits=logits, label=labels, return_softmax=True)
        return L.mean(x=per_example_loss)

    def metrics(self, predictions, label):
        """Accuracy, F1 and MRR (MRR ranks rows per qid by the class-1 logit)."""
        qid, logits = predictions
        positive_class_logits = L.slice(logits, axes=[1], starts=[1], ends=[2])
        mrr = propeller.metrics.Mrr(qid, label, positive_class_logits)
        hard_pred = L.unsqueeze(L.argmax(logits, axis=1), axes=[1])
        f1 = propeller.metrics.F1(label, hard_pred)
        acc = propeller.metrics.Acc(label, hard_pred)
        return {'acc': acc, 'f1': f1, 'mrr': mrr}

    def backward(self, loss):
        """Attach linear-warmup-decay optimization and log the scheduled LR."""
        lr, _ = optimization(
            loss=loss,
            warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
            num_train_steps=self.run_config.max_steps,
            learning_rate=self.hparam['learning_rate'],
            train_program=F.default_main_program(),
            startup_prog=F.default_startup_program(),
            weight_decay=self.hparam['weight_decay'],
            scheduler="linear_warmup_decay",)
        propeller.summary.scalar('lr', lr)
if __name__ == '__main__':
    # --- command line / run configuration ------------------------------------
    parser = propeller.ArgumentParser('ranker model with ERNIE')
    parser.add_argument('--do_predict', action='store_true')
    parser.add_argument('--predict_model', type=str, default=None)
    parser.add_argument('--max_seqlen', type=int, default=128)
    parser.add_argument('--vocab_file', type=str, required=True)
    parser.add_argument('--data_dir', type=str, required=True)
    parser.add_argument('--warm_start_from', type=str)
    parser.add_argument('--sentence_piece_model', type=str, default=None)
    args = parser.parse_args()
    run_config = propeller.parse_runconfig(args)
    hparams = propeller.parse_hparam(args)
    # Vocab file: token id == line number; first tab-separated field is the token.
    vocab = {j.strip().split(b'\t')[0].decode('utf8') : i for i, j in enumerate(open(args.vocab_file, 'rb'))}
    sep_id = vocab['[SEP]']
    cls_id = vocab['[CLS]']
    unk_id = vocab['[UNK]']
    # Sentence-piece tokenizer when a model file is given, else char tokenizer.
    if args.sentence_piece_model is not None:
        tokenizer = utils.data.JBSPTokenizer(args.sentence_piece_model, jb=True, lower=True)
    else:
        tokenizer = utils.data.CharTokenizer(vocab.keys())
    def tokenizer_func(inputs):
        '''Module-level wrapper around `tokenizer` so the data pipeline can pickle it.'''
        ret = tokenizer(inputs)
        return ret
    # Placeholder shapes/dtypes: token ids, segment ids, qid, label.
    shapes = ([-1, args.max_seqlen, 1], [-1, args.max_seqlen, 1], [-1, 1], [-1, 1])
    types = ('int64', 'int64', 'int64', 'int64')
    if not args.do_predict:
        # --- training / evaluation path --------------------------------------
        # Input files are TSV with columns: qid, title, comment, label.
        feature_column = propeller.data.FeatureColumns([
            propeller.data.LabelColumn('qid'),
            propeller.data.TextColumn('title', vocab_dict=vocab, tokenizer=tokenizer_func, unk_id=unk_id),
            propeller.data.TextColumn('comment', vocab_dict=vocab, tokenizer=tokenizer_func, unk_id=unk_id),
            propeller.data.LabelColumn('label'),
        ])
        def before(qid, seg_a, seg_b, label):
            # Pre-batch: join the two texts into [CLS] a [SEP] b [SEP] + segments.
            sentence, segments = utils.data.build_2_pair(seg_a, seg_b, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
            return sentence, segments, qid, label
        def after(sentence, segments, qid, label):
            # Post-batch: add a trailing dim so batches match the placeholders.
            sentence, segments, qid, label = utils.data.expand_dims(sentence, segments, qid, label)
            return sentence, segments, qid, label
        train_ds = feature_column.build_dataset('train', data_dir=os.path.join(args.data_dir, 'train'), shuffle=True, repeat=True, use_gz=False) \
                .map(before) \
                .padded_batch(hparams.batch_size, (0, 0, 0, 0)) \
                .map(after)
        dev_ds = feature_column.build_dataset('dev', data_dir=os.path.join(args.data_dir, 'dev'), shuffle=False, repeat=False, use_gz=False) \
                .map(before) \
                .padded_batch(hparams.batch_size, (0, 0, 0, 0)) \
                .map(after)
        test_ds = feature_column.build_dataset('test', data_dir=os.path.join(args.data_dir, 'test'), shuffle=False, repeat=False, use_gz=False) \
                .map(before) \
                .padded_batch(hparams.batch_size, (0, 0, 0, 0)) \
                .map(after)
        train_ds.data_shapes = shapes
        train_ds.data_types = types
        dev_ds.data_shapes = shapes
        dev_ds.data_types = types
        test_ds.data_shapes = shapes
        test_ds.data_types = types
        # Warm start only parameters whose names match and whose files exist.
        varname_to_warmstart = re.compile(r'^encoder.*[wb]_0$|^.*embedding$|^.*bias$|^.*scale$|^pooled_fc.[wb]_0$')
        warm_start_dir = args.warm_start_from
        ws = propeller.WarmStartSetting(
                predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),
                from_dir=warm_start_dir
        )
        # Keep the checkpoint with the best dev F1.
        best_exporter = propeller.train.exporter.BestExporter(os.path.join(run_config.model_dir, 'best'), cmp_fn=lambda old, new: new['dev']['f1'] > old['dev']['f1'])
        # NOTE(review): the other scripts call propeller.train.train_and_eval;
        # presumably propeller.train_and_eval is the same callable -- confirm.
        propeller.train_and_eval(
                model_class_or_model_fn=RankingErnieModel,
                params=hparams,
                run_config=run_config,
                train_dataset=train_ds,
                eval_dataset={'dev': dev_ds, 'test': test_ds},
                warm_start_setting=ws,
                exporters=[best_exporter])
        print('dev_mrr\t%.5f\ntest_mrr\t%.5f\ndev_f1\t%.5f\ntest_f1\t%.5f' % (
            best_exporter._best['dev']['mrr'], best_exporter._best['test']['mrr'],
            best_exporter._best['dev']['f1'], best_exporter._best['test']['f1'],
        ))
    else:
        # --- prediction path: qid/title/comment rows from stdin ---------------
        feature_column = propeller.data.FeatureColumns([
            propeller.data.LabelColumn('qid'),
            propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
            propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),
        ])
        def before(qid, seg_a, seg_b):
            # Pre-batch: join the two texts into [CLS] a [SEP] b [SEP] + segments.
            sentence, segments = utils.data.build_2_pair(seg_a, seg_b, max_seqlen=args.max_seqlen, cls_id=cls_id, sep_id=sep_id)
            return sentence, segments, qid
        def after(sentence, segments, qid):
            # Post-batch: add a trailing dim so batches match the placeholders.
            sentence, segments, qid = utils.data.expand_dims(sentence, segments, qid)
            return sentence, segments, qid
        predict_ds = feature_column.build_dataset_from_stdin('predict') \
                .map(before) \
                .padded_batch(hparams.batch_size, (0, 0, 0)) \
                .map(after)
        # Drop the label slot from shapes/types on the predict path.
        predict_ds.data_shapes = shapes[: -1]
        predict_ds.data_types = types[: -1]
        est = propeller.Learner(RankingErnieModel, run_config, hparams)
        # Emit: qid, predicted class, prob of class 0, prob of class 1.
        for qid, res in est.predict(predict_ds, ckpt=-1):  # ckpt=-1: last checkpoint
            print('%d\t%d\t%.5f\t%.5f' % (qid[0], np.argmax(res), res[0], res[1]))
        #for i in predict_ds:
        #    sen = i[0]
        #    for ss in np.squeeze(sen):
        #        print(' '.join(map(str, ss)))
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import os\n",
"import numpy as np\n",
"import re\n",
"import logging\n",
"import json"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"sys.path.append('../ernie')\n",
"sys.path.append('../')\n",
    "%env CUDA_VISIBLE_DEVICES=7\n",
    "# if CUDA_VISIBLE_DEVICES is changed, relaunch jupyter kernel to inform paddle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import propeller.paddle as propeller\n",
"import paddle\n",
"import paddle.fluid as F\n",
"import paddle.fluid.layers as L\n",
    "#import model definition from original ERNIE\n",
"from model.ernie import ErnieModel\n",
"from tokenization import FullTokenizer\n",
"from optimization import optimization\n",
"from propeller import log\n",
"log.setLevel(logging.DEBUG)\n",
"\n",
"if paddle.__version__ not in ['1.5.1', '1.5.2']:\n",
" raise RuntimeError('propeller works in paddle1.5.1')\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"%%bash\n",
"# download pretrained model&config(ernie1.0) and xnli data\n",
"mkdir ernie1.0_pretrained\n",
"if [ ! -f ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz ]\n",
"then\n",
" echo \"download model\"\n",
" wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz -P ernie1.0_pretrained\n",
"fi\n",
"\n",
"if [ ! -f task_data_zh.tgz ]\n",
"then\n",
" echo \"download data\"\n",
" wget --no-check-certificate https://ernie.bj.bcebos.com/task_data_zh.tgz\n",
"fi\n",
"\n",
"tar xzf ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz -C ernie1.0_pretrained\n",
"tar xzf task_data_zh.tgz"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#define basic training settings\n",
"EPOCH=3\n",
"BATCH=16\n",
"LR=5e-3\n",
"MAX_SEQLEN=128\n",
"TASK_DATA='./task_data/'\n",
"MODEL='./ernie1.0_pretrained/'\n",
"OUTPUT_DIR='./output'"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!rm -rf {OUTPUT_DIR}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#skip header, and reorganize train data into ./xnli_data \n",
"!mkdir xnli_data\n",
"!mkdir xnli_data/train\n",
"!mkdir xnli_data/test\n",
"!mkdir xnli_data/dev\n",
"\n",
"def remove_header_and_save(fname_in, fname_out):\n",
" with open(fname_out, 'w') as fout:\n",
" buf = open(fname_in).readlines()[1:]\n",
" for i in buf:\n",
" fout.write(i)\n",
" return len(buf)\n",
"train_data_size = remove_header_and_save(TASK_DATA + '/xnli/train.tsv', './xnli_data/train/part.0') \n",
"dev_data_size = remove_header_and_save(TASK_DATA + '/xnli/dev.tsv', './xnli_data/dev/part.0') \n",
"test_data_size = remove_header_and_save(TASK_DATA + '/xnli/test.tsv', './xnli_data/test/part.0') \n",
"print(train_data_size)\n",
"print(dev_data_size)\n",
"print(test_data_size)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"tokenizer = FullTokenizer(MODEL + 'vocab.txt')\n",
"vocab = {j.strip().split('\\t')[0]: i for i, j in enumerate(open(MODEL + 'vocab.txt', encoding='utf8'))}\n",
"\n",
"print(tokenizer.tokenize('今天很热'))\n",
"print(tokenizer.tokenize('coding in paddle is cool'))\n",
"print(tokenizer.tokenize('[CLS]i have an pen')) # note: special token like [CLS], will be segmented, so please add these id after tokenization.\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"`propeller.data.FeatureColumns` defines the data schema in every data file.\n",
"\n",
    "our data consists of 3 columns: seg_a, seg_b, label, with \"\\t\" as delimiter.\n",
"\n",
"`TextColumn` will do 3 things for you: \n",
"\n",
"1. tokenize input sentence with user-defined `tokenizer_func`\n",
"2. vocab lookup\n",
"3. serialize to protobuf bin file (optional)\n",
"\n",
"data file is organized into following patten:\n",
"\n",
"```script\n",
"./xnli_data\n",
"|-- dev\n",
"| `-- part.0\n",
"|-- test\n",
"| `-- part.0\n",
"|-- train\n",
" `-- part.0\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"sep_id = vocab['[SEP]']\n",
"cls_id = vocab['[CLS]']\n",
"unk_id = vocab['[UNK]']\n",
"\n",
"label_map = {\n",
" b\"contradictory\": 0,\n",
" b\"contradiction\": 0,\n",
" b\"entailment\": 1,\n",
" b\"neutral\": 2,\n",
"}\n",
"def tokenizer_func(inputs):\n",
    "    ret = tokenizer.tokenize(inputs) #`tokenize` will convert bytes to str, so we use a str vocab\n",
" return ret\n",
"\n",
"feature_column = propeller.data.FeatureColumns([\n",
" propeller.data.TextColumn('title', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),\n",
" propeller.data.TextColumn('comment', unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func),\n",
" propeller.data.LabelColumn('label', vocab_dict=label_map), #be careful, Columns deal with python3 bytes directly.\n",
"])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
    "## A model in propeller can be defined in 2 ways:\n",
"1. subclass of `propeller.train.Model` which implements:\n",
" 1. `__init__` (hyper_param, mode, run_config)\n",
" 2. `forward` (features) => (prediction)\n",
    "    3. `backward` (loss) => None\n",
    "    4. `loss` (prediction) => (loss)\n",
" 5. `metrics` (optional) (prediction) => (dict of propeller.Metrics)\n",
" \n",
"2. a callable takes following args:\n",
" 1. features\n",
" 2. param\n",
" 3. mode\n",
" 4. run_config(optional)\n",
" \n",
" and returns a propeller.ModelSpec\n",
" \n",
    "we use the subclass approach here"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"class ClassificationErnieModel(propeller.train.Model):\n",
" def __init__(self, hparam, mode, run_config):\n",
" self.hparam = hparam\n",
" self.mode = mode\n",
" self.run_config = run_config\n",
"\n",
" def forward(self, features):\n",
" src_ids, sent_ids = features\n",
" dtype = 'float16' if self.hparam['use_fp16'] else 'float32'\n",
" zero = L.fill_constant([1], dtype='int64', value=0)\n",
" input_mask = L.cast(L.equal(src_ids, zero), dtype) # assume pad id == 0\n",
" #input_mask = L.unsqueeze(input_mask, axes=[2])\n",
" d_shape = L.shape(src_ids)\n",
" seqlen = d_shape[1]\n",
" batch_size = d_shape[0]\n",
" pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])\n",
" pos_ids = L.expand(pos_ids, [batch_size, 1])\n",
" pos_ids = L.unsqueeze(pos_ids, axes=[2])\n",
" pos_ids = L.cast(pos_ids, 'int64')\n",
" pos_ids.stop_gradient = True\n",
" input_mask.stop_gradient = True\n",
" task_ids = L.zeros_like(src_ids) + self.hparam.task_id #this shit wont use at the moment\n",
" task_ids.stop_gradient = True\n",
"\n",
" ernie = ErnieModel(\n",
" src_ids=src_ids,\n",
" position_ids=pos_ids,\n",
" sentence_ids=sent_ids,\n",
" task_ids=task_ids,\n",
" input_mask=input_mask,\n",
" config=self.hparam,\n",
" use_fp16=self.hparam['use_fp16']\n",
" )\n",
"\n",
" cls_feats = ernie.get_pooled_output()\n",
"\n",
" cls_feats = L.dropout(\n",
" x=cls_feats,\n",
" dropout_prob=0.1,\n",
" dropout_implementation=\"upscale_in_train\"\n",
" )\n",
"\n",
" logits = L.fc(\n",
" input=cls_feats,\n",
" size=self.hparam['num_label'],\n",
" param_attr=F.ParamAttr(\n",
" name=\"cls_out_w\",\n",
" initializer=F.initializer.TruncatedNormal(scale=0.02)),\n",
" bias_attr=F.ParamAttr(\n",
" name=\"cls_out_b\", initializer=F.initializer.Constant(0.))\n",
" )\n",
"\n",
" propeller.summary.histogram('pred', logits)\n",
"\n",
" if self.mode is propeller.RunMode.PREDICT:\n",
" probs = L.softmax(logits)\n",
" return probs\n",
" else:\n",
" return logits\n",
"\n",
" def loss(self, predictions, labels):\n",
" ce_loss, probs = L.softmax_with_cross_entropy(\n",
" logits=predictions, label=labels, return_softmax=True)\n",
" #L.Print(ce_loss, message='per_example_loss')\n",
" loss = L.mean(x=ce_loss)\n",
" return loss\n",
"\n",
" def backward(self, loss):\n",
" scheduled_lr, loss_scale = optimization(\n",
" loss=loss,\n",
" warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),\n",
" num_train_steps=self.run_config.max_steps,\n",
" learning_rate=self.hparam['learning_rate'],\n",
" train_program=F.default_main_program(),\n",
" startup_prog=F.default_startup_program(),\n",
" weight_decay=self.hparam['weight_decay'],\n",
" scheduler=\"linear_warmup_decay\",)\n",
" propeller.summary.scalar('lr', scheduled_lr)\n",
"\n",
" def metrics(self, predictions, label):\n",
" predictions = L.argmax(predictions, axis=1)\n",
" predictions = L.unsqueeze(predictions, axes=[1])\n",
" acc = propeller.metrics.Acc(label, predictions)\n",
" #auc = propeller.metrics.Auc(label, predictions)\n",
" return {'acc': acc}\n",
" "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# define some utility function.\n",
"\n",
"def build_2_pair(seg_a, seg_b):\n",
" token_type_a = np.ones_like(seg_a, dtype=np.int64) * 0\n",
" token_type_b = np.ones_like(seg_b, dtype=np.int64) * 1\n",
" sen_emb = np.concatenate([[cls_id], seg_a, [sep_id], seg_b, [sep_id]], 0)\n",
" token_type_emb = np.concatenate([[0], token_type_a, [0], token_type_b, [1]], 0)\n",
" #seqlen = sen_emb.shape[0]\n",
    "    #deterministic truncate\n",
" sen_emb = sen_emb[0: MAX_SEQLEN]\n",
" token_type_emb = token_type_emb[0: MAX_SEQLEN]\n",
" return sen_emb, token_type_emb\n",
"\n",
"def expand_dims(*args):\n",
" func = lambda i: np.expand_dims(i, -1)\n",
" ret = [func(i) for i in args]\n",
" return ret\n",
"\n",
"def before_pad(seg_a, seg_b, label):\n",
" sentence, segments = build_2_pair(seg_a, seg_b)\n",
" return sentence, segments, label\n",
"\n",
"def after_pad(sentence, segments, label):\n",
" sentence, segments, label = expand_dims(sentence, segments, label)\n",
" return sentence, segments, label"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# a `propeller.paddle.data.Dataset` is built from FeatureColumns\n",
"\n",
"train_ds = feature_column.build_dataset('train', use_gz=False, data_dir='./xnli_data/train', shuffle=True, repeat=True) \\\n",
" .map(before_pad) \\\n",
" .padded_batch(BATCH, (0, 0, 0)) \\\n",
" .map(after_pad)\n",
"\n",
"dev_ds = feature_column.build_dataset('dev', use_gz=False, data_dir='./xnli_data/dev', shuffle=False, repeat=False) \\\n",
" .map(before_pad) \\\n",
" .padded_batch(BATCH, (0, 0, 0)) \\\n",
" .map(after_pad)\n",
"\n",
"shapes = ([-1, MAX_SEQLEN, 1], [-1, MAX_SEQLEN, 1], [-1, 1])\n",
"types = ('int64', 'int64', 'int64')\n",
"train_ds.data_shapes = shapes\n",
"train_ds.data_types = types\n",
"dev_ds.data_shapes = shapes\n",
"dev_ds.data_types = types\n",
"\n",
"warm_start_dir = MODEL + '/params'\n",
"# only the encoder and embedding is loaded from pretrained model\n",
"varname_to_warmstart = re.compile('^encoder.*w_0$|^encoder.*b_0$|^.*embedding$|^.*bias$|^.*scale$')\n",
"ws = propeller.WarmStartSetting(\n",
" predicate_fn=lambda v: varname_to_warmstart.match(v.name) and os.path.exists(os.path.join(warm_start_dir, v.name)),\n",
" from_dir=warm_start_dir\n",
" )\n",
"\n",
"# propeller will export model of highest performance, the criteria is up to you. \n",
    "# here we pick the model with maximum evaluation accuracy.\n",
    "#`BestInferenceModelExporter` is used to export servable models\n",
"best_inference_exporter = propeller.train.exporter.BestInferenceModelExporter(\n",
" os.path.join(OUTPUT_DIR, 'best'), \n",
" cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])\n",
"#`BestExporter` is used to export restartable checkpoint, so that we can restore from it and check test-set accuracy.\n",
"best_exporter = propeller.train.exporter.BestExporter(\n",
" os.path.join(OUTPUT_DIR, 'best_model'), \n",
" cmp_fn=lambda old, new: new['eval']['acc'] > old['eval']['acc'])\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"#ERNIE1.0 config \n",
"ernie_config = propeller.HParams(**json.loads(open(MODEL + '/ernie_config.json').read()))\n",
"\n",
"# default term in official config\n",
"ernie_v2_config = propeller.HParams(**{\n",
" \"sent_type_vocab_size\": None, \n",
" \"use_task_id\": False,\n",
" \"task_id\": 0,\n",
"})\n",
"\n",
"# train schema\n",
"train_config = propeller.HParams(**{ \n",
" \"warmup_proportion\": 0.1,\n",
" \"weight_decay\": 0.01,\n",
" \"use_fp16\": 0,\n",
" \"learning_rate\": 0.00005,\n",
" \"num_label\": 3,\n",
" \"batch_size\": 32\n",
"})\n",
"\n",
"config = ernie_config.join(ernie_v2_config).join(train_config)\n",
"\n",
"run_config = propeller.RunConfig(\n",
" model_dir=OUTPUT_DIR,\n",
" max_steps=EPOCH * train_data_size / BATCH,\n",
" skip_steps=10,\n",
" eval_steps=1000,\n",
" save_steps=1000,\n",
" log_steps=10,\n",
" max_ckpt=3\n",
")\n",
" "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Finetune and Eval"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# `train_and_eval` takes key-word args only\n",
"# we are now ready to train\n",
"hooks = [propeller.train.TqdmNotebookProgressBarHook(run_config.max_steps)] # to show the progress bar, you need to `pip install tqdm ipywidgets`\n",
"propeller.train_and_eval(\n",
" model_class_or_model_fn=ClassificationErnieModel, #**careful**, you should pass a Class to `train_and_eval`, propeller will try to instantiate it.\n",
" params=config, \n",
" run_config=run_config, \n",
" train_dataset=train_ds, \n",
" eval_dataset=dev_ds, \n",
" warm_start_setting=ws, \n",
" exporters=[best_exporter, best_inference_exporter],\n",
" train_hooks=hooks,\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Predict"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# after training you might want to check your model performance on test-set\n",
"# let's do this via `propeller.predict`\n",
    "# keep in mind that the model of best performance has been exported during the `train_and_eval` phase\n",
"\n",
"best_filename = [file for file in os.listdir(os.path.join(OUTPUT_DIR, 'best_model')) if 'model' in file][0]\n",
"best_model_path = os.path.join(os.path.join(OUTPUT_DIR, 'best_model'), best_filename)\n",
"true_label = [label_map[(line.strip().split(b'\\t')[-1])]for line in open('./xnli_data/test/part.0', 'rb')]\n",
"\n",
"def drop_label(sentence, segments, label): #we drop the label column here\n",
" return sentence, segments\n",
"\n",
"test_ds = feature_column.build_dataset('test', use_gz=False, data_dir='./xnli_data/test', shuffle=False, repeat=False) \\\n",
" .map(before_pad) \\\n",
" .padded_batch(BATCH, (0, 0, 0)) \\\n",
" .map(after_pad) \\\n",
" .map(drop_label)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"result = []\n",
"learner = propeller.Learner(ClassificationErnieModel, run_config, params=config, )\n",
"for pred in learner.predict(test_ds, ckpt=-1):\n",
" result.append(np.argmax(pred))\n",
" \n",
"result, true_label = np.array(result), np.array(true_label)\n",
"\n",
"test_acc = (result == true_label).sum() / len(true_label)\n",
"print('test accuracy:%.5f' % test_acc)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Serving\n",
"your model is now ready to serve! \n",
"you can open up a server by propeller with \n",
"```script\n",
"python -m propeller.tools.start_server -m /path/to/saved/model -p 8888\n",
"```\n"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册