In [None]:
import sys
import os
import numpy as np
import re
import logging
import json

In [None]:
# Make the original ERNIE sources and the parent directory importable.
sys.path.append('../ernie')
sys.path.append('../')
# Expose only GPU 7 to this process. The variable name must be spelled exactly
# CUDA_VISIBLE_DEVICES; a misspelled name is silently ignored.
os.environ['CUDA_VISIBLE_DEVICES'] = '7'
# if CUDA_VISIBLE_DEVICES is changed, relaunch jupyter kernel to inform paddle

In [None]:
import propeller.paddle as propeller
import paddle
import paddle.fluid as F
import paddle.fluid.layers as L
# import model definition from original ERNIE
from model.ernie import ErnieModel
from tokenization import FullTokenizer
from optimization import optimization
from propeller import log
# Let propeller emit its debug-level log records.
log.setLevel(logging.DEBUG)

# Stop early unless the installed paddle is one of the two releases listed here.
if paddle.__version__ not in ('1.5.1', '1.5.2'):
    raise RuntimeError('propeller works in paddle1.5.1')


In [None]:
%%bash
# download pretrained model&config(ernie1.0) and xnli data
# -p: succeed silently when the directory is left over from an earlier run,
# so the cell can be re-executed.
mkdir -p ernie1.0_pretrained
# NOTE(review): --no-check-certificate disables TLS certificate verification
# for both downloads below.
if [ ! -f ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz ]
then
    echo "download model"
    wget --no-check-certificate https://baidu-nlp.bj.bcebos.com/ERNIE_stable-1.0.1.tar.gz -P ernie1.0_pretrained
fi

if [ ! -f task_data_zh.tgz ]
then
    echo "download data"
    wget --no-check-certificate https://ernie.bj.bcebos.com/task_data_zh.tgz
fi

# unpack both archives (runs on every execution, also when nothing was downloaded)
tar xzf ernie1.0_pretrained/ERNIE_stable-1.0.1.tar.gz -C ernie1.0_pretrained
tar xzf task_data_zh.tgz

In [None]:
# Basic training settings shared by the cells below.
EPOCH = 3           # multiplied into the total number of training steps
BATCH = 16          # batch size handed to `padded_batch`
# NOTE(review): LR is not referenced by any visible cell; the optimizer takes
# its rate from train_config['learning_rate'] instead -- confirm and reconcile.
LR = 5e-3
MAX_SEQLEN = 128    # token sequences are cut to at most this many ids
TASK_DATA = './task_data/'           # root of the extracted task data
MODEL = './ernie1.0_pretrained/'     # pretrained model dir (vocab, config, params)
OUTPUT_DIR = './output'              # checkpoints and exported models

In [None]:
# Remove the output directory left by a previous run, if any
# (IPython substitutes the Python value of OUTPUT_DIR into the shell command).
!rm -rf {OUTPUT_DIR}

In [None]:
#skip header, and reorganize train data into ./xnli_data 
!mkdir xnli_data
!mkdir xnli_data/train
!mkdir xnli_data/test
!mkdir xnli_data/dev

def remove_header_and_save(fname_in, fname_out):
    """Copy a text file to a new location without its first (header) line.

    Both files are handled as UTF-8 text and are closed before returning.
    The input is streamed line by line instead of being read in one go.

    Args:
        fname_in: path of the file to read.
        fname_out: path of the file to create (overwritten if it exists).

    Returns:
        Number of lines written, i.e. the input's line count minus the header
        (0 for an empty or header-only input).
    """
    num_lines = 0
    with open(fname_in, encoding='utf8') as fin, \
            open(fname_out, 'w', encoding='utf8') as fout:
        # Drop the header; the default keeps an empty input from raising.
        next(fin, None)
        for line in fin:
            fout.write(line)
            num_lines += 1
    return num_lines
# Strip the header from every split (train, dev, test in that order) and keep
# the number of remaining lines of each.
train_data_size, dev_data_size, test_data_size = [
    remove_header_and_save(TASK_DATA + '/xnli/%s.tsv' % split, './xnli_data/%s/part.0' % split)
    for split in ('train', 'dev', 'test')
]
print(train_data_size, dev_data_size, test_data_size, sep='\n')


In [None]:
tokenizer = FullTokenizer(MODEL + 'vocab.txt')
# token -> id: the token is the first tab-separated field of each line and the
# id is the 0-based line number. `with` closes the file once the dict is built.
with open(MODEL + 'vocab.txt', encoding='utf8') as vocab_file:
    vocab = {line.strip().split('\t')[0]: idx for idx, line in enumerate(vocab_file)}

print(tokenizer.tokenize('今天很热'))
print(tokenizer.tokenize('coding in paddle is cool'))
# note: special tokens like [CLS] get segmented by the tokenizer, so add their
# ids after tokenization instead of putting them into the text.
print(tokenizer.tokenize('[CLS]i have an pen'))


`propeller.data.FeatureColumns` defines the data schema in every data file.

Our data consists of 3 columns: seg_a, seg_b, label, with "\t" as the delimiter.

`TextColumn` will do 3 things for you: 

1. tokenize input sentence with user-defined `tokenizer_func`
2. vocab lookup
3. serialize to protobuf bin file (optional)

Data files are organized in the following pattern:

```script
./xnli_data
|-- dev
|   `-- part.0
|-- test
|   `-- part.0
|-- train
   `-- part.0
```

In [None]:

# Ids of the special tokens, looked up in the vocab dict.
sep_id, cls_id, unk_id = vocab['[SEP]'], vocab['[CLS]'], vocab['[UNK]']

# Raw label bytes -> class index; both spellings of the contradiction label
# share index 0.
label_map = dict([
    (b"contradictory", 0),
    (b"contradiction", 0),
    (b"entailment", 1),
    (b"neutral", 2),
])
def tokenizer_func(inputs):
    """Tokenize one raw field with the notebook-level `tokenizer`.

    `tokenize` converts bytes to str, which is why a str-keyed vocab is used.
    """
    return tokenizer.tokenize(inputs)

# Schema of one data line: two text columns followed by the label column.
feature_column = propeller.data.FeatureColumns(
    [
        propeller.data.TextColumn(col_name, unk_id=unk_id, vocab_dict=vocab, tokenizer=tokenizer_func)
        for col_name in ('title', 'comment')
    ] + [
        # be careful: columns deal with python3 bytes directly (label_map has bytes keys)
        propeller.data.LabelColumn('label', vocab_dict=label_map),
    ]
)

## A train model in propeller can be defined in 2 ways:
1. subclass of `propeller.train.Model` which implements:
    1. `__init__`           (hyper_param, mode, run_config)
    2. `forward`            (features) => (prediction)
    3. `backward`           (loss) => None
    4. `loss`               (prediction, label) => (loss)
    5. `metrics` (optional) (prediction, label) => (dict of propeller.Metrics)
    
2. a callable takes following args:
    1. features
    2. param
    3. mode
    4. run_config(optional)
    
   and returns a propeller.ModelSpec
   
We use the subclass approach here.

In [None]:
class ClassificationErnieModel(propeller.train.Model):
    """Sentence-pair classifier: ERNIE encoder, dropout, one fully-connected layer.

    Implements the `propeller.train.Model` hooks `forward`, `loss`, `backward`
    and `metrics`.
    """

    def __init__(self, hparam, mode, run_config):
        """Keep the hyper-parameters, run mode and run config for the other hooks."""
        self.hparam = hparam
        self.mode = mode
        self.run_config = run_config

    def forward(self, features):
        """Build the classification graph.

        Args:
            features: pair `(src_ids, sent_ids)` of token ids and sentence
                (segment) ids. Padding positions in `src_ids` are assumed to
                hold id 0.

        Returns:
            Softmax probabilities in PREDICT mode, raw logits in every other mode.
        """
        src_ids, sent_ids = features
        dtype = 'float16' if self.hparam['use_fp16'] else 'float32'
        zero = L.fill_constant([1], dtype='int64', value=0)
        # 1.0 on real tokens, 0.0 on padding (assume pad id == 0). The pad test
        # has to be negated: without `logical_not` the mask would select the
        # padding positions and hide every real token from the encoder.
        input_mask = L.cast(L.logical_not(L.equal(src_ids, zero)), dtype)
        #input_mask = L.unsqueeze(input_mask, axes=[2])
        d_shape = L.shape(src_ids)
        seqlen = d_shape[1]
        batch_size = d_shape[0]
        # Position ids 0..seqlen-1, repeated for every row of the batch and
        # given a trailing axis of size 1.
        pos_ids = L.unsqueeze(L.range(0, seqlen, 1, dtype='int32'), axes=[0])
        pos_ids = L.expand(pos_ids, [batch_size, 1])
        pos_ids = L.unsqueeze(pos_ids, axes=[2])
        pos_ids = L.cast(pos_ids, 'int64')
        pos_ids.stop_gradient = True
        input_mask.stop_gradient = True
        # Constant task id for every token; not used at the moment.
        task_ids = L.zeros_like(src_ids) + self.hparam.task_id
        task_ids.stop_gradient = True

        ernie = ErnieModel(
            src_ids=src_ids,
            position_ids=pos_ids,
            sentence_ids=sent_ids,
            task_ids=task_ids,
            input_mask=input_mask,
            config=self.hparam,
            use_fp16=self.hparam['use_fp16']
        )

        cls_feats = ernie.get_pooled_output()

        cls_feats = L.dropout(
            x=cls_feats,
            dropout_prob=0.1,
            dropout_implementation="upscale_in_train"
        )

        # Classification head: one output unit per label.
        logits = L.fc(
            input=cls_feats,
            size=self.hparam['num_label'],
            param_attr=F.ParamAttr(
                name="cls_out_w",
                initializer=F.initializer.TruncatedNormal(scale=0.02)),
            bias_attr=F.ParamAttr(
                name="cls_out_b", initializer=F.initializer.Constant(0.))
        )

        propeller.summary.histogram('pred', logits)

        if self.mode is propeller.RunMode.PREDICT:
            probs = L.softmax(logits)
            return probs
        else:
            return logits

    def loss(self, predictions, labels):
        """Return the mean softmax cross-entropy of `predictions` (logits) against `labels`."""
        # The softmax output returned alongside the loss is not needed here.
        ce_loss, _ = L.softmax_with_cross_entropy(
            logits=predictions, label=labels, return_softmax=True)
        #L.Print(ce_loss, message='per_example_loss')
        loss = L.mean(x=ce_loss)
        return loss

    def backward(self, loss):
        """Attach the optimizer to `loss` and log the scheduled learning rate.

        Warm-up length is `warmup_proportion` of `run_config.max_steps`;
        learning rate and weight decay come from the hyper-parameters.
        """
        # `optimization` also returns a loss-scale value, which is unused here.
        scheduled_lr, _ = optimization(
            loss=loss,
            warmup_steps=int(self.run_config.max_steps * self.hparam['warmup_proportion']),
            num_train_steps=self.run_config.max_steps,
            learning_rate=self.hparam['learning_rate'],
            train_program=F.default_main_program(),
            startup_prog=F.default_startup_program(),
            weight_decay=self.hparam['weight_decay'],
            scheduler="linear_warmup_decay",)
        propeller.summary.scalar('lr', scheduled_lr)

    def metrics(self, predictions, label):
        """Return `{'acc': ...}`, the accuracy of the arg-max class against `label`."""
        predictions = L.argmax(predictions, axis=1)
        # Restore a trailing axis of size 1 -- presumably to match the label
        # layout ([-1, 1] in the dataset shapes); confirm against propeller.
        predictions = L.unsqueeze(predictions, axes=[1])
        acc = propeller.metrics.Acc(label, predictions)
        #auc = propeller.metrics.Auc(label, predictions)
        return {'acc': acc}
    

In [None]:
# define some utility function.

def build_2_pair(seg_a, seg_b):
    """Join two id sequences as `[CLS] seg_a [SEP] seg_b [SEP]`.

    Returns:
        `(token_ids, token_type_ids)`, each cut to at most MAX_SEQLEN entries.
        Token type is 0 for [CLS], `seg_a` and the first [SEP], and 1 for
        `seg_b` and the final [SEP].
    """
    type_ids_a = np.zeros_like(seg_a, dtype=np.int64)
    type_ids_b = np.ones_like(seg_b, dtype=np.int64)
    token_ids = np.concatenate(([cls_id], seg_a, [sep_id], seg_b, [sep_id]), axis=0)
    type_ids = np.concatenate(([0], type_ids_a, [0], type_ids_b, [1]), axis=0)
    # deterministic truncation: keep the head, drop everything past MAX_SEQLEN
    return token_ids[:MAX_SEQLEN], type_ids[:MAX_SEQLEN]

def expand_dims(*args):
    """Return a list with a trailing axis of size 1 appended to every argument."""
    return [np.expand_dims(arr, -1) for arr in args]

def before_pad(seg_a, seg_b, label):
    """Per-example step run before batching: join the two segments, pass the label through."""
    return build_2_pair(seg_a, seg_b) + (label,)

def after_pad(sentence, segments, label):
    """Per-batch step run after padding: append a trailing axis of size 1 to each array."""
    return tuple(expand_dims(sentence, segments, label))

In [None]:
# Datasets are built from the FeatureColumns schema; each pipeline joins the
# two segments, pads and batches (pad value 0 for all three fields), then adds
# the trailing axis.

# training data: shuffled and repeated
train_ds = (
    feature_column
    .build_dataset('train', use_gz=False, data_dir='./xnli_data/train', shuffle=True, repeat=True)
    .map(before_pad)
    .padded_batch(BATCH, (0, 0, 0))
    .map(after_pad)
)

# evaluation data: a single pass in file order
dev_ds = (
    feature_column
    .build_dataset('dev', use_gz=False, data_dir='./xnli_data/dev', shuffle=False, repeat=False)
    .map(before_pad)
    .padded_batch(BATCH, (0, 0, 0))
    .map(after_pad)
)

# Declared shape and dtype of (token ids, segment ids, label) for both datasets.
shapes = ([-1, MAX_SEQLEN, 1], [-1, MAX_SEQLEN, 1], [-1, 1])
types = ('int64', 'int64', 'int64')
train_ds.data_shapes, train_ds.data_types = shapes, types
dev_ds.data_shapes, dev_ds.data_types = shapes, types

warm_start_dir = MODEL + '/params'
# only the encoder and embedding is loaded from pretrained model
varname_to_warmstart = re.compile(
    '^encoder.*w_0$|'
    '^encoder.*b_0$|'
    '^.*embedding$|'
    '^.*bias$|'
    '^.*scale$'
)
# A variable is warm-started when its name matches the pattern above and a
# file of the same name exists in `warm_start_dir`.
ws = propeller.WarmStartSetting(
    from_dir=warm_start_dir,
    predicate_fn=lambda v: (
        varname_to_warmstart.match(v.name)
        and os.path.exists(os.path.join(warm_start_dir, v.name))
    ),
)

# propeller exports the best-performing model; what counts as "best" is decided
# by `cmp_fn`. Both exporters below prefer the higher evaluation accuracy.

# `BestInferenceModelExporter` exports a servable model.
best_inference_exporter = propeller.train.exporter.BestInferenceModelExporter(
    os.path.join(OUTPUT_DIR, 'best'),
    cmp_fn=lambda old, new: old['eval']['acc'] < new['eval']['acc'],
)
# `BestExporter` exports a restartable checkpoint, so that it can be restored
# later to check test-set accuracy.
best_exporter = propeller.train.exporter.BestExporter(
    os.path.join(OUTPUT_DIR, 'best_model'),
    cmp_fn=lambda old, new: old['eval']['acc'] < new['eval']['acc'],
)


In [None]:
#ERNIE1.0 config
# `with` closes the config file as soon as it has been parsed.
with open(MODEL + '/ernie_config.json') as config_file:
    ernie_config = propeller.HParams(**json.load(config_file))

# default term in official config
ernie_v2_config = propeller.HParams(**{
    "sent_type_vocab_size": None,
    "use_task_id": False,
    "task_id": 0,
})

# train schema
train_config = propeller.HParams(**{
      "warmup_proportion":  0.1,
      "weight_decay": 0.01,
      "use_fp16": 0,
      "learning_rate": 0.00005,
      "num_label": 3,
      # Same value the datasets are batched with, so the recorded
      # hyper-parameter matches the batch size actually used.
      "batch_size": BATCH
})

config = ernie_config.join(ernie_v2_config).join(train_config)

run_config = propeller.RunConfig(
    model_dir=OUTPUT_DIR,
    # EPOCH passes over the training set, in units of BATCH-sized steps.
    # NOTE(review): `/` is true division, so this is a float under Python 3 --
    # confirm propeller accepts a non-integer step count.
    max_steps=EPOCH * train_data_size / BATCH,
    skip_steps=10,
    eval_steps=1000,
    save_steps=1000,
    log_steps=10,
    max_ckpt=3
)
            

## Finetune and Eval

In [None]:
# Everything is in place: start training with periodic evaluation.
# `train_and_eval` accepts keyword arguments only.

# Progress bar for the notebook; requires `pip install tqdm ipywidgets`.
hooks = [propeller.train.TqdmNotebookProgressBarHook(run_config.max_steps)]

propeller.train_and_eval(
    # **careful**: pass the class itself, not an instance -- propeller will try to instantiate it.
    model_class_or_model_fn=ClassificationErnieModel,
    params=config,
    run_config=run_config,
    warm_start_setting=ws,
    train_dataset=train_ds,
    eval_dataset=dev_ds,
    train_hooks=hooks,
    exporters=[best_exporter, best_inference_exporter],
)

## Predict

In [None]:
# after training you might want to check your model performance on test-set
# let's do this via `propeller.predict`
# keep in mind that the model of best performance has been exported during the `train_and_eval` phase

# First entry under <OUTPUT_DIR>/best_model whose name contains 'model'
# (raises IndexError when there is none).
# NOTE(review): `best_model_path` is not used by the visible prediction cell,
# which loads `ckpt=-1` instead -- confirm which checkpoint is intended.
best_filename = [file for file in os.listdir(os.path.join(OUTPUT_DIR, 'best_model')) if 'model' in file][0]
best_model_path = os.path.join(OUTPUT_DIR, 'best_model', best_filename)
# Gold labels: last tab-separated field of every test line, mapped through
# label_map. Read as bytes because label_map has bytes keys; `with` closes the
# file once the list is built.
with open('./xnli_data/test/part.0', 'rb') as test_file:
    true_label = [label_map[line.strip().split(b'\t')[-1]] for line in test_file]

def drop_label(sentence, segments, label):
    """Discard the label column and keep only the two model inputs."""
    model_inputs = (sentence, segments)
    return model_inputs

# Test pipeline: same steps as for dev, with the label removed at the end.
test_ds = (
    feature_column
    .build_dataset('test', use_gz=False, data_dir='./xnli_data/test', shuffle=False, repeat=False)
    .map(before_pad)
    .padded_batch(BATCH, (0, 0, 0))
    .map(after_pad)
    .map(drop_label)
)



In [None]:
learner = propeller.Learner(ClassificationErnieModel, run_config, params=config)

# Arg-max class index of every prediction the learner yields for the test set.
result = np.array([np.argmax(pred) for pred in learner.predict(test_ds, ckpt=-1)])
true_label = np.array(true_label)

# Fraction of predictions that equal the gold label.
num_correct = (result == true_label).sum()
test_acc = num_correct / len(true_label)
print('test accuracy:%.5f' % test_acc)

## Serving
Your model is now ready to serve!
You can start a server with propeller using:
```script
python -m propeller.tools.start_server -m /path/to/saved/model -p 8888
```
