classifier.py

#   Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model for classifier."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import time
import numpy as np
import collections
from collections import namedtuple

import paddle.fluid as fluid

from model.static.ernie import ErnieDocModel
from utils.multi_process_eval import MultiProcessEvalForErnieDoc
from utils.metrics import Acc

def create_model(args, ernie_config, mem_len=128, is_infer=False):
    """create model for classifier"""
    shapes = [[-1, args.max_seq_len, 1], [-1, 2 * args.max_seq_len + mem_len, 1], [-1, args.max_seq_len, 1],
            [-1, args.max_seq_len, 1], [-1, 1], [-1, 1], [-1, 1], []]
    dtypes = ['int64', 'int64', 'int64', 'float32', 'int64', 'int64', 'int64', 'int64']
    names = ["src_ids", "pos_ids", "task_ids", "input_mask", "labels", "qids", "gather_idx", "need_cal_loss"]

    inputs = []
    for shape, dtype, name in zip(shapes, dtypes, names):
        inputs.append(fluid.layers.data(name=name, shape=shape, dtype=dtype, append_batch_size=False))

    src_ids, pos_ids, task_ids, input_mask, labels, qids, \
            gather_idx, need_cal_loss = inputs
    pyreader = fluid.io.DataLoader.from_generator(
            feed_list=inputs,
            capacity=70, iterable=False)

    ernie_doc = ErnieDocModel(
        src_ids=src_ids,
        position_ids=pos_ids,
        task_ids=task_ids,
        input_mask=input_mask,
        config=ernie_config,
        number_instance=args.batch_size,
        rel_pos_params_sharing=args.rel_pos_params_sharing,
        use_vars=args.use_vars)
    
    mems, new_mems = ernie_doc.get_mem_output()

    cls_feats = ernie_doc.get_pooled_output()
    checkpoints = ernie_doc.get_checkpoints()
    cls_feats = fluid.layers.dropout(
        x=cls_feats,
        dropout_prob=0.1,
        dropout_implementation="upscale_in_train")
    logits = fluid.layers.fc(
        input=cls_feats,
        size=args.num_labels,
        param_attr=fluid.ParamAttr(
            name="cls_out_w",
            initializer=fluid.initializer.TruncatedNormal(scale=0.02)),
        bias_attr=fluid.ParamAttr(
            name="cls_out_b", initializer=fluid.initializer.Constant(0.)))

    if is_infer:
        probs = fluid.layers.softmax(logits)
        feed_targets_name = [
            src_ids.name, pos_ids.name, task_ids.name, input_mask.name
        ]
        return pyreader, probs, feed_targets_name
    
    # filter
    qids, logits, labels = list(map(lambda x: fluid.layers.gather(x, gather_idx), [qids, logits, labels]))
    ce_loss, probs = fluid.layers.softmax_with_cross_entropy(
        logits=logits, label=labels, return_softmax=True)
    loss = fluid.layers.mean(x=ce_loss)

    num_seqs = fluid.layers.create_tensor(dtype='int32')
    accuracy = fluid.layers.accuracy(input=probs, label=labels, total=num_seqs)
    
    loss, num_seqs, accuracy = list(map(lambda x: x * need_cal_loss, [loss, num_seqs, accuracy]))
    graph_vars = collections.OrderedDict()

    fetch_names = ['loss', 'accuracy', 'probs', 'labels', 'num_seqs', 'qids', 'need_cal_loss']
    fetch_vars = [loss, accuracy, probs, labels, num_seqs, qids, need_cal_loss]
    for name, var in zip(fetch_names, fetch_vars):
        graph_vars[name] = var

    for k, v in graph_vars.items():
        v.persistable = True
    mems_vars = {'mems': mems, 'new_mems': new_mems}
    return pyreader, graph_vars, checkpoints, mems_vars


def evaluate(exe,  
             program, 
             pyreader, 
             graph_vars,
             mems_vars,
             tower_mems_np, 
             phase, 
             steps=None,
             trainers_id=None, 
             trainers_num=None,
             scheduled_lr=None,
             use_vars=False):
    """evaluate interface"""
    fetch_names = [k for k, v in graph_vars.items()]
    fetch_list = [v for k, v in graph_vars.items()]

    if phase == "train":
        fetch_names += ['scheduled_lr']
        fetch_list += [scheduled_lr]

    if not use_vars:
        feed_dict = {}
        for m, m_np in zip(mems_vars['mems'], tower_mems_np):
            feed_dict[m.name] = m_np
        
        fetch_list += mems_vars['new_mems']
        fetch_names += [m.name for m in mems_vars['new_mems']]
        

    if phase == "train":
        if use_vars:
            outputs = exe.run(fetch_list=fetch_list, program=program, use_program_cache=True)
        else:
            outputs = exe.run(feed=feed_dict, fetch_list=fetch_list, program=program, use_program_cache=True)
            tower_mems_np = outputs[-len(mems_vars['new_mems']):]

        outputs_dict = {}
        for var_name, output_var in zip(fetch_names, outputs):
            outputs_dict[var_name] = output_var

        ret = {"loss": np.mean(outputs_dict['loss']), 
               "accuracy": np.mean(outputs_dict['accuracy']), 
               "learning_rate": np.mean(outputs_dict['scheduled_lr']),
               "tower_mems_np": tower_mems_np}
        return ret

    if phase == "eval" or phase == "test":
        pyreader.start()
        qids, labels, scores = [], [], []
        time_begin = time.time()
        
        all_results = []
        total_cost, total_num_seqs= 0.0, 0.0
        RawResult = namedtuple("RawResult", ["unique_id", "prob", "label"])
        while True:
            try:
                if use_vars:
                    outputs = exe.run(
                        program=program, fetch_list=fetch_list, use_program_cache=True)
                else:
                    feed_dict = {}
                    for m, m_np in zip(mems_vars['mems'], tower_mems_np):
                        feed_dict[m.name] = m_np
                    outputs = exe.run(feed=feed_dict, fetch_list=fetch_list, program=program, use_program_cache=True)
                    tower_mems_np = outputs[-len(mems_vars['new_mems']):]
                    outputs = outputs[:-len(mems_vars['new_mems'])]

                np_loss, np_acc, np_probs, np_labels, np_num_seqs, np_qids, np_need_cal_loss = outputs

                if int(np_need_cal_loss) == 1:
                    total_cost += np.sum(np_loss * np_num_seqs)
                    total_num_seqs += np.sum(np_num_seqs)
                    for idx in range(np_qids.shape[0]):
                        if len(all_results) % 1000 == 0 and len(all_results):
                            print("processining example: %d" % len(all_results))
                        qid_each = int(np_qids[idx])
                        probs_each = [float(x) for x in np_probs[idx].flat]
                        label_each = int(np_labels[idx])
                        all_results.append(
                        RawResult(
                            unique_id=qid_each,
                            prob=probs_each,
                            label=label_each))

            except fluid.core.EOFException:
                pyreader.reset()
                break
        time_end = time.time()
        
        output_path = "./tmpout"
        mul_pro_test = MultiProcessEvalForErnieDoc(output_path, phase, trainers_num, trainers_id)
        is_print = True
        if mul_pro_test.dev_count > 1:
            is_print = False
            mul_pro_test.write_result(all_results)
            if trainers_id == 0:
                is_print = True
                all_results = mul_pro_test.concat_result(RawResult)

        if is_print:
            num_seqs, all_labels, all_probs = mul_pro_test.write_predictions(all_results)
            acc_func = Acc()
            accuracy = acc_func.eval([all_probs, all_labels])
            time_cost = time_end - time_begin
            print("[%d_%s evaluation] ave loss: %f, ave acc: %f, data_num: %d, elapsed time: %f s"
                % (steps, phase, total_cost / total_num_seqs, accuracy, num_seqs, time_cost))