# retrieval.py

#   Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Model for retrieval."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os
import glob
import time
import codecs
import numpy as np

import paddle.fluid as fluid
from eval import img_eval
from collections import OrderedDict
from utils.utils import print_eval_log
from model.unimo_finetune import UNIMOModel


def circle_loss(sp, sn, m, scale):
    """Compute the circle loss from positive and negative similarity scores.

    Args:
        sp: scores of the positive samples; exponentiated logits are summed
            over ``dim=1``.
        sn: scores of the negative samples; exponentiated logits are summed
            over ``dim=1``.
        m: relaxation factor of the circle loss; the optima and the decision
            margins below are all derived from it.
        scale: scale factor applied to both the positive and negative logits.

    Returns:
        The mean of ``log(1 + sum_neg * sum_pos)``, cast to float32.
    """
    # Optimum each score is pulled towards, and the margin used in the logit.
    optimum_pos, optimum_neg = 1. + m, 0. - m
    margin_pos, margin_neg = 1 - m, m

    # Adaptive weights: excluded from back-propagation via stop_gradient.
    weight_pos = fluid.layers.relu(optimum_pos - sp)
    weight_pos.stop_gradient = True
    weight_neg = fluid.layers.relu(sn - optimum_neg)
    weight_neg.stop_gradient = True

    # Positive term, exponentiated in float64.
    pos_logit = weight_pos * (sp - margin_pos)
    pos_logit = -1. * scale * pos_logit
    pos_logit = fluid.layers.cast(x=pos_logit, dtype=np.float64)
    pos_sum = fluid.layers.reduce_sum(fluid.layers.exp(pos_logit), dim=1, keep_dim=False)

    # Negative term, exponentiated in float64.
    neg_logit = weight_neg * (sn - margin_neg)
    neg_logit = scale * neg_logit
    neg_logit = fluid.layers.cast(x=neg_logit, dtype=np.float64)
    neg_sum = fluid.layers.reduce_sum(fluid.layers.exp(neg_logit), dim=1, keep_dim=False)

    per_row_loss = fluid.layers.log(1 + neg_sum * pos_sum)
    per_row_loss = fluid.layers.cast(x=per_row_loss, dtype=np.float32)
    return fluid.layers.mean(per_row_loss)


def create_model(args, phase, config, samples_num):
    """Build the retrieval graph and the data loader that feeds it.

    Args:
        args: run options. Reads ``max_img_len``, ``max_seq_len`` and
            ``weight_sharing``; when ``phase == 'train'`` also reads
            ``use_sigmoid``, ``margin`` and ``scale_circle``.
        phase: ``'train'`` builds the circle-loss training branch; any other
            value builds the cross-entropy branch.
        config: model configuration. ``config["image_embedding_size"]`` is
            read here and the object is passed on to ``UNIMOModel``.
        samples_num: number of columns the match scores are reshaped to.
            Must be 1 when ``phase`` is not ``'train'``.

    Returns:
        A ``(pyreader, graph_vars)`` tuple, where ``graph_vars`` maps
        ``"loss"``, ``"acc"``, ``"score"``, ``"label"`` and ``"ids"`` to the
        corresponding graph variables.
    """
    is_training = phase == 'train'
    seq_len = args.max_seq_len
    img_len = args.max_img_len
    joint_len = img_len + seq_len

    # Input placeholders.
    src_ids = fluid.layers.data(name='src_ids', shape=[-1, seq_len, 1], dtype='int64')
    pos_ids = fluid.layers.data(name='pos_ids', shape=[-1, seq_len, 1], dtype='int64')
    sent_ids = fluid.layers.data(name='sent_ids', shape=[-1, seq_len, 1], dtype='int64')
    input_mask = fluid.layers.data(name='input_mask', shape=[-1, joint_len, joint_len], dtype='float32')
    image_embedding = fluid.layers.data(
        name='image_embedding',
        shape=[-1, img_len, config["image_embedding_size"]],
        dtype='float32')
    image_loc = fluid.layers.data(name='image_loc', shape=[-1, img_len, 5], dtype='float32')
    labels = fluid.layers.data(name='labels', shape=[-1, 1], dtype='int64')
    ids = fluid.layers.data(name='ids', shape=[-1, 2], dtype='int64')

    # Incomplete trailing batches are dropped only while training.
    pyreader = fluid.io.DataLoader.from_generator(
        feed_list=[src_ids, pos_ids, sent_ids, input_mask, image_embedding, image_loc, labels, ids],
        capacity=70,
        use_double_buffer=True,
        iterable=False,
        drop_last=is_training)

    model = UNIMOModel(
        emb_ids={"word_embedding": src_ids, "sent_embedding": sent_ids, "pos_embedding": pos_ids},
        input_mask=input_mask,
        config=config,
        image_input={"image_embedding": image_embedding, "loc_embedding": image_loc},
        weight_sharing=args.weight_sharing
    )
    text, image = model.get_pooled_output()

    # Project the text/image match features to a single score per pair.
    match_feat = model.get_match_output(text, image, mode="mul")
    weight_attr = fluid.ParamAttr(
        name='match_fc.w_0',
        initializer=fluid.initializer.Xavier())
    bias_attr = fluid.ParamAttr(
        name='match_fc.b_0',
        initializer=fluid.initializer.UniformInitializer())
    score = fluid.layers.fc(
        input=match_feat,
        size=1,
        act=None,
        param_attr=weight_attr,
        bias_attr=bias_attr)
    score = fluid.layers.reshape(score, [-1, samples_num])

    if is_training:
        if args.use_sigmoid:
            score = fluid.layers.sigmoid(score)
        # Column 0 is the positive pair; the remaining columns are split into
        # image negatives followed by caption negatives.
        split_at = int((samples_num + 1) / 2)
        pos_score = score[:, 0]
        img_neg_score = score[:, 1:split_at]
        cap_neg_score = score[:, split_at:]
        acc = fluid.layers.accuracy(score, labels, k=1)

        pos_score = fluid.layers.reshape(x=pos_score, shape=[-1, 1])
        caption_loss = circle_loss(pos_score, cap_neg_score, args.margin, args.scale_circle)
        image_loss = circle_loss(pos_score, img_neg_score, args.margin, args.scale_circle)
        total_loss = (caption_loss + image_loss) / 2
    else:
        assert samples_num == 1
        total_loss = fluid.layers.mean(
            x=fluid.layers.cross_entropy(input=score, label=labels))
        # Accuracy is not computed outside training; expose a zero placeholder.
        acc = fluid.layers.zeros_like(total_loss)

    graph_vars = {
        "loss": total_loss,
        "acc": acc,
        "score": score,
        "label": labels,
        "ids": ids,
    }
    return pyreader, graph_vars


def evaluate(args, exe, test_pyreader, graph_vars, eval_phase, dev_count=1, gpu_id=0, data_reader=None):
    """Run one evaluation pass and, on the process with ``gpu_id == 0``, report metrics.

    Every process drains ``test_pyreader``, stacks the fetched ``score`` and
    ``ids`` column-wise, and writes the result to
    ``<eval_dir>/<eval_phase>.trainers_<dev_count>.part_<gpu_id>.npy`` followed
    by an empty ``.finish`` marker file. The process with ``gpu_id == 0`` then
    waits until ``dev_count`` marker files exist, merges all part files, moves
    the part and marker files into ``<eval_dir>/<unix timestamp>/`` and
    computes the metric selected by ``args.eval_mertrics``.

    Args:
        args: run options; reads ``eval_dir`` and ``eval_mertrics``.
        exe: executor whose ``run(fetch_list=...)`` yields one batch per call.
        test_pyreader: data loader; started here and reset once exhausted.
        graph_vars: mapping providing the ``"score"`` and ``"ids"`` variables.
        eval_phase: label used in output file names and in the result.
        dev_count: number of processes expected to write a part file.
        gpu_id: index of this process; only index 0 aggregates.
        data_reader: must not be None when ``gpu_id == 0``; provides
            ``_caption_ids_dict`` and ``_image_sent_map``.

    Returns:
        An ``OrderedDict`` of results when ``gpu_id == 0``, otherwise ``None``.

    Raises:
        ValueError: if ``args.eval_mertrics`` is not a supported metric.
    """
    test_pyreader.start()
    time_begin = time.time()
    fetch_list = [graph_vars["score"].name, graph_vars["ids"].name]

    # Gather per-batch matrices and join them once; concatenating inside the
    # loop would re-copy everything accumulated so far on every batch.
    batch_mats = []
    while True:
        try:
            score, ids = exe.run(fetch_list=fetch_list)
        except fluid.core.EOFException:
            test_pyreader.reset()
            break
        batch_mats.append(np.concatenate([score, ids], axis=1))
    # Keep None when no batch was produced.
    all_mat = np.concatenate(batch_mats, axis=0) if batch_mats else None
    time_end = time.time()

    part_prefix = "%s/%s.trainers_%d" % (args.eval_dir, eval_phase, dev_count)
    np.save("%s.part_%d.npy" % (part_prefix, gpu_id), all_mat)
    # The empty marker is written after the data file, so its presence means
    # this process's part file is complete.
    with codecs.open("%s.part_%d.finish" % (part_prefix, gpu_id), "w", 'utf-8'):
        pass

    if gpu_id != 0:
        return None

    # Wait until every process has dropped its marker file.
    finish_pattern = part_prefix + ".part_*.finish"
    while len(glob.glob(finish_pattern)) != dev_count:
        time.sleep(1)

    # glob order is filesystem-dependent; sort so the merged row order is
    # reproducible between runs.
    part_files = sorted(glob.glob(part_prefix + ".part_*.npy"))
    all_mat = np.concatenate([np.load(part_file) for part_file in part_files], axis=0)

    # Archive the part and marker files so the next evaluation of this phase
    # does not pick them up again. The archive directory lives inside
    # eval_dir, so a plain rename is sufficient.
    archive_dir = os.path.join(args.eval_dir, str(int(time.time())))
    if not os.path.isdir(archive_dir):
        os.makedirs(archive_dir)
    for path in glob.glob(part_prefix + ".*"):
        os.rename(path, os.path.join(archive_dir, os.path.basename(path)))

    assert data_reader is not None
    text2img = {text_id: item[-1] for text_id, item in data_reader._caption_ids_dict.items()}
    img2texts = data_reader._image_sent_map

    ret = OrderedDict()
    ret['phase'] = eval_phase
    ret['loss'] = -1
    ret['data_num'] = all_mat.shape[0]
    ret['used_time'] = round(time_end - time_begin, 4)

    metrics = OrderedDict()
    metrics["recall@k"] = img_eval.recall_at_k
    # NOTE: ``eval_mertrics`` (sic) is the attribute name this function has
    # always read from ``args``, so the spelling is kept as is.
    if args.eval_mertrics not in metrics:
        raise ValueError('unsupported metric {}'.format(args.eval_mertrics))
    ret.update(metrics[args.eval_mertrics](all_mat, text2img, img2texts))
    print_eval_log(ret)
    return ret