# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" prepare data format for finetuning tasks """

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np

try:
    from six.moves import xrange
except ImportError:
    # six is only a Python 2/3 shim; without it, fall back to the builtin
    # so the module stays importable on a plain Python 3 install.
    xrange = range


def _pad_sequences(seqs, max_len, pad_value):
    """ right-pad every sequence in `seqs` with `pad_value` up to `max_len` """
    return [list(seq) + [pad_value] * (max_len - len(seq)) for seq in seqs]


def _length_masks(seqs, max_len):
    """ build 1/0 rows marking real positions (1) and padding (0) """
    return [[1] * len(seq) + [0] * (max_len - len(seq)) for seq in seqs]


def _feature_width(data):
    """ width of a feature row, taken from the first non-empty item (0 if none) """
    for item in data:
        if len(item):
            return len(item[0])
    return 0


def prepare_batch_data(batch_records, num_choice, pad_id, task_index, task_num):
    """
    prepare batch data for finetuning tasks

    Args:
        batch_records: list of dicts. Each record is read through the keys
            "input_ids", "input_pos", "segment_ids" (one token list per
            choice), "input_lens" (lengths used to derive the padded length),
            "target", "features", "boxes" and "anno_id".
        num_choice: number of text choices per record; every record must
            provide exactly this many, otherwise the reshapes below fail.
        pad_id: value used to right-pad token, position and segment ids.
        task_index: index of the current task; returned as an int64 array
            and used to build the one-hot task flags.
        task_num: number of one-hot task flags appended to the result.

    Returns:
        A list, in this order:
        src_ids, src_pos, src_seg (int64) and src_masks (float32), each of
        shape [num_choice * num_sample, max_len, 1];
        image_embedding, image_loc, image_mask, each with leading dimension
        num_choice * num_sample (image rows are repeated once per choice);
        labels and anno ids (int64, shape [num_sample, 1]);
        np.array([task_index]) as int64;
        binary_labels (float32, [num_choice * num_sample, 1]) holding 1.0 at
        row `sample * num_choice + target`;
        then `task_num` float32 arrays of shape [1], 1.0 only at task_index.
    """
    num_sample = len(batch_records)
    flat_size = num_choice * num_sample

    batch_lens = [record["input_lens"] for record in batch_records]
    batch_labels = [record["target"] for record in batch_records]

    # One row per (sample, choice) pair; the target choice of each sample is 1.
    binary_labels = np.zeros([flat_size, 1], dtype='float32')
    for sample_idx, label in enumerate(batch_labels):
        binary_labels[sample_idx * num_choice + label] = 1.0
    labels = np.array(batch_labels).astype("int64").reshape([-1, 1])

    image_features = [record["features"] for record in batch_records]
    image_boxes = [record["boxes"] for record in batch_records]
    batch_anno_ids = np.array(
        [record["anno_id"] for record in batch_records]
    ).astype("int64").reshape([-1, 1])

    # The padded length comes from "input_lens", not from the token lists.
    max_len = max(max(lens) for lens in batch_lens)

    batch_input_ids = []
    batch_input_pos = []
    batch_seg_ids = []
    batch_input_masks = []
    for record in batch_records:
        batch_input_ids.append(_pad_sequences(record["input_ids"], max_len, pad_id))
        batch_input_pos.append(_pad_sequences(record["input_pos"], max_len, pad_id))
        batch_seg_ids.append(_pad_sequences(record["segment_ids"], max_len, pad_id))
        batch_input_masks.append(_length_masks(record["input_ids"], max_len))

    text_shape = [flat_size, max_len, 1]
    src_ids = np.array(batch_input_ids).astype("int64").reshape(text_shape)
    src_pos = np.array(batch_input_pos).astype("int64").reshape(text_shape)
    src_seg = np.array(batch_seg_ids).astype("int64").reshape(text_shape)
    src_masks = np.array(batch_input_masks).astype("float32").reshape(text_shape)

    image_embedding, image_mask = pad_feature_data(image_features, return_mask=True)
    image_loc = pad_feature_data(image_boxes)

    # Repeat each sample's image rows once per choice so they line up with
    # the flattened (sample, choice) text rows. Repeating along axis 0 keeps
    # whatever box width the data has instead of assuming a fixed width.
    image_embedding = np.repeat(image_embedding, num_choice, axis=0)
    image_mask = np.repeat(image_mask, num_choice, axis=0)
    image_loc = np.repeat(image_loc, num_choice, axis=0)

    return_list = [
        src_ids, src_pos, src_seg, src_masks,
        image_embedding, image_loc, image_mask, labels, batch_anno_ids,
    ]
    return_list.append(np.array([task_index]).astype('int64'))
    return_list.append(binary_labels)
    # One-hot task indicator, one array per task.
    for i in xrange(task_num):
        flag = 1.0 if i == task_index else 0.0
        return_list.append(np.array([flag]).astype("float32"))
    return return_list


def prepare_vqa_batch_data(insts,
                           total_token_num,
                           task_index,
                           task_num,
                           voc_size=0,
                           pad_id=None,
                           cls_id=None,
                           sep_id=None,
                           mask_id=None,
                           return_input_mask=True,
                           return_max_len=True,
                           return_num_token=False):
    """
    prepare batch data for vqa tasks

    Args:
        insts: list of dicts read through the keys "token_ids", "sent_ids",
            "pos_ids", "image_embeddings", "image_loc", "weight_labels" and
            "question_id".
        pad_id: value used to pad token, position and sentence ids.
        Every other parameter is accepted but not read by this function.

    Returns:
        [src_id, pos_id, sent_id, self_input_mask, image_embedding,
         image_loc, image_mask, weight_labels, q_ids]; weight_labels is
        float32 and q_ids is np.array of the question ids.
    """
    batch_src_ids = [inst["token_ids"] for inst in insts]
    batch_sent_ids = [inst["sent_ids"] for inst in insts]
    batch_pos_ids = [inst["pos_ids"] for inst in insts]
    batch_image_embedding = [inst["image_embeddings"] for inst in insts]
    batch_image_loc = [inst["image_loc"] for inst in insts]
    batch_weight_label = [inst["weight_labels"] for inst in insts]
    q_ids = np.array([inst["question_id"] for inst in insts])

    # pad and convert to numpy arrays; the sequence lengths are not returned
    src_id, self_input_mask, _seq_lens = pad_batch_data(
        batch_src_ids, pad_idx=pad_id, return_input_mask=True, return_seq_lens=True)
    pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
    sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)

    weight_labels = np.array(batch_weight_label).astype("float32")
    image_embedding, image_mask = pad_feature_data(batch_image_embedding, return_mask=True)
    image_loc = pad_feature_data(batch_image_loc)

    return [
        src_id, pos_id, sent_id, self_input_mask,
        image_embedding, image_loc, image_mask, weight_labels, q_ids,
    ]


def prepare_flickr_data(insts,
                        total_token_num,
                        task_index,
                        task_num,
                        voc_size=0,
                        pad_id=None,
                        cls_id=None,
                        sep_id=None,
                        mask_id=None,
                        outs=4,
                        return_input_mask=True,
                        return_max_len=True,
                        return_num_token=False):
    """
    prepare flickr data for finetuning tasks

    Args:
        insts: list of dicts read through the keys "token_ids", "sent_ids",
            "pos_ids", "image_embeddings", "image_loc" and "ids". When
            `outs` > 1 the first five fields are indexed per candidate
            (`inst[key][out]`); otherwise they are used as they are.
        pad_id: value used to pad token, position and sentence ids.
        outs: number of candidates flattened out of every instance.
        Every other parameter is accepted but not read by this function.

    Returns:
        [src_id, pos_id, sent_id, self_input_mask, image_embeddings,
         image_loc, image_mask, label, ids]. `label` is an all-zero int64
        array with one row per instance (presumably the positive candidate
        is stored first -- confirm against the data reader); `ids` repeats
        each instance's "ids" entry `outs` times.
    """
    keys = ("token_ids", "sent_ids", "pos_ids", "image_embeddings", "image_loc")
    if outs > 1:
        # Flatten candidates: instance-major, candidate-minor ordering.
        columns = [[inst[key][out] for inst in insts for out in range(outs)]
                   for key in keys]
    else:
        columns = [[inst[key] for inst in insts] for key in keys]
    (batch_src_ids, batch_sent_ids, batch_pos_ids,
     batch_image_embedding, batch_image_loc) = columns

    batch_ids = [inst["ids"] for inst in insts for _ in range(outs)]
    batch_size = len(batch_src_ids) // outs
    label = np.zeros((batch_size, 1), dtype="int64")

    src_id, self_input_mask, _seq_lens = pad_batch_data(
        batch_src_ids, pad_idx=pad_id, return_input_mask=True, return_seq_lens=True)
    pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
    sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)
    image_embeddings, image_mask = pad_feature_data(batch_image_embedding, return_mask=True)
    image_loc = pad_feature_data(batch_image_loc)
    ids = np.array(batch_ids, dtype="int64")

    return [
        src_id, pos_id, sent_id, self_input_mask,
        image_embeddings, image_loc, image_mask, label, ids,
    ]


def prepare_refcoco_plus_batch_data(insts,
                                    total_token_num,
                                    task_index,
                                    task_num,
                                    voc_size=0,
                                    pad_id=None,
                                    return_input_mask=True,
                                    return_max_len=True,
                                    return_num_token=False):
    """
    prepare batch data for refcoco_plus tasks

    Args:
        insts: list of dicts read through the keys "token_ids", "sent_ids",
            "pos_ids", "image_embeddings", "image_loc", "label" and
            "add_item".
        pad_id: value used to pad token, position and sentence ids.
        Every other parameter is accepted but not read by this function.

    Returns:
        [src_id, pos_id, sent_id, self_input_mask, seq_lens,
         image_embedding, image_loc, image_mask, image_label, add_items];
        add_items is a float32 array built from the "add_item" entries.
    """
    batch_src_ids = [inst["token_ids"] for inst in insts]
    batch_sent_ids = [inst["sent_ids"] for inst in insts]
    batch_pos_ids = [inst["pos_ids"] for inst in insts]
    batch_image_embedding = [inst["image_embeddings"] for inst in insts]
    batch_image_loc = [inst["image_loc"] for inst in insts]
    batch_image_label = [inst["label"] for inst in insts]
    add_items = np.array([inst["add_item"] for inst in insts], dtype="float32")

    src_id, self_input_mask, seq_lens = pad_batch_data(
        batch_src_ids, pad_idx=pad_id, return_input_mask=True, return_seq_lens=True)
    pos_id = pad_batch_data(batch_pos_ids, pad_idx=pad_id)
    sent_id = pad_batch_data(batch_sent_ids, pad_idx=pad_id)
    image_embedding, image_mask = pad_feature_data(batch_image_embedding, return_mask=True)
    image_loc = pad_feature_data(batch_image_loc)
    image_label = pad_feature_data(batch_image_label)

    return [
        src_id, pos_id, sent_id, self_input_mask, seq_lens,
        image_embedding, image_loc, image_mask, image_label, add_items,
    ]


def pad_batch_data(insts,
                   pad_idx=0,
                   return_pos=False,
                   return_input_mask=False,
                   return_max_len=False,
                   return_num_token=False,
                   return_seq_lens=False):
    """
    Pad the instances to the max sequence length in batch, and generate the
    corresponding position data and input mask.

    Args:
        insts: non-empty list of token-id sequences.
        pad_idx: value appended to short sequences (and to position rows).
        return_pos: also return positions 0..len-1, padded with `pad_idx`.
        return_input_mask: also return a float32 mask, 1 on real tokens.
        return_max_len: also return the padded length (int).
        return_num_token: also return the total number of real tokens (int).
        return_seq_lens: also return the lengths, int64 of shape [batch, 1].

    Returns:
        The padded ids (int64, [batch, max_len, 1]) alone when no flag is
        set; otherwise a list starting with the padded ids followed by the
        requested extras in the order of the flags above.

    Raises:
        ValueError: if `insts` is empty.
    """
    lengths = [len(inst) for inst in insts]
    max_len = max(lengths)

    # Any token included in dict can be used to pad, since the paddings' loss
    # will be masked out by weights and make no effect on parameter gradients.
    inst_data = np.array(_pad_sequences(insts, max_len, pad_idx))
    return_list = [inst_data.astype("int64").reshape([-1, max_len, 1])]

    # position data
    if return_pos:
        inst_pos = np.array(
            [list(range(n)) + [pad_idx] * (max_len - n) for n in lengths])
        return_list.append(inst_pos.astype("int64").reshape([-1, max_len, 1]))

    if return_input_mask:
        # This is used to avoid attention on paddings.
        input_mask_data = np.expand_dims(
            np.array(_length_masks(insts, max_len)), axis=-1)
        return_list.append(input_mask_data.astype("float32"))

    if return_max_len:
        return_list.append(max_len)

    if return_num_token:
        return_list.append(sum(lengths))

    if return_seq_lens:
        return_list.append(np.array(lengths).astype("int64").reshape([-1, 1]))

    return return_list if len(return_list) > 1 else return_list[0]


def pad_feature_data(data, pad_value=0.0, dtype="float32", return_mask=False):
    """
    pad visual features with given pad value

    Args:
        data: non-empty list of items, each a sequence of equally wide
            feature rows; items may have different numbers of rows, and an
            item may have no rows at all.
        pad_value: value written into the padded rows.
        dtype: dtype of the returned arrays.
        return_mask: also return a mask marking the real rows.

    Returns:
        out_data of shape [len(data), max_rows, row_width] in `dtype`, and,
        when `return_mask` is set, out_mask of shape [len(data), max_rows, 1]
        holding 1.0 on real rows and 0 on padding.

    Raises:
        ValueError: if `data` is empty.
    """
    lengths = [len(item) for item in data]
    max_length = max(lengths)
    data_width = _feature_width(data)

    # np.full keeps the requested dtype even when pad_value is a float.
    out_data = np.full((len(data), max_length, data_width), pad_value, dtype=dtype)
    for i, (item, length) in enumerate(zip(data, lengths)):
        if length:  # an item without rows has nothing to copy
            out_data[i, :length, :] = item

    if not return_mask:
        return out_data

    out_mask = np.zeros((len(data), max_length, 1), dtype=dtype)
    for i, length in enumerate(lengths):
        out_mask[i, :length] = 1.0
    return out_data, out_mask


if __name__ == "__main__":
    pass