# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import paddle
import numpy as np


class VQATokenPad(object):
    """Pad (or, in infer mode, truncate) tokenized VQA inputs to a fixed length.

    Pads ``input_ids``, ``token_type_ids``, ``bbox`` and ``labels`` to
    ``max_seq_len``, builds the matching ``attention_mask``, and converts the
    padded fields to ``int64`` numpy arrays.
    """

    def __init__(self,
                 max_seq_len=512,
                 pad_to_max_seq_len=True,
                 return_attention_mask=True,
                 return_token_type_ids=True,
                 truncation_strategy="longest_first",
                 return_overflowing_tokens=False,
                 return_special_tokens_mask=False,
                 infer_mode=False,
                 **kwargs):
        self.max_seq_len = max_seq_len
        self.pad_to_max_seq_len = pad_to_max_seq_len
        self.return_attention_mask = return_attention_mask
        self.return_token_type_ids = return_token_type_ids
        self.truncation_strategy = truncation_strategy
        self.return_overflowing_tokens = return_overflowing_tokens
        self.return_special_tokens_mask = return_special_tokens_mask
        # Labels at padded positions are ignored by the loss.
        self.pad_token_label_id = paddle.nn.CrossEntropyLoss().ignore_index
        self.infer_mode = infer_mode

    def __call__(self, data):
        needs_to_be_padded = self.pad_to_max_seq_len and len(
            data["input_ids"]) < self.max_seq_len

        if needs_to_be_padded:
            if 'tokenizer_params' in data:
                tokenizer_params = data.pop('tokenizer_params')
            else:
                tokenizer_params = dict(
                    padding_side='right', pad_token_type_id=0, pad_token_id=1)

            difference = self.max_seq_len - len(data["input_ids"])
            if tokenizer_params['padding_side'] == 'right':
                if self.return_attention_mask:
                    data["attention_mask"] = [1] * len(data[
                        "input_ids"]) + [0] * difference
                if self.return_token_type_ids:
                    data["token_type_ids"] = (
                        data["token_type_ids"] +
                        [tokenizer_params['pad_token_type_id']] * difference)
                if self.return_special_tokens_mask:
                    data["special_tokens_mask"] = data[
                        "special_tokens_mask"] + [1] * difference
                data["input_ids"] = data["input_ids"] + [
                    tokenizer_params['pad_token_id']
                ] * difference
                if not self.infer_mode:
                    data["labels"] = data[
                        "labels"] + [self.pad_token_label_id] * difference
                data["bbox"] = data["bbox"] + [[0, 0, 0, 0]] * difference
            elif tokenizer_params['padding_side'] == 'left':
                if self.return_attention_mask:
                    data["attention_mask"] = [0] * difference + [
                        1
                    ] * len(data["input_ids"])
                if self.return_token_type_ids:
                    data["token_type_ids"] = (
                        [tokenizer_params['pad_token_type_id']] * difference +
                        data["token_type_ids"])
                if self.return_special_tokens_mask:
                    data["special_tokens_mask"] = [
                        1
                    ] * difference + data["special_tokens_mask"]
                data["input_ids"] = [tokenizer_params['pad_token_id']
                                     ] * difference + data["input_ids"]
                if not self.infer_mode:
                    data["labels"] = [self.pad_token_label_id
                                      ] * difference + data["labels"]
                data["bbox"] = [[0, 0, 0, 0]] * difference + data["bbox"]
        else:
            if self.return_attention_mask:
                data["attention_mask"] = [1] * len(data["input_ids"])

        for key in data:
            if key in [
                    'input_ids', 'labels', 'token_type_ids', 'bbox',
                    'attention_mask'
            ]:
                if self.infer_mode:
                    if key != 'labels':
                        # At inference time there are no labels; truncate the
                        # remaining fields to max_seq_len instead of padding.
                        length = min(len(data[key]), self.max_seq_len)
                        data[key] = data[key][:length]
                    else:
                        continue
                data[key] = np.array(data[key], dtype='int64')
        return data
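

# A minimal usage sketch (not part of the original transform): the sample
# values below are hypothetical and only illustrate the input keys that an
# upstream tokenization step would normally produce.
if __name__ == "__main__":
    pad = VQATokenPad(max_seq_len=8)
    sample = {
        "input_ids": [0, 113, 57, 98, 2],  # hypothetical token ids
        "token_type_ids": [0, 0, 0, 0, 0],
        "bbox": [[0, 0, 0, 0], [10, 20, 30, 40], [10, 20, 30, 40],
                 [50, 60, 70, 80], [0, 0, 0, 0]],  # hypothetical boxes
        "labels": [-100, 1, 2, 2, -100],  # hypothetical per-token labels
    }
    padded = pad(sample)
    for key, value in padded.items():
        # Each padded field comes back as an int64 numpy array of length 8
        # (bbox is 8 x 4); attention_mask marks the 5 real tokens with 1s.
        print(key, value.shape)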