# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import math

import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import numpy as np

from .self_attention.model import wrap_encoder
from .self_attention.model import wrap_encoder_forFeature

gradient_clip = 10


class SRNPredict(object):
    """
    SRN: see arxiv: https://arxiv.org/abs/2003.12294
    args:
        params(dict): the hyper-parameters used to build the network
    """

    def __init__(self, params):
        super(SRNPredict, self).__init__()
        self.char_num = params['char_num']
        self.max_length = params['max_text_length']

        self.num_heads = params['num_heads']
        self.num_encoder_TUs = params['num_encoder_TUs']
        self.num_decoder_TUs = params['num_decoder_TUs']
        self.hidden_dims = params['hidden_dims']

    def pvam(self, inputs, others):
        """
        Parallel Visual Attention Module (PVAM)
        args:
            inputs(variable): feature map extracted by the backbone network
            others(dict): position tensors and self-attention biases
        return: pvam_features
        """
        b, c, h, w = inputs.shape
        conv_features = fluid.layers.reshape(x=inputs, shape=[-1, c, h * w])
        conv_features = fluid.layers.transpose(x=conv_features, perm=[0, 2, 1])

        #===== Transformer encoder =====
        b, t, c = conv_features.shape
        encoder_word_pos = others["encoder_word_pos"]
        gsrm_word_pos = others["gsrm_word_pos"]
        enc_inputs = [conv_features, encoder_word_pos, None]
        word_features = wrap_encoder_forFeature(
            src_vocab_size=-1,
            max_length=t,
            n_layer=self.num_encoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs, )
        fluid.clip.set_gradient_clip(
            fluid.clip.GradientClipByValue(gradient_clip))

        #===== Parallel Visual Attention Module =====
        b, t, c = word_features.shape

        # Combine the encoded visual features with a learned reading-order
        # (position) embedding, then score every visual position for every
        # reading step with a shared fc layer.
        word_features = fluid.layers.fc(word_features, c, num_flatten_dims=2)
        word_features_ = fluid.layers.reshape(word_features, [-1, 1, t, c])
        word_features_ = fluid.layers.expand(word_features_,
                                             [1, self.max_length, 1, 1])
        word_pos_feature = fluid.layers.embedding(gsrm_word_pos,
                                                  [self.max_length, c])
        word_pos_ = fluid.layers.reshape(word_pos_feature,
                                         [-1, self.max_length, 1, c])
        word_pos_ = fluid.layers.expand(word_pos_, [1, 1, t, 1])
        temp = fluid.layers.elementwise_add(
            word_features_, word_pos_, act='tanh')

        attention_weight = fluid.layers.fc(input=temp,
                                           size=1,
                                           num_flatten_dims=3,
                                           bias_attr=False)
        attention_weight = fluid.layers.reshape(
            x=attention_weight, shape=[-1, self.max_length, t])
        attention_weight = fluid.layers.softmax(
            input=attention_weight, axis=-1)

        pvam_features = fluid.layers.matmul(attention_weight,
                                            word_features)  # [b, max_length, c]

        return pvam_features
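    # --- PVAM shape walk-through (illustrative only; the concrete numbers
    # below assume hidden_dims=512, an 8 x 32 backbone feature map and
    # max_text_length=25, none of which is fixed by this file) ---
    #   conv_features    : [b, 256, 512]       t = h * w = 256
    #   word_features_   : [b, 25, 256, 512]   broadcast over reading steps
    #   word_pos_        : [b, 25, 256, 512]   reading-order embedding
    #   attention_weight : [b, 25, 256]        softmax over the t axis
    #   pvam_features    : [b, 25, 512]        one visual "glimpse" per step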
    def gsrm(self, pvam_features, others):
        """
        Global Semantic Reasoning Module (GSRM)
        args:
            pvam_features(variable): feature map produced by the PVAM
            others(dict): position tensors and self-attention biases
        return: gsrm_features, word_out, gsrm_out
        """

        #===== GSRM Visual-to-semantic embedding block =====
        b, t, c = pvam_features.shape
        word_out = fluid.layers.fc(
            input=fluid.layers.reshape(pvam_features, [-1, c]),
            size=self.char_num,
            act="softmax")
        #word_out.stop_gradient = True
        word_ids = fluid.layers.argmax(word_out, axis=1)
        word_ids.stop_gradient = True
        word_ids = fluid.layers.reshape(x=word_ids, shape=[-1, t, 1])

        #===== GSRM Semantic reasoning block =====
        """
        This block is built from two transformers (a bi-transformer):
        gsrm_feature1 comes from the forward branch, gsrm_feature2 from the
        backward branch.
        """
        pad_idx = self.char_num

        gsrm_word_pos = others["gsrm_word_pos"]
        gsrm_slf_attn_bias1 = others["gsrm_slf_attn_bias1"]
        gsrm_slf_attn_bias2 = others["gsrm_slf_attn_bias2"]

        def prepare_bi(word_ids):
            """
            Prepare the two input streams for the GSRM:
            word1 feeds the forward branch, word2 the backward branch.
            """
            word1 = fluid.layers.cast(word_ids, "float32")
            word1 = fluid.layers.pad(word1, [0, 0, 1, 0, 0, 0],
                                     pad_value=1.0 * pad_idx)
            word1 = fluid.layers.cast(word1, "int64")
            word1 = word1[:, :-1, :]
            word2 = word_ids
            return word1, word2

        word1, word2 = prepare_bi(word_ids)
        word1.stop_gradient = True
        word2.stop_gradient = True
        enc_inputs_1 = [word1, gsrm_word_pos, gsrm_slf_attn_bias1]
        enc_inputs_2 = [word2, gsrm_word_pos, gsrm_slf_attn_bias2]

        gsrm_feature1 = wrap_encoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs_1, )
        gsrm_feature2 = wrap_encoder(
            src_vocab_size=self.char_num + 1,
            max_length=self.max_length,
            n_layer=self.num_decoder_TUs,
            n_head=self.num_heads,
            d_key=int(self.hidden_dims / self.num_heads),
            d_value=int(self.hidden_dims / self.num_heads),
            d_model=self.hidden_dims,
            d_inner_hid=self.hidden_dims,
            prepostprocess_dropout=0.1,
            attention_dropout=0.1,
            relu_dropout=0.1,
            preprocess_cmd="n",
            postprocess_cmd="da",
            weight_sharing=True,
            enc_inputs=enc_inputs_2, )
        gsrm_feature2 = fluid.layers.pad(gsrm_feature2, [0, 0, 0, 1, 0, 0],
                                         pad_value=0.)
        gsrm_feature2 = gsrm_feature2[:, 1:, ]
        gsrm_features = gsrm_feature1 + gsrm_feature2

        b, t, c = gsrm_features.shape

        # Project the fused semantic features back onto the vocabulary by
        # reusing the shared word-embedding table of the transformers.
        gsrm_out = fluid.layers.matmul(
            x=gsrm_features,
            y=fluid.default_main_program().global_block().var(
                "src_word_emb_table"),
            transpose_y=True)
        b, t, c = gsrm_out.shape
        gsrm_out = fluid.layers.softmax(
            input=fluid.layers.reshape(gsrm_out, [-1, c]))

        return gsrm_features, word_out, gsrm_out
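    # --- GSRM bi-directional shift (illustrative) ---
    # prepare_bi() derives the two transformer input streams from the PVAM
    # argmax ids.  With ids [c0, c1, c2, c3] and pad_idx = char_num:
    #   word1 (forward stream)  : [pad, c0, c1, c2]   right-shifted, last id dropped
    #   word2 (backward stream) : [c0, c1, c2, c3]
    # gsrm_slf_attn_bias1 / gsrm_slf_attn_bias2 are expected to be the forward
    # and backward self-attention masks; they are built outside this module.
    # gsrm_feature2 is padded at the end and sliced with [:, 1:, ] so that its
    # output at step i is aligned with the context of the following steps;
    # summing it with gsrm_feature1 gives every position both left and right
    # semantic context.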
    def vsfd(self, pvam_features, gsrm_features):
        """
        Visual-Semantic Fusion Decoder (VSFD) Module
        args:
            pvam_features(variable): feature map produced by the PVAM
            gsrm_features(variable): feature map produced by the GSRM
        return: fc_out
        """

        #===== Visual-Semantic Fusion Decoder Module =====
        b, t, c1 = pvam_features.shape
        b, t, c2 = gsrm_features.shape
        combine_features_ = fluid.layers.concat(
            [pvam_features, gsrm_features], axis=2)
        img_comb_features_ = fluid.layers.reshape(
            x=combine_features_, shape=[-1, c1 + c2])
        # Learn a per-position gate from the concatenated features and use it
        # to blend the visual (PVAM) and semantic (GSRM) representations.
        img_comb_features_map = fluid.layers.fc(input=img_comb_features_,
                                                size=c1,
                                                act="sigmoid")
        img_comb_features_map = fluid.layers.reshape(
            x=img_comb_features_map, shape=[-1, t, c1])
        combine_features = img_comb_features_map * pvam_features + (
            1.0 - img_comb_features_map) * gsrm_features
        img_comb_features = fluid.layers.reshape(
            x=combine_features, shape=[-1, c1])

        fc_out = fluid.layers.fc(input=img_comb_features,
                                 size=self.char_num,
                                 act="softmax")
        return fc_out

    def __call__(self, inputs, others, mode=None):
        pvam_features = self.pvam(inputs, others)
        gsrm_features, word_out, gsrm_out = self.gsrm(pvam_features, others)
        final_out = self.vsfd(pvam_features, gsrm_features)

        _, decoded_out = fluid.layers.topk(input=final_out, k=1)

        predicts = {
            'predict': final_out,
            'decoded_out': decoded_out,
            'word_out': word_out,
            'gsrm_out': gsrm_out
        }

        return predicts
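# --- Usage sketch (illustrative; the parameter values, variable names and
#     shapes below are assumptions, not defined in this file) ---
# params = {
#     'char_num': 38,            # size of the character set (assumed)
#     'max_text_length': 25,
#     'num_heads': 8,
#     'num_encoder_TUs': 2,
#     'num_decoder_TUs': 4,
#     'hidden_dims': 512,
# }
# srn_head = SRNPredict(params)
# # "others" must carry the position / mask tensors read above, e.g.:
# #   encoder_word_pos    : int64,   [b, h * w, 1]
# #   gsrm_word_pos       : int64,   [b, max_text_length, 1]
# #   gsrm_slf_attn_bias1 : float32, [b, num_heads, max_text_length, max_text_length]
# #   gsrm_slf_attn_bias2 : float32, [b, num_heads, max_text_length, max_text_length]
# predicts = srn_head(conv_feature_map, others)   # conv_feature_map: [b, c, h, w]
# # predicts keys: 'predict', 'decoded_out', 'word_out', 'gsrm_out'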