diff --git a/.gitignore b/.gitignore index 34f0e0cc9c8394d298f0bf730c5f3ac24470412d..3300be325f1f6c8b2b58301fc87a4f9d241afb84 100644 --- a/.gitignore +++ b/.gitignore @@ -11,6 +11,7 @@ inference/ inference_results/ output/ train_data/ +log/ *.DS_Store *.vs *.user diff --git a/configs/rec/rec_r32_gaspin_bilstm_att.yml b/configs/rec/rec_r32_gaspin_bilstm_att.yml index 236a17c43d34cf873b5ef6ffc12da6cf09194f03..e8235415c7e75addfc5599e181844a227d1e4eff 100644 --- a/configs/rec/rec_r32_gaspin_bilstm_att.yml +++ b/configs/rec/rec_r32_gaspin_bilstm_att.yml @@ -61,7 +61,6 @@ Loss: PostProcess: name: SPINAttnLabelDecode - character_dict_path: ./ppocr/utils/dict/spin_dict.txt use_space_char: False diff --git a/ppocr/losses/rec_spin_att_loss.py b/ppocr/losses/rec_spin_att_loss.py index 37fd93da5750a353e30c4b9f0574e0e634b50d79..195780c7bfaf4aae5dd23bd72ace268bed9c1d4f 100644 --- a/ppocr/losses/rec_spin_att_loss.py +++ b/ppocr/losses/rec_spin_att_loss.py @@ -1,4 +1,4 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,6 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+ from __future__ import absolute_import from __future__ import division from __future__ import print_function @@ -19,6 +20,9 @@ from __future__ import print_function import paddle from paddle import nn +'''This code is refer from: +https://github.com/hikopensource/DAVAR-Lab-OCR +''' class SPINAttentionLoss(nn.Layer): def __init__(self, reduction='mean', ignore_index=-100, **kwargs): diff --git a/ppocr/modeling/heads/rec_spin_att_head.py b/ppocr/modeling/heads/rec_spin_att_head.py index 94e69a7ede926e76fb4df4c0cb5077644cd9737f..07a58b08327a602e9402cc122a6aedfe610fefd5 100644 --- a/ppocr/modeling/heads/rec_spin_att_head.py +++ b/ppocr/modeling/heads/rec_spin_att_head.py @@ -1,4 +1,4 @@ -# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -80,98 +80,6 @@ class SPINAttentionHead(nn.Layer): return probs -class AttentionGRUCell(nn.Layer): - def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): - super(AttentionGRUCell, self).__init__() - self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) - self.h2h = nn.Linear(hidden_size, hidden_size) - self.score = nn.Linear(hidden_size, 1, bias_attr=False) - - self.rnn = nn.GRUCell( - input_size=input_size + num_embeddings, hidden_size=hidden_size) - - self.hidden_size = hidden_size - - def forward(self, prev_hidden, batch_H, char_onehots): - - batch_H_proj = self.i2h(batch_H) - prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1) - - res = paddle.add(batch_H_proj, prev_hidden_proj) - res = paddle.tanh(res) - e = self.score(res) - - alpha = F.softmax(e, axis=1) - alpha = paddle.transpose(alpha, [0, 2, 1]) - context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) - concat_context = paddle.concat([context, char_onehots], 1) - - cur_hidden = self.rnn(concat_context, 
prev_hidden) - - return cur_hidden, alpha - - -class AttentionLSTM(nn.Layer): - def __init__(self, in_channels, out_channels, hidden_size, **kwargs): - super(AttentionLSTM, self).__init__() - self.input_size = in_channels - self.hidden_size = hidden_size - self.num_classes = out_channels - - self.attention_cell = AttentionLSTMCell( - in_channels, hidden_size, out_channels, use_gru=False) - self.generator = nn.Linear(hidden_size, out_channels) - - def _char_to_onehot(self, input_char, onehot_dim): - input_ont_hot = F.one_hot(input_char, onehot_dim) - return input_ont_hot - - def forward(self, inputs, targets=None, batch_max_length=25): - batch_size = inputs.shape[0] - num_steps = batch_max_length - - hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros( - (batch_size, self.hidden_size))) - output_hiddens = [] - - if targets is not None: - for i in range(num_steps): - # one-hot vectors for a i-th char - char_onehots = self._char_to_onehot( - targets[:, i], onehot_dim=self.num_classes) - hidden, alpha = self.attention_cell(hidden, inputs, - char_onehots) - - hidden = (hidden[1][0], hidden[1][1]) - output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1)) - output = paddle.concat(output_hiddens, axis=1) - probs = self.generator(output) - - else: - targets = paddle.zeros(shape=[batch_size], dtype="int32") - probs = None - - for i in range(num_steps): - char_onehots = self._char_to_onehot( - targets, onehot_dim=self.num_classes) - hidden, alpha = self.attention_cell(hidden, inputs, - char_onehots) - probs_step = self.generator(hidden[0]) - hidden = (hidden[1][0], hidden[1][1]) - if probs is None: - probs = paddle.unsqueeze(probs_step, axis=1) - else: - probs = paddle.concat( - [probs, paddle.unsqueeze( - probs_step, axis=1)], axis=1) - - next_input = probs_step.argmax(axis=1) - - targets = next_input - - return probs - - class AttentionLSTMCell(nn.Layer): def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): super(AttentionLSTMCell, 
self).__init__() diff --git a/ppocr/modeling/necks/rnn.py b/ppocr/modeling/necks/rnn.py index 32e626c3fcfcf9c578772e99941f791083aabf31..33be9400b34cb535d260881748e179c3df106caa 100644 --- a/ppocr/modeling/necks/rnn.py +++ b/ppocr/modeling/necks/rnn.py @@ -70,17 +70,6 @@ class BidirectionalLSTM(nn.Layer): self.linear = nn.Linear(hidden_size * 2, output_size) def forward(self, input_feature): - """ - - Args: - input_feature (Torch.Tensor): visual feature [batch_size x T x input_size] - - Returns: - Torch.Tensor: LSTM output contextual feature [batch_size x T x output_size] - - """ - - # self.rnn.flatten_parameters() # error in export_model recurrent, _ = self.rnn(input_feature) # batch_size x T x input_size -> batch_size x T x (2*hidden_size) if self.with_linear: output = self.linear(recurrent) # batch_size x T x output_size diff --git a/ppocr/modeling/transforms/gaspin_transformer.py b/ppocr/modeling/transforms/gaspin_transformer.py index 331c82aaeb0b176950621faf5e095cfc806a6257..9440e360d151e4dfd0715a8ecf26bf47c55e238a 100644 --- a/ppocr/modeling/transforms/gaspin_transformer.py +++ b/ppocr/modeling/transforms/gaspin_transformer.py @@ -1,4 +1,4 @@ -# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve. +# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -71,14 +71,14 @@ class SP_TransformerNetwork(nn.Layer): """ Args: - batch_I (torch.Tensor): batch of input images [batch_size x nc x I_height x I_width] + batch_I (Tensor): batch of input images [batch_size x nc x I_height x I_width] weights: offsets: the predicted offset by AIN, a scalar lambda_color: the learnable update gate \alpha in Equa. (5) as g(x) = (1 - \alpha) \odot x + \alpha \odot x_{offsets} Returns: - torch.Tensor: transformed images by SPN as Equa. (4) in Ref. [1] + Tensor: transformed images by SPN as Equa. (4) in Ref. 
[1] [batch_size x I_channel_num x I_r_height x I_r_width] """ @@ -114,8 +114,6 @@ class GA_SPIN_Transformer(nn.Layer): in_channels (int): channel of input features, set it to 1 if the grayscale images and 3 if RGB input I_r_size (tuple): size of rectified images (used in STN transformations) - inputDataType (str): the type of input data, - only support 'torch.cuda.FloatTensor' this version offsets (bool): set it to False if use SPN w.o. AIN, and set it to True if use SPIN (both with SPN and AIN) norm_type (str): the normalization type of the module, @@ -123,6 +121,7 @@ class GA_SPIN_Transformer(nn.Layer): default_type (int): the K chromatic space, set it to 3/5/6 depend on the complexity of transformation intensities loc_lr (float): learning rate of location network + stn (bool): whether to use STN. """ super(GA_SPIN_Transformer, self).__init__() @@ -233,12 +232,12 @@ class GA_SPIN_Transformer(nn.Layer): def forward(self, x, return_weight=False): """ Args: - x (torch.cuda.FloatTensor): input image batch + x (Tensor): input image batch return_weight (bool): set to False by default, if set to True return the predicted offsets of AIN, denoted as x_{offsets} Returns: - torch.Tensor: rectified image [batch_size x I_channel_num x I_height x I_width], the same as the input size + Tensor: rectified image [batch_size x I_channel_num x I_height x I_width], the same as the input size """ if self.spt: diff --git a/tools/export_model.py b/tools/export_model.py index b8bc5e1ed835dbb8cd581c7d1034abd5b71edb19..3ea0228f857a2fadb36678ecd3b91bc865e56e46 100755 --- a/tools/export_model.py +++ b/tools/export_model.py @@ -73,12 +73,6 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None): shape=[None, 3, 64, 512], dtype="float32"), ] model = to_static(model, input_spec=other_shape) - elif arch_config["algorithm"] == "SPIN": - other_shape = [ - paddle.static.InputSpec( - shape=[None, 1, 32, 100], dtype="float32"), - ] - model = to_static(model, 
input_spec=other_shape) else: infer_shape = [3, -1, -1] if arch_config["model_type"] == "rec":