modified pr

1cda437c · xuyang2233 · bbca1e0d · 1cda437c · 1cda437c · 1cda437c
7 changed files
--- a/.gitignore
+++ b/.gitignore
@@ -11,6 +11,7 @@ inference/
 inference_results/
 output/
 train_data/
+log/
 *.DS_Store
 *.vs
 *.user

--- a/configs/rec/rec_r32_gaspin_bilstm_att.yml
+++ b/configs/rec/rec_r32_gaspin_bilstm_att.yml
@@ -61,7 +61,6 @@ Loss:
 PostProcess:
  name: SPINAttnLabelDecode
-  character_dict_path: ./ppocr/utils/dict/spin_dict.txt
  use_space_char: False

--- a/ppocr/losses/rec_spin_att_loss.py
+++ b/ppocr/losses/rec_spin_att_loss.py
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import absolute_import
 from __future__ import division
 from __future__ import print_function
@@ -19,6 +20,9 @@ from __future__ import print_function
 import paddle
 from paddle import nn
+'''This code is adapted from:
+https://github.com/hikopensource/DAVAR-Lab-OCR
+'''
 class SPINAttentionLoss(nn.Layer):
    def __init__(self, reduction='mean', ignore_index=-100, **kwargs):

--- a/ppocr/modeling/heads/rec_spin_att_head.py
+++ b/ppocr/modeling/heads/rec_spin_att_head.py
-# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -80,98 +80,6 @@ class SPINAttentionHead(nn.Layer):
        return probs
-class AttentionGRUCell(nn.Layer):
-    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
-        super(AttentionGRUCell, self).__init__()
-        self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False)
-        self.h2h = nn.Linear(hidden_size, hidden_size)
-        self.score = nn.Linear(hidden_size, 1, bias_attr=False)
-        self.rnn = nn.GRUCell(
-            input_size=input_size + num_embeddings, hidden_size=hidden_size)
-        self.hidden_size = hidden_size
-    def forward(self, prev_hidden, batch_H, char_onehots):
-        batch_H_proj = self.i2h(batch_H)
-        prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1)
-        res = paddle.add(batch_H_proj, prev_hidden_proj)
-        res = paddle.tanh(res)
-        e = self.score(res)
-        alpha = F.softmax(e, axis=1)
-        alpha = paddle.transpose(alpha, [0, 2, 1])
-        context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1)
-        concat_context = paddle.concat([context, char_onehots], 1)
-        cur_hidden = self.rnn(concat_context, prev_hidden)
-        return cur_hidden, alpha
-class AttentionLSTM(nn.Layer):
-    def __init__(self, in_channels, out_channels, hidden_size, **kwargs):
-        super(AttentionLSTM, self).__init__()
-        self.input_size = in_channels
-        self.hidden_size = hidden_size
-        self.num_classes = out_channels
-        self.attention_cell = AttentionLSTMCell(
-            in_channels, hidden_size, out_channels, use_gru=False)
-        self.generator = nn.Linear(hidden_size, out_channels)
-    def _char_to_onehot(self, input_char, onehot_dim):
-        input_ont_hot = F.one_hot(input_char, onehot_dim)
-        return input_ont_hot
-    def forward(self, inputs, targets=None, batch_max_length=25):
-        batch_size = inputs.shape[0]
-        num_steps = batch_max_length
-        hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros(
-            (batch_size, self.hidden_size)))
-        output_hiddens = []
-        if targets is not None:
-            for i in range(num_steps):
-                # one-hot vectors for a i-th char
-                char_onehots = self._char_to_onehot(
-                    targets[:, i], onehot_dim=self.num_classes)
-                hidden, alpha = self.attention_cell(hidden, inputs,
-                                                    char_onehots)
-                hidden = (hidden[1][0], hidden[1][1])
-                output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1))
-            output = paddle.concat(output_hiddens, axis=1)
-            probs = self.generator(output)
-        else:
-            targets = paddle.zeros(shape=[batch_size], dtype="int32")
-            probs = None
-            for i in range(num_steps):
-                char_onehots = self._char_to_onehot(
-                    targets, onehot_dim=self.num_classes)
-                hidden, alpha = self.attention_cell(hidden, inputs,
-                                                    char_onehots)
-                probs_step = self.generator(hidden[0])
-                hidden = (hidden[1][0], hidden[1][1])
-                if probs is None:
-                    probs = paddle.unsqueeze(probs_step, axis=1)
-                else:
-                    probs = paddle.concat(
-                        [probs, paddle.unsqueeze(
-                            probs_step, axis=1)], axis=1)
-                next_input = probs_step.argmax(axis=1)
-                targets = next_input
-        return probs
 class AttentionLSTMCell(nn.Layer):
    def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False):
        super(AttentionLSTMCell, self).__init__()

--- a/ppocr/modeling/necks/rnn.py
+++ b/ppocr/modeling/necks/rnn.py
@@ -70,17 +70,6 @@ class BidirectionalLSTM(nn.Layer):
            self.linear = nn.Linear(hidden_size * 2, output_size)
    def forward(self, input_feature):
-        """
-        Args:
-            input_feature (Torch.Tensor): visual feature [batch_size x T x input_size]
-        Returns:
-            Torch.Tensor: LSTM output contextual feature [batch_size x T x output_size]
-        """
-        # self.rnn.flatten_parameters() # error in export_model
        recurrent, _ = self.rnn(input_feature)  # batch_size x T x input_size -> batch_size x T x (2*hidden_size)
        if self.with_linear:
            output = self.linear(recurrent)     # batch_size x T x output_size

--- a/ppocr/modeling/transforms/gaspin_transformer.py
+++ b/ppocr/modeling/transforms/gaspin_transformer.py
-# copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
+# copyright (c) 2022 PaddlePaddle Authors. All Rights Reserve.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -71,14 +71,14 @@ class SP_TransformerNetwork(nn.Layer):
        """
        Args:
-            batch_I (torch.Tensor): batch of input images [batch_size x nc x I_height x I_width]
+            batch_I (Tensor): batch of input images [batch_size x nc x I_height x I_width]
            weights:
            offsets: the predicted offset by AIN, a scalar
            lambda_color: the learnable update gate \alpha in Equa. (5) as
                          g(x) = (1 - \alpha) \odot x + \alpha \odot x_{offsets}
        Returns:
-            torch.Tensor: transformed images by SPN as Equa. (4) in Ref. [1]
+            Tensor: transformed images by SPN as Equa. (4) in Ref. [1]
                        [batch_size x I_channel_num x I_r_height x I_r_width]
        """
@@ -114,8 +114,6 @@ class GA_SPIN_Transformer(nn.Layer):
            in_channels (int): channel of input features,
                                set it to 1 if the grayscale images and 3 if RGB input
            I_r_size (tuple): size of rectified images (used in STN transformations)
-            inputDataType (str): the type of input data,
-                                only support 'torch.cuda.FloatTensor' this version
            offsets (bool): set it to False if use SPN w.o. AIN,
                            and set it to True if use SPIN (both with SPN and AIN)
            norm_type (str): the normalization type of the module,
@@ -123,6 +121,7 @@ class GA_SPIN_Transformer(nn.Layer):
            default_type (int): the K chromatic space,
                                set it to 3/5/6 depend on the complexity of transformation intensities
            loc_lr (float): learning rate of location network
+            stn (bool): whether to use stn.
        """
        super(GA_SPIN_Transformer, self).__init__()
@@ -233,12 +232,12 @@ class GA_SPIN_Transformer(nn.Layer):
    def forward(self, x, return_weight=False):
        """
        Args:
-            x (torch.cuda.FloatTensor): input image batch
+            x (Tensor): input image batch
            return_weight (bool): set to False by default,
                                  if set to True return the predicted offsets of AIN, denoted as x_{offsets}
        Returns:
-            torch.Tensor: rectified image [batch_size x I_channel_num x I_height x I_width], the same as the input size
+            Tensor: rectified image [batch_size x I_channel_num x I_height x I_width], the same as the input size
        """
        if self.spt:

--- a/tools/export_model.py
+++ b/tools/export_model.py
@@ -73,12 +73,6 @@ def export_single_model(model, arch_config, save_path, logger, quanter=None):
                shape=[None, 3, 64, 512], dtype="float32"),
        ]
        model = to_static(model, input_spec=other_shape)
-    elif arch_config["algorithm"] == "SPIN":
-        other_shape = [
-            paddle.static.InputSpec(
-                shape=[None, 1, 32, 100], dtype="float32"),
-        ]
-        model = to_static(model, input_spec=other_shape)
    else:
        infer_shape = [3, -1, -1]
        if arch_config["model_type"] == "rec":