提交 b6f0a903 编写于 作者: T Topdu

add rec_nrtr

上级 6127aad9
use_gpu: True
epoch_num: 21
log_smooth_window: 20
print_batch_step: 10
save_model_dir: ./output/rec/nrtr_final/
save_epoch_step: 1
# evaluation is run every 2000 iterations
eval_batch_step: [0, 2000]
cal_metric_during_train: True
use_visualdl: False
infer_img: doc/imgs_words_en/word_10.png
# for data or label process
character_type: EN_symbol
max_text_length: 25
infer_mode: False
use_space_char: True
save_res_path: ./output/rec/predicts_nrtr.txt
name: Adam
beta1: 0.9
beta2: 0.99
clip_norm: 5.0
name: Cosine
learning_rate: 0.0005
warmup_epoch: 2
name: 'L2'
factor: 0.
model_type: rec
algorithm: NRTR
in_channels: 1
name: MTB
cnn_num: 2
name: TransformerOptim
d_model: 512
num_encoder_layers: 6
beam_size: -1 # When beam_size is greater than 0, beam search is used during evaluation.
name: NRTRLoss
smoothing: True
name: NRTRLabelDecode
name: RecMetric
main_indicator: acc
name: LMDBDataSet
data_dir: /paddle/data/ocr_data/training/
- NRTRDecodeImage: # load image
img_mode: BGR
channel_first: False
- NRTRLabelEncode: # Class handling label
- PILResize:
image_shape: [100, 32]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: True
batch_size_per_card: 512
drop_last: True
num_workers: 8
name: LMDBDataSet
data_dir: /paddle/data/ocr_data/evaluation/
- NRTRDecodeImage: # load image
img_mode: BGR
channel_first: False
- NRTRLabelEncode: # Class handling label
- PILResize:
image_shape: [100, 32]
- KeepKeys:
keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
shuffle: False
drop_last: False
batch_size_per_card: 256
num_workers: 1
use_shared_memory: False
...@@ -159,6 +159,34 @@ class BaseRecLabelEncode(object): ...@@ -159,6 +159,34 @@ class BaseRecLabelEncode(object):
return text_list return text_list
class NRTRLabelEncode(BaseRecLabelEncode):
""" Convert between text-label and text-index """
def __init__(self,
self).__init__(max_text_length, character_dict_path,
character_type, use_space_char)
def __call__(self, data):
    """Encode ``data['label']`` into a padded index array.

    The label text is converted with ``self.encode``. When that returns
    ``None`` the sample is rejected and ``None`` is returned. Otherwise
    ``data['length']`` is set to the number of encoded items, index 2
    (``'<s>'`` in ``add_special_char``) is placed in front, and the list
    is padded at the end with index 0 (``'blank'``) up to
    ``self.max_text_len`` entries before being stored in ``data['label']``.

    Args:
        data: mapping holding the raw text under ``'label'``; it is
            updated in place.

    Returns:
        The updated ``data``, or ``None`` when encoding fails.
    """
    encoded = self.encode(data['label'])
    if encoded is None:
        return None

    # The stored length counts the encoded items only, without the start token.
    data['length'] = np.array(len(encoded))

    encoded.insert(0, 2)
    # NOTE(review): a negative pad count adds no padding, so if encode() can
    # return max_text_len items the label ends up one entry too long —
    # confirm against BaseRecLabelEncode.encode.
    pad_count = self.max_text_len - len(encoded)
    data['label'] = np.array(encoded + [0] * pad_count)
    return data
def add_special_char(self, dict_character):
    """Return a new list with the reserved tokens ahead of ``dict_character``.

    The reserved tokens take indices 0-3: ``'blank'``, ``'<unk>'``,
    ``'<s>'`` and ``'</s>'``. The input list is not modified.
    """
    reserved_tokens = ['blank', '<unk>', '<s>', '</s>']
    return reserved_tokens + dict_character
class CTCLabelEncode(BaseRecLabelEncode): class CTCLabelEncode(BaseRecLabelEncode):
""" Convert between text-label and text-index """ """ Convert between text-label and text-index """
...@@ -26,12 +26,13 @@ def build_head(config): ...@@ -26,12 +26,13 @@ def build_head(config):
from .rec_ctc_head import CTCHead from .rec_ctc_head import CTCHead
from .rec_att_head import AttentionHead from .rec_att_head import AttentionHead
from .rec_srn_head import SRNHead from .rec_srn_head import SRNHead
from .rec_nrtr_optim_head import TransformerOptim
# cls head # cls head
from .cls_head import ClsHead from .cls_head import ClsHead
support_dict = [ support_dict = [
'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead', 'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead',
'SRNHead', 'PGHead'] 'SRNHead', 'PGHead', 'TransformerOptim']
module_name = config.pop('name') module_name = config.pop('name')
import paddle
from paddle import nn
import paddle.nn.functional as F
from paddle.nn import Linear
from paddle.nn.initializer import XavierUniform as xavier_uniform_
from paddle.nn.initializer import Constant as constant_
from paddle.nn.initializer import XavierNormal as xavier_normal_
zeros_ = constant_(value=0.)
ones_ = constant_(value=1.)
class MultiheadAttention(nn.Layer):
r"""Allows the model to jointly attend to information
from different representation subspaces.
See reference: Attention Is All You Need
.. math::
\text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
\text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
embed_dim: total dimension of the model
num_heads: parallel attention layers, or heads
>>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)
def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False):
super(MultiheadAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5
self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
if add_bias_kv:
self.bias_k = self.create_parameter(
shape=(1, 1, embed_dim), default_initializer=zeros_)
self.add_parameter("bias_k", self.bias_k)
self.bias_v = self.create_parameter(
shape=(1, 1, embed_dim), default_initializer=zeros_)
self.add_parameter("bias_v", self.bias_v)
self.bias_k = self.bias_v = None
self.add_zero_attn = add_zero_attn
self.conv1 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
self.conv2 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim * 2, kernel_size=(1, 1))
self.conv3 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim * 3, kernel_size=(1, 1))
def _reset_parameters(self):
if self.bias_k is not None:
if self.bias_v is not None:
def forward(self, query, key, value, key_padding_mask=None, incremental_state=None,
need_weights=True, static_kv=False, attn_mask=None, qkv_ = [False,False,False]):
Inputs of forward function
query: [target length, batch size, embed dim]
key: [sequence length, batch size, embed dim]
value: [sequence length, batch size, embed dim]
key_padding_mask: if True, mask padding based on batch size
incremental_state: if provided, previous time steps are cached
need_weights: output attn_output_weights
static_kv: key and value are static
Outputs of forward function
attn_output: [target length, batch size, embed dim]
attn_output_weights: [batch size, target length, sequence length]
qkv_same = qkv_[0]
kv_same = qkv_[1]
tgt_len, bsz, embed_dim = query.shape
assert embed_dim == self.embed_dim
assert list(query.shape) == [tgt_len, bsz, embed_dim]
assert key.shape == value.shape
if qkv_same:
# self-attention
q, k, v = self._in_proj_qkv(query)
elif kv_same:
# encoder-decoder attention
q = self._in_proj_q(query)
if key is None:
assert value is None
k = v = None
k, v = self._in_proj_kv(key)
q = self._in_proj_q(query)
k = self._in_proj_k(key)
v = self._in_proj_v(value)
q *= self.scaling
if self.bias_k is not None:
assert self.bias_v is not None
self.bias_k = paddle.concat([self.bias_k for i in range(bsz)],axis=1)
self.bias_v = paddle.concat([self.bias_v for i in range(bsz)],axis=1)
k = paddle.concat([k, self.bias_k])
v = paddle.concat([v, self.bias_v])
if attn_mask is not None:
attn_mask = paddle.concat([attn_mask, paddle.zeros([attn_mask.shape[0], 1],dtype=attn_mask.dtype)], axis=1)
if key_padding_mask is not None:
key_padding_mask = paddle.concat(
[key_padding_mask,paddle.zeros([key_padding_mask.shape[0], 1],dtype=key_padding_mask.dtype)], axis=1)
q = q.reshape([tgt_len, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
if k is not None:
k = k.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
if v is not None:
v = v.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
src_len = k.shape[1]
if key_padding_mask is not None:
assert key_padding_mask.shape[0] == bsz
assert key_padding_mask.shape[1] == src_len
if self.add_zero_attn:
src_len += 1
k = paddle.concat([k, paddle.zeros((k.shape[0], 1) + k.shape[2:],dtype=k.dtype)], axis=1)
v = paddle.concat([v, paddle.zeros((v.shape[0], 1) + v.shape[2:],dtype=v.dtype)], axis=1)
if attn_mask is not None:
attn_mask = paddle.concat([attn_mask, paddle.zeros([attn_mask.shape[0], 1],dtype=attn_mask.dtype)], axis=1)
if key_padding_mask is not None:
key_padding_mask = paddle.concat(
[key_padding_mask, paddle.zeros([key_padding_mask.shape[0], 1],dtype=key_padding_mask.dtype)], axis=1)
attn_output_weights = paddle.bmm(q, k.transpose([0,2,1]))
assert list(attn_output_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
if attn_mask is not None:
attn_mask = attn_mask.unsqueeze(0)
attn_output_weights += attn_mask
if key_padding_mask is not None:
attn_output_weights = attn_output_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
key = key_padding_mask.unsqueeze(1).unsqueeze(2).astype('float32')
y = paddle.full(shape=key.shape, dtype='float32', fill_value='-inf')
y = paddle.where(key==0.,key, y)
attn_output_weights += y
attn_output_weights = attn_output_weights.reshape([bsz*self.num_heads, tgt_len, src_len])
attn_output_weights = F.softmax(
attn_output_weights.astype('float32'), axis=-1,
dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16 else attn_output_weights.dtype)
attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training)
attn_output = paddle.bmm(attn_output_weights, v)
assert list(attn_output.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
attn_output = attn_output.transpose([1, 0,2]).reshape([tgt_len, bsz, embed_dim])
attn_output = self.out_proj(attn_output)
if need_weights:
# average attention weights over heads
attn_output_weights = attn_output_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
attn_output_weights = attn_output_weights.sum(axis=1) / self.num_heads
attn_output_weights = None
return attn_output, attn_output_weights
def _in_proj_qkv(self, query):
    """Project ``query`` with ``self.conv3`` and split the result in three.

    ``self.conv3`` is a 1x1 ``Conv2D`` with ``embed_dim * 3`` output
    channels, so the three chunks taken along the last axis are what
    ``forward`` unpacks as ``q, k, v`` for self-attention.

    Assumes ``query`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    # Move the first axis last and insert a singleton axis at position 2 so
    # the tensor is 4-D for the Conv2D layer.
    conv_input = paddle.unsqueeze(query.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv3(conv_input), axis=2)
    # Undo the axis permutation before splitting.
    return projected.transpose([2, 0, 1]).chunk(3, axis=-1)
def _in_proj_kv(self, key):
    """Project ``key`` with ``self.conv2`` and split the result in two.

    ``self.conv2`` is a 1x1 ``Conv2D`` with ``embed_dim * 2`` output
    channels; ``forward`` unpacks the two chunks taken along the last axis
    as ``k, v``.

    Assumes ``key`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    # Move the first axis last and insert a singleton axis at position 2 so
    # the tensor is 4-D for the Conv2D layer.
    conv_input = paddle.unsqueeze(key.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv2(conv_input), axis=2)
    # Undo the axis permutation before splitting.
    return projected.transpose([2, 0, 1]).chunk(2, axis=-1)
def _in_proj_q(self, query):
    """Project ``query`` with the 1x1 convolution ``self.conv1``.

    The axes are permuted with ``[1, 2, 0]`` and a singleton axis is added
    at position 2 before the convolution; both steps are reversed
    afterwards, so the result has the same axis order as the input.

    Assumes ``query`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    conv_input = paddle.unsqueeze(query.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv1(conv_input), axis=2)
    return projected.transpose([2, 0, 1])
def _in_proj_k(self, key):
    """Project ``key`` with the 1x1 convolution ``self.conv1``.

    The axes are permuted with ``[1, 2, 0]`` and a singleton axis is added
    at position 2 before the convolution; both steps are reversed
    afterwards, so the result has the same axis order as the input.

    Assumes ``key`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    # NOTE(review): this applies self.conv1, the same layer _in_proj_q uses
    # in this class — confirm that sharing the projection is intended.
    conv_input = paddle.unsqueeze(key.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv1(conv_input), axis=2)
    return projected.transpose([2, 0, 1])
def _in_proj_v(self, value):
    """Project ``value`` with the 1x1 convolution ``self.conv1``.

    The axes are permuted with ``[1, 2, 0]`` and a singleton axis is added
    at position 2 before the convolution; both steps are reversed
    afterwards, so the result has the same axis order as the input.

    Assumes ``value`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    # NOTE(review): this applies self.conv1, the same layer _in_proj_q uses
    # in this class — confirm that sharing the projection is intended.
    conv_input = paddle.unsqueeze(value.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv1(conv_input), axis=2)
    return projected.transpose([2, 0, 1])
class MultiheadAttentionOptim(nn.Layer):
r"""Allows the model to jointly attend to information
from different representation subspaces.
See reference: Attention Is All You Need
.. math::
\text{MultiHead}(Q, K, V) = \text{Concat}(head_1,\dots,head_h)W^O
\text{where} head_i = \text{Attention}(QW_i^Q, KW_i^K, VW_i^V)
embed_dim: total dimension of the model
num_heads: parallel attention layers, or heads
>>> multihead_attn = nn.MultiheadAttention(embed_dim, num_heads)
>>> attn_output, attn_output_weights = multihead_attn(query, key, value)
def __init__(self, embed_dim, num_heads, dropout=0., bias=True, add_bias_kv=False, add_zero_attn=False):
super(MultiheadAttentionOptim, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
self.dropout = dropout
self.head_dim = embed_dim // num_heads
assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads"
self.scaling = self.head_dim ** -0.5
self.out_proj = Linear(embed_dim, embed_dim, bias_attr=bias)
self.conv1 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
self.conv2 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
self.conv3 = paddle.nn.Conv2D(in_channels=embed_dim, out_channels=embed_dim, kernel_size=(1, 1))
def _reset_parameters(self):
def forward(self, query, key, value, key_padding_mask=None, incremental_state=None,
need_weights=True, static_kv=False, attn_mask=None):
Inputs of forward function
query: [target length, batch size, embed dim]
key: [sequence length, batch size, embed dim]
value: [sequence length, batch size, embed dim]
key_padding_mask: if True, mask padding based on batch size
incremental_state: if provided, previous time steps are cached
need_weights: output attn_output_weights
static_kv: key and value are static
Outputs of forward function
attn_output: [target length, batch size, embed dim]
attn_output_weights: [batch size, target length, sequence length]
tgt_len, bsz, embed_dim = query.shape
assert embed_dim == self.embed_dim
assert list(query.shape) == [tgt_len, bsz, embed_dim]
assert key.shape == value.shape
q = self._in_proj_q(query)
k = self._in_proj_k(key)
v = self._in_proj_v(value)
q *= self.scaling
q = q.reshape([tgt_len, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
k = k.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
v = v.reshape([-1, bsz * self.num_heads, self.head_dim]).transpose([1, 0, 2])
src_len = k.shape[1]
if key_padding_mask is not None:
assert key_padding_mask.shape[0] == bsz
assert key_padding_mask.shape[1] == src_len
attn_output_weights = paddle.bmm(q, k.transpose([0,2,1]))
assert list(attn_output_weights.shape) == [bsz * self.num_heads, tgt_len, src_len]
if attn_mask is not None:
attn_mask = attn_mask.unsqueeze(0)
attn_output_weights += attn_mask
if key_padding_mask is not None:
attn_output_weights = attn_output_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
key = key_padding_mask.unsqueeze(1).unsqueeze(2).astype('float32')
y = paddle.full(shape=key.shape, dtype='float32', fill_value='-inf')
y = paddle.where(key==0.,key, y)
attn_output_weights += y
attn_output_weights = attn_output_weights.reshape([bsz*self.num_heads, tgt_len, src_len])
attn_output_weights = F.softmax(
attn_output_weights.astype('float32'), axis=-1,
dtype=paddle.float32 if attn_output_weights.dtype == paddle.float16 else attn_output_weights.dtype)
attn_output_weights = F.dropout(attn_output_weights, p=self.dropout, training=self.training)
attn_output = paddle.bmm(attn_output_weights, v)
assert list(attn_output.shape) == [bsz * self.num_heads, tgt_len, self.head_dim]
attn_output = attn_output.transpose([1, 0,2]).reshape([tgt_len, bsz, embed_dim])
attn_output = self.out_proj(attn_output)
if need_weights:
# average attention weights over heads
attn_output_weights = attn_output_weights.reshape([bsz, self.num_heads, tgt_len, src_len])
attn_output_weights = attn_output_weights.sum(axis=1) / self.num_heads
attn_output_weights = None
return attn_output, attn_output_weights
def _in_proj_q(self, query):
    """Project ``query`` with the 1x1 convolution ``self.conv1``.

    The axes are permuted with ``[1, 2, 0]`` and a singleton axis is added
    at position 2 before the convolution; both steps are reversed
    afterwards, so the result has the same axis order as the input.

    Assumes ``query`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    conv_input = paddle.unsqueeze(query.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv1(conv_input), axis=2)
    return projected.transpose([2, 0, 1])
def _in_proj_k(self, key):
    """Project ``key`` with the 1x1 convolution ``self.conv2``.

    The axes are permuted with ``[1, 2, 0]`` and a singleton axis is added
    at position 2 before the convolution; both steps are reversed
    afterwards, so the result has the same axis order as the input.

    Assumes ``key`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    conv_input = paddle.unsqueeze(key.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv2(conv_input), axis=2)
    return projected.transpose([2, 0, 1])
def _in_proj_v(self, value):
    """Project ``value`` with the 1x1 convolution ``self.conv3``.

    The axes are permuted with ``[1, 2, 0]`` and a singleton axis is added
    at position 2 before the convolution; both steps are reversed
    afterwards, so the result has the same axis order as the input.

    Assumes ``value`` is [length, batch size, embed dim], as listed for the
    inputs of ``forward`` — TODO confirm.
    """
    conv_input = paddle.unsqueeze(value.transpose([1, 2, 0]), axis=2)
    projected = paddle.squeeze(self.conv3(conv_input), axis=2)
    return projected.transpose([2, 0, 1])
\ No newline at end of file
...@@ -156,6 +156,69 @@ class DistillationCTCLabelDecode(CTCLabelDecode): ...@@ -156,6 +156,69 @@ class DistillationCTCLabelDecode(CTCLabelDecode):
return output return output
class NRTRLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """
def __init__(self,
super(NRTRLabelDecode, self).__init__(character_dict_path,
character_type, use_space_char)
def __call__(self, preds, label=None, *args, **kwargs):
if preds.dtype == paddle.int64:
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
if preds[0][0]==2:
preds_idx = preds[:,1:]
preds_idx = preds
text = self.decode(preds_idx)
if label is None:
return text
label = self.decode(label[:,1:])
if isinstance(preds, paddle.Tensor):
preds = preds.numpy()
preds_idx = preds.argmax(axis=2)
preds_prob = preds.max(axis=2)
text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False)
if label is None:
return text
label = self.decode(label[:,1:])
return text, label
def add_special_char(self, dict_character):
    """Return a new list with the reserved tokens ahead of ``dict_character``.

    The reserved tokens take indices 0-3: ``'blank'``, ``'<unk>'``,
    ``'<s>'`` and ``'</s>'``. The input list is not modified.
    """
    reserved_tokens = ['blank', '<unk>', '<s>', '</s>']
    return reserved_tokens + dict_character
def decode(self, text_index, text_prob=None, is_remove_duplicate=False):
""" convert text-index into text-label. """
result_list = []
batch_size = len(text_index)
for batch_idx in range(batch_size):
char_list = []
conf_list = []
for idx in range(len(text_index[batch_idx])):
if text_index[batch_idx][idx] == 3: # end
if text_prob is not None:
text = ''.join(char_list)
result_list.append((text.lower(), np.mean(conf_list)))
return result_list
class AttnLabelDecode(BaseRecLabelDecode): class AttnLabelDecode(BaseRecLabelDecode):
""" Convert between text-label and text-index """ """ Convert between text-label and text-index """
...@@ -22,7 +22,6 @@ import sys ...@@ -22,7 +22,6 @@ import sys
__dir__ = os.path.dirname(os.path.abspath(__file__)) __dir__ = os.path.dirname(os.path.abspath(__file__))
sys.path.append(__dir__) sys.path.append(__dir__)
sys.path.append(os.path.abspath(os.path.join(__dir__, '..'))) sys.path.append(os.path.abspath(os.path.join(__dir__, '..')))
from ppocr.data import build_dataloader from ppocr.data import build_dataloader
from ppocr.modeling.architectures import build_model from ppocr.modeling.architectures import build_model
from ppocr.postprocess import build_post_process from ppocr.postprocess import build_post_process
...@@ -31,7 +30,6 @@ from ppocr.utils.save_load import init_model ...@@ -31,7 +30,6 @@ from ppocr.utils.save_load import init_model
from ppocr.utils.utility import print_dict from ppocr.utils.utility import print_dict
import tools.program as program import tools.program as program
def main(): def main():
global_config = config['Global'] global_config = config['Global']
# build dataloader # build dataloader
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
想要评论请 注册