Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleOCR
提交
f688909b
P
PaddleOCR
项目概览
PaddlePaddle
/
PaddleOCR
1 年多 前同步成功
通知
1534
Star
32963
Fork
6643
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
108
列表
看板
标记
里程碑
合并请求
7
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleOCR
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
108
Issue
108
列表
看板
标记
里程碑
合并请求
7
合并请求
7
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
f688909b
编写于
9月 13, 2021
作者:
T
Topdu
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix nrtr export inference model
上级
cc24646a
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
210 addition
and
175 deletion
+210
-175
configs/rec/rec_mtb_nrtr.yml
configs/rec/rec_mtb_nrtr.yml
+3
-3
ppocr/data/imaug/label_ops.py
ppocr/data/imaug/label_ops.py
+7
-2
ppocr/data/imaug/rec_img_aug.py
ppocr/data/imaug/rec_img_aug.py
+24
-2
ppocr/data/simple_dataset.py
ppocr/data/simple_dataset.py
+0
-1
ppocr/modeling/backbones/rec_nrtr_mtb.py
ppocr/modeling/backbones/rec_nrtr_mtb.py
+5
-3
ppocr/modeling/heads/multiheadAttention.py
ppocr/modeling/heads/multiheadAttention.py
+37
-52
ppocr/modeling/heads/rec_nrtr_head.py
ppocr/modeling/heads/rec_nrtr_head.py
+83
-101
ppocr/postprocess/rec_postprocess.py
ppocr/postprocess/rec_postprocess.py
+13
-8
tools/export_model.py
tools/export_model.py
+2
-0
tools/infer/predict_rec.py
tools/infer/predict_rec.py
+36
-3
未找到文件。
configs/rec/rec_mtb_nrtr.yml
浏览文件 @
f688909b
...
...
@@ -46,7 +46,7 @@ Architecture:
name
:
Transformer
d_model
:
512
num_encoder_layers
:
6
beam_size
:
10
# When Beam size is greater than 0, it means to use beam search when evaluation.
beam_size
:
-1
# When Beam size is greater than 0, it means to use beam search when evaluation.
Loss
:
...
...
@@ -65,7 +65,7 @@ Train:
name
:
LMDBDataSet
data_dir
:
./train_data/data_lmdb_release/training/
transforms
:
-
NRTR
DecodeImage
:
# load image
-
DecodeImage
:
# load image
img_mode
:
BGR
channel_first
:
False
-
NRTRLabelEncode
:
# Class handling label
...
...
@@ -85,7 +85,7 @@ Eval:
name
:
LMDBDataSet
data_dir
:
./train_data/data_lmdb_release/evaluation/
transforms
:
-
NRTR
DecodeImage
:
# load image
-
DecodeImage
:
# load image
img_mode
:
BGR
channel_first
:
False
-
NRTRLabelEncode
:
# Class handling label
...
...
ppocr/data/imaug/label_ops.py
浏览文件 @
f688909b
...
...
@@ -174,21 +174,26 @@ class NRTRLabelEncode(BaseRecLabelEncode):
super
(
NRTRLabelEncode
,
self
).
__init__
(
max_text_length
,
character_dict_path
,
character_type
,
use_space_char
)
def
__call__
(
self
,
data
):
text
=
data
[
'label'
]
text
=
self
.
encode
(
text
)
if
text
is
None
:
return
None
if
len
(
text
)
>=
self
.
max_text_len
-
1
:
return
None
data
[
'length'
]
=
np
.
array
(
len
(
text
))
text
.
insert
(
0
,
2
)
text
.
append
(
3
)
text
=
text
+
[
0
]
*
(
self
.
max_text_len
-
len
(
text
))
data
[
'label'
]
=
np
.
array
(
text
)
return
data
def
add_special_char
(
self
,
dict_character
):
dict_character
=
[
'blank'
,
'<unk>'
,
'<s>'
,
'</s>'
]
+
dict_character
dict_character
=
[
'blank'
,
'<unk>'
,
'<s>'
,
'</s>'
]
+
dict_character
return
dict_character
class
CTCLabelEncode
(
BaseRecLabelEncode
):
""" Convert between text-label and text-index """
...
...
@@ -588,7 +593,7 @@ class SARLabelEncode(BaseRecLabelEncode):
data
[
'length'
]
=
np
.
array
(
len
(
text
))
target
=
[
self
.
start_idx
]
+
text
+
[
self
.
end_idx
]
padded_text
=
[
self
.
padding_idx
for
_
in
range
(
self
.
max_text_len
)]
padded_text
[:
len
(
target
)]
=
target
data
[
'label'
]
=
np
.
array
(
padded_text
)
return
data
...
...
ppocr/data/imaug/rec_img_aug.py
浏览文件 @
f688909b
...
...
@@ -44,12 +44,33 @@ class ClsResizeImg(object):
class
NRTRRecResizeImg
(
object
):
def
__init__
(
self
,
image_shape
,
resize_type
,
**
kwargs
):
def
__init__
(
self
,
image_shape
,
resize_type
,
padding
=
False
,
**
kwargs
):
self
.
image_shape
=
image_shape
self
.
resize_type
=
resize_type
self
.
padding
=
padding
def
__call__
(
self
,
data
):
img
=
data
[
'image'
]
img
=
cv2
.
cvtColor
(
img
,
cv2
.
COLOR_BGR2GRAY
)
image_shape
=
self
.
image_shape
if
self
.
padding
:
imgC
,
imgH
,
imgW
=
image_shape
# todo: change to 0 and modified image shape
h
=
img
.
shape
[
0
]
w
=
img
.
shape
[
1
]
ratio
=
w
/
float
(
h
)
if
math
.
ceil
(
imgH
*
ratio
)
>
imgW
:
resized_w
=
imgW
else
:
resized_w
=
int
(
math
.
ceil
(
imgH
*
ratio
))
resized_image
=
cv2
.
resize
(
img
,
(
resized_w
,
imgH
))
norm_img
=
np
.
expand_dims
(
resized_image
,
-
1
)
norm_img
=
norm_img
.
transpose
((
2
,
0
,
1
))
resized_image
=
norm_img
.
astype
(
np
.
float32
)
/
128.
-
1.
padding_im
=
np
.
zeros
((
imgC
,
imgH
,
imgW
),
dtype
=
np
.
float32
)
padding_im
[:,
:,
0
:
resized_w
]
=
resized_image
data
[
'image'
]
=
padding_im
return
data
if
self
.
resize_type
==
'PIL'
:
image_pil
=
Image
.
fromarray
(
np
.
uint8
(
img
))
img
=
image_pil
.
resize
(
self
.
image_shape
,
Image
.
ANTIALIAS
)
...
...
@@ -109,7 +130,8 @@ class SARRecResizeImg(object):
def
__call__
(
self
,
data
):
img
=
data
[
'image'
]
norm_img
,
resize_shape
,
pad_shape
,
valid_ratio
=
resize_norm_img_sar
(
img
,
self
.
image_shape
,
self
.
width_downsample_ratio
)
norm_img
,
resize_shape
,
pad_shape
,
valid_ratio
=
resize_norm_img_sar
(
img
,
self
.
image_shape
,
self
.
width_downsample_ratio
)
data
[
'image'
]
=
norm_img
data
[
'resized_shape'
]
=
resize_shape
data
[
'pad_shape'
]
=
pad_shape
...
...
ppocr/data/simple_dataset.py
浏览文件 @
f688909b
...
...
@@ -15,7 +15,6 @@ import numpy as np
import
os
import
random
from
paddle.io
import
Dataset
from
.imaug
import
transform
,
create_operators
...
...
ppocr/modeling/backbones/rec_nrtr_mtb.py
浏览文件 @
f688909b
...
...
@@ -13,6 +13,7 @@
# limitations under the License.
from
paddle
import
nn
import
paddle
class
MTB
(
nn
.
Layer
):
...
...
@@ -40,7 +41,8 @@ class MTB(nn.Layer):
x
=
self
.
block
(
images
)
if
self
.
cnn_num
==
2
:
# (b, w, h, c)
x
=
x
.
transpose
([
0
,
3
,
2
,
1
])
x_shape
=
x
.
shape
x
=
x
.
reshape
([
x_shape
[
0
],
x_shape
[
1
],
x_shape
[
2
]
*
x_shape
[
3
]])
x
=
paddle
.
transpose
(
x
,
[
0
,
3
,
2
,
1
])
x_shape
=
paddle
.
shape
(
x
)
x
=
paddle
.
reshape
(
x
,
[
x_shape
[
0
],
x_shape
[
1
],
x_shape
[
2
]
*
x_shape
[
3
]])
return
x
ppocr/modeling/heads/multiheadAttention.py
浏览文件 @
f688909b
...
...
@@ -71,8 +71,6 @@ class MultiheadAttention(nn.Layer):
value
,
key_padding_mask
=
None
,
incremental_state
=
None
,
need_weights
=
True
,
static_kv
=
False
,
attn_mask
=
None
):
"""
Inputs of forward function
...
...
@@ -88,46 +86,42 @@ class MultiheadAttention(nn.Layer):
attn_output: [target length, batch size, embed dim]
attn_output_weights: [batch size, target length, sequence length]
"""
tgt_len
,
bsz
,
embed_dim
=
query
.
shape
assert
embed_dim
==
self
.
embed_dim
assert
list
(
query
.
shape
)
==
[
tgt_len
,
bsz
,
embed_dim
]
assert
key
.
shape
==
value
.
shape
q_shape
=
paddle
.
shape
(
query
)
src_shape
=
paddle
.
shape
(
key
)
q
=
self
.
_in_proj_q
(
query
)
k
=
self
.
_in_proj_k
(
key
)
v
=
self
.
_in_proj_v
(
value
)
q
*=
self
.
scaling
q
=
q
.
reshape
([
tgt_len
,
bsz
*
self
.
num_heads
,
self
.
head_dim
]).
transpose
(
[
1
,
0
,
2
])
k
=
k
.
reshape
([
-
1
,
bsz
*
self
.
num_heads
,
self
.
head_dim
]).
transpose
(
[
1
,
0
,
2
])
v
=
v
.
reshape
([
-
1
,
bsz
*
self
.
num_heads
,
self
.
head_dim
]).
transpose
(
[
1
,
0
,
2
])
src_len
=
k
.
shape
[
1
]
q
=
paddle
.
transpose
(
paddle
.
reshape
(
q
,
[
q_shape
[
0
],
q_shape
[
1
],
self
.
num_heads
,
self
.
head_dim
]),
[
1
,
2
,
0
,
3
])
k
=
paddle
.
transpose
(
paddle
.
reshape
(
k
,
[
src_shape
[
0
],
q_shape
[
1
],
self
.
num_heads
,
self
.
head_dim
]),
[
1
,
2
,
0
,
3
])
v
=
paddle
.
transpose
(
paddle
.
reshape
(
v
,
[
src_shape
[
0
],
q_shape
[
1
],
self
.
num_heads
,
self
.
head_dim
]),
[
1
,
2
,
0
,
3
])
if
key_padding_mask
is
not
None
:
assert
key_padding_mask
.
shape
[
0
]
==
bsz
assert
key_padding_mask
.
shape
[
1
]
==
src_len
attn_output_weights
=
paddle
.
bmm
(
q
,
k
.
transpose
([
0
,
2
,
1
]))
assert
list
(
attn_output_weights
.
shape
)
==
[
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
]
assert
key_padding_mask
.
shape
[
0
]
==
q_shape
[
1
]
assert
key_padding_mask
.
shape
[
1
]
==
src_shape
[
0
]
attn_output_weights
=
paddle
.
matmul
(
q
,
paddle
.
transpose
(
k
,
[
0
,
1
,
3
,
2
]))
if
attn_mask
is
not
None
:
attn_mask
=
attn_mask
.
unsqueeze
(
0
)
attn_mask
=
paddle
.
unsqueeze
(
paddle
.
unsqueeze
(
attn_mask
,
0
),
0
)
attn_output_weights
+=
attn_mask
if
key_padding_mask
is
not
None
:
attn_output_weights
=
attn_output_weights
.
reshape
(
[
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
])
key
=
key_padding_mask
.
unsqueeze
(
1
).
unsqueeze
(
2
).
astype
(
'float32'
)
y
=
paddle
.
full
(
shape
=
key
.
shape
,
dtype
=
'float32'
,
fill_value
=
'-inf'
)
attn_output_weights
=
paddle
.
reshape
(
attn_output_weights
,
[
q_shape
[
1
],
self
.
num_heads
,
q_shape
[
0
],
src_shape
[
0
]])
key
=
paddle
.
unsqueeze
(
paddle
.
unsqueeze
(
key_padding_mask
,
1
),
2
)
key
=
paddle
.
cast
(
key
,
'float32'
)
y
=
paddle
.
full
(
shape
=
paddle
.
shape
(
key
),
dtype
=
'float32'
,
fill_value
=
'-inf'
)
y
=
paddle
.
where
(
key
==
0.
,
key
,
y
)
attn_output_weights
+=
y
attn_output_weights
=
attn_output_weights
.
reshape
(
[
bsz
*
self
.
num_heads
,
tgt_len
,
src_len
])
attn_output_weights
=
F
.
softmax
(
attn_output_weights
.
astype
(
'float32'
),
axis
=-
1
,
...
...
@@ -136,43 +130,34 @@ class MultiheadAttention(nn.Layer):
attn_output_weights
=
F
.
dropout
(
attn_output_weights
,
p
=
self
.
dropout
,
training
=
self
.
training
)
attn_output
=
paddle
.
bmm
(
attn_output_weights
,
v
)
assert
list
(
attn_output
.
shape
)
==
[
bsz
*
self
.
num_heads
,
tgt_len
,
self
.
head_dim
]
attn_output
=
attn_output
.
transpose
([
1
,
0
,
2
]).
reshape
(
[
tgt_len
,
bsz
,
embed_dim
])
attn_output
=
paddle
.
matmul
(
attn_output_weights
,
v
)
attn_output
=
paddle
.
reshape
(
paddle
.
transpose
(
attn_output
,
[
2
,
0
,
1
,
3
]),
[
q_shape
[
0
],
q_shape
[
1
],
self
.
embed_dim
])
attn_output
=
self
.
out_proj
(
attn_output
)
if
need_weights
:
# average attention weights over heads
attn_output_weights
=
attn_output_weights
.
reshape
(
[
bsz
,
self
.
num_heads
,
tgt_len
,
src_len
])
attn_output_weights
=
attn_output_weights
.
sum
(
axis
=
1
)
/
self
.
num_heads
else
:
attn_output_weights
=
None
return
attn_output
,
attn_output_weights
return
attn_output
def
_in_proj_q
(
self
,
query
):
query
=
query
.
transpose
(
[
1
,
2
,
0
])
query
=
paddle
.
transpose
(
query
,
[
1
,
2
,
0
])
query
=
paddle
.
unsqueeze
(
query
,
axis
=
2
)
res
=
self
.
conv1
(
query
)
res
=
paddle
.
squeeze
(
res
,
axis
=
2
)
res
=
res
.
transpose
(
[
2
,
0
,
1
])
res
=
paddle
.
transpose
(
res
,
[
2
,
0
,
1
])
return
res
def
_in_proj_k
(
self
,
key
):
key
=
key
.
transpose
(
[
1
,
2
,
0
])
key
=
paddle
.
transpose
(
key
,
[
1
,
2
,
0
])
key
=
paddle
.
unsqueeze
(
key
,
axis
=
2
)
res
=
self
.
conv2
(
key
)
res
=
paddle
.
squeeze
(
res
,
axis
=
2
)
res
=
res
.
transpose
(
[
2
,
0
,
1
])
res
=
paddle
.
transpose
(
res
,
[
2
,
0
,
1
])
return
res
def
_in_proj_v
(
self
,
value
):
value
=
value
.
transpose
(
[
1
,
2
,
0
])
#(1, 2, 0)
value
=
paddle
.
transpose
(
value
,
[
1
,
2
,
0
])
#(1, 2, 0)
value
=
paddle
.
unsqueeze
(
value
,
axis
=
2
)
res
=
self
.
conv3
(
value
)
res
=
paddle
.
squeeze
(
res
,
axis
=
2
)
res
=
res
.
transpose
(
[
2
,
0
,
1
])
res
=
paddle
.
transpose
(
res
,
[
2
,
0
,
1
])
return
res
ppocr/modeling/heads/rec_nrtr_head.py
浏览文件 @
f688909b
...
...
@@ -61,12 +61,12 @@ class Transformer(nn.Layer):
custom_decoder
=
None
,
in_channels
=
0
,
out_channels
=
0
,
dst_vocab_size
=
99
,
scale_embedding
=
True
):
super
(
Transformer
,
self
).
__init__
()
self
.
out_channels
=
out_channels
+
1
self
.
embedding
=
Embeddings
(
d_model
=
d_model
,
vocab
=
dst_vocab_size
,
vocab
=
self
.
out_channels
,
padding_idx
=
0
,
scale_embedding
=
scale_embedding
)
self
.
positional_encoding
=
PositionalEncoding
(
...
...
@@ -96,9 +96,10 @@ class Transformer(nn.Layer):
self
.
beam_size
=
beam_size
self
.
d_model
=
d_model
self
.
nhead
=
nhead
self
.
tgt_word_prj
=
nn
.
Linear
(
d_model
,
dst_vocab_size
,
bias_attr
=
False
)
self
.
tgt_word_prj
=
nn
.
Linear
(
d_model
,
self
.
out_channels
,
bias_attr
=
False
)
w0
=
np
.
random
.
normal
(
0.0
,
d_model
**-
0.5
,
(
d_model
,
dst_vocab_size
)).
astype
(
np
.
float32
)
(
d_model
,
self
.
out_channels
)).
astype
(
np
.
float32
)
self
.
tgt_word_prj
.
weight
.
set_value
(
w0
)
self
.
apply
(
self
.
_init_weights
)
...
...
@@ -156,46 +157,41 @@ class Transformer(nn.Layer):
return
self
.
forward_test
(
src
)
def
forward_test
(
self
,
src
):
bs
=
src
.
shape
[
0
]
bs
=
paddle
.
shape
(
src
)
[
0
]
if
self
.
encoder
is
not
None
:
src
=
self
.
positional_encoding
(
src
.
transpose
(
[
1
,
0
,
2
]))
src
=
self
.
positional_encoding
(
paddle
.
transpose
(
src
,
[
1
,
0
,
2
]))
memory
=
self
.
encoder
(
src
)
else
:
memory
=
src
.
squeeze
(
2
).
transpose
(
[
2
,
0
,
1
])
memory
=
paddle
.
transpose
(
paddle
.
squeeze
(
src
,
2
),
[
2
,
0
,
1
])
dec_seq
=
paddle
.
full
((
bs
,
1
),
2
,
dtype
=
paddle
.
int64
)
dec_prob
=
paddle
.
full
((
bs
,
1
),
1.
,
dtype
=
paddle
.
float32
)
for
len_dec_seq
in
range
(
1
,
25
):
src_enc
=
memory
.
clone
()
tgt_key_padding_mask
=
self
.
generate_padding_mask
(
dec_seq
)
dec_seq_embed
=
self
.
embedding
(
dec_seq
).
transpose
([
1
,
0
,
2
])
dec_seq_embed
=
paddle
.
transpose
(
self
.
embedding
(
dec_seq
),
[
1
,
0
,
2
])
dec_seq_embed
=
self
.
positional_encoding
(
dec_seq_embed
)
tgt_mask
=
self
.
generate_square_subsequent_mask
(
dec_seq_embed
.
shape
[
0
])
tgt_mask
=
self
.
generate_square_subsequent_mask
(
paddle
.
shape
(
dec_seq_embed
)[
0
])
output
=
self
.
decoder
(
dec_seq_embed
,
src_enc
,
memory
,
tgt_mask
=
tgt_mask
,
memory_mask
=
None
,
tgt_key_padding_mask
=
tgt_key_padding_mask
,
tgt_key_padding_mask
=
None
,
memory_key_padding_mask
=
None
)
dec_output
=
output
.
transpose
([
1
,
0
,
2
])
dec_output
=
dec_output
[:,
-
1
,
:]
# Pick the last step: (bh * bm) * d_h
word_prob
=
F
.
log_softmax
(
self
.
tgt_word_prj
(
dec_output
),
axis
=
1
)
word_prob
=
word_prob
.
reshape
([
1
,
bs
,
-
1
])
preds_idx
=
word_prob
.
argmax
(
axis
=
2
)
dec_output
=
paddle
.
transpose
(
output
,
[
1
,
0
,
2
])
dec_output
=
dec_output
[:,
-
1
,
:]
word_prob
=
F
.
softmax
(
self
.
tgt_word_prj
(
dec_output
),
axis
=
1
)
preds_idx
=
paddle
.
argmax
(
word_prob
,
axis
=
1
)
if
paddle
.
equal_all
(
preds_idx
[
-
1
]
,
preds_idx
,
paddle
.
full
(
p
reds_idx
[
-
1
].
shape
,
3
,
dtype
=
'int64'
)):
p
addle
.
shape
(
preds_idx
)
,
3
,
dtype
=
'int64'
)):
break
preds_prob
=
word_prob
.
max
(
axis
=
2
)
preds_prob
=
paddle
.
max
(
word_prob
,
axis
=
1
)
dec_seq
=
paddle
.
concat
(
[
dec_seq
,
preds_idx
.
reshape
([
-
1
,
1
])],
axis
=
1
)
return
dec_seq
[
dec_seq
,
paddle
.
reshape
(
preds_idx
,
[
-
1
,
1
])],
axis
=
1
)
dec_prob
=
paddle
.
concat
(
[
dec_prob
,
paddle
.
reshape
(
preds_prob
,
[
-
1
,
1
])],
axis
=
1
)
return
[
dec_seq
,
dec_prob
]
def
forward_beam
(
self
,
images
):
''' Translation work in one batch '''
...
...
@@ -211,14 +207,15 @@ class Transformer(nn.Layer):
n_prev_active_inst
,
n_bm
):
''' Collect tensor parts associated to active instances. '''
_
,
*
d_hs
=
beamed_tensor
.
shape
beamed_tensor_shape
=
paddle
.
shape
(
beamed_tensor
)
n_curr_active_inst
=
len
(
curr_active_inst_idx
)
new_shape
=
(
n_curr_active_inst
*
n_bm
,
*
d_hs
)
new_shape
=
(
n_curr_active_inst
*
n_bm
,
beamed_tensor_shape
[
1
],
beamed_tensor_shape
[
2
])
beamed_tensor
=
beamed_tensor
.
reshape
([
n_prev_active_inst
,
-
1
])
beamed_tensor
=
beamed_tensor
.
index_select
(
paddle
.
to_tensor
(
curr_active_inst_idx
)
,
axis
=
0
)
beamed_tensor
=
beamed_tensor
.
reshape
(
[
*
new_shape
]
)
curr_active_inst_idx
,
axis
=
0
)
beamed_tensor
=
beamed_tensor
.
reshape
(
new_shape
)
return
beamed_tensor
...
...
@@ -249,44 +246,26 @@ class Transformer(nn.Layer):
b
.
get_current_state
()
for
b
in
inst_dec_beams
if
not
b
.
done
]
dec_partial_seq
=
paddle
.
stack
(
dec_partial_seq
)
dec_partial_seq
=
dec_partial_seq
.
reshape
([
-
1
,
len_dec_seq
])
return
dec_partial_seq
def
prepare_beam_memory_key_padding_mask
(
inst_dec_beams
,
memory_key_padding_mask
,
n_bm
):
keep
=
[]
for
idx
in
(
memory_key_padding_mask
):
if
not
inst_dec_beams
[
idx
].
done
:
keep
.
append
(
idx
)
memory_key_padding_mask
=
memory_key_padding_mask
[
paddle
.
to_tensor
(
keep
)]
len_s
=
memory_key_padding_mask
.
shape
[
-
1
]
n_inst
=
memory_key_padding_mask
.
shape
[
0
]
memory_key_padding_mask
=
paddle
.
concat
(
[
memory_key_padding_mask
for
i
in
range
(
n_bm
)],
axis
=
1
)
memory_key_padding_mask
=
memory_key_padding_mask
.
reshape
(
[
n_inst
*
n_bm
,
len_s
])
#repeat(1, n_bm)
return
memory_key_padding_mask
def
predict_word
(
dec_seq
,
enc_output
,
n_active_inst
,
n_bm
,
memory_key_padding_mask
):
tgt_key_padding_mask
=
self
.
generate_padding_mask
(
dec_seq
)
dec_seq
=
self
.
embedding
(
dec_seq
).
transpose
([
1
,
0
,
2
])
dec_seq
=
paddle
.
transpose
(
self
.
embedding
(
dec_seq
),
[
1
,
0
,
2
])
dec_seq
=
self
.
positional_encoding
(
dec_seq
)
tgt_mask
=
self
.
generate_square_subsequent_mask
(
dec_seq
.
shape
[
0
])
tgt_mask
=
self
.
generate_square_subsequent_mask
(
paddle
.
shape
(
dec_seq
)[
0
])
dec_output
=
self
.
decoder
(
dec_seq
,
enc_output
,
tgt_mask
=
tgt_mask
,
tgt_key_padding_mask
=
tgt_key_padding_mask
,
memory_key_padding_mask
=
memory_key_padding_mask
,
).
transpose
(
[
1
,
0
,
2
])
tgt_key_padding_mask
=
None
,
memory_key_padding_mask
=
memory_key_padding_mask
,
)
dec_output
=
paddle
.
transpose
(
dec_output
,
[
1
,
0
,
2
])
dec_output
=
dec_output
[:,
-
1
,
:]
# Pick the last step: (bh * bm) * d_h
word_prob
=
F
.
log_
softmax
(
self
.
tgt_word_prj
(
dec_output
),
axis
=
1
)
word_prob
=
word_prob
.
reshape
(
[
n_active_inst
,
n_bm
,
-
1
])
word_prob
=
F
.
softmax
(
self
.
tgt_word_prj
(
dec_output
),
axis
=
1
)
word_prob
=
paddle
.
reshape
(
word_prob
,
[
n_active_inst
,
n_bm
,
-
1
])
return
word_prob
def
collect_active_inst_idx_list
(
inst_beams
,
word_prob
,
...
...
@@ -302,9 +281,8 @@ class Transformer(nn.Layer):
n_active_inst
=
len
(
inst_idx_to_position_map
)
dec_seq
=
prepare_beam_dec_seq
(
inst_dec_beams
,
len_dec_seq
)
memory_key_padding_mask
=
None
word_prob
=
predict_word
(
dec_seq
,
enc_output
,
n_active_inst
,
n_bm
,
memory_key_padding_mask
)
None
)
# Update the beam with predicted word prob information and collect incomplete instances
active_inst_idx_list
=
collect_active_inst_idx_list
(
inst_dec_beams
,
word_prob
,
inst_idx_to_position_map
)
...
...
@@ -324,27 +302,21 @@ class Transformer(nn.Layer):
with
paddle
.
no_grad
():
#-- Encode
if
self
.
encoder
is
not
None
:
src
=
self
.
positional_encoding
(
images
.
transpose
([
1
,
0
,
2
]))
src_enc
=
self
.
encoder
(
src
)
.
transpose
([
1
,
0
,
2
])
src_enc
=
self
.
encoder
(
src
)
else
:
src_enc
=
images
.
squeeze
(
2
).
transpose
([
0
,
2
,
1
])
#-- Repeat data for beam search
n_bm
=
self
.
beam_size
n_inst
,
len_s
,
d_h
=
src_enc
.
shape
src_enc
=
paddle
.
concat
([
src_enc
for
i
in
range
(
n_bm
)],
axis
=
1
)
src_enc
=
src_enc
.
reshape
([
n_inst
*
n_bm
,
len_s
,
d_h
]).
transpose
(
[
1
,
0
,
2
])
#-- Prepare beams
inst_dec_beams
=
[
Beam
(
n_bm
)
for
_
in
range
(
n_inst
)]
#-- Bookkeeping for active or not
active_inst_idx_list
=
list
(
range
(
n_inst
))
src_shape
=
paddle
.
shape
(
src_enc
)
inst_dec_beams
=
[
Beam
(
n_bm
)
for
_
in
range
(
1
)]
active_inst_idx_list
=
list
(
range
(
1
))
# Repeat data for beam search
src_enc
=
paddle
.
tile
(
src_enc
,
[
1
,
n_bm
,
1
])
inst_idx_to_position_map
=
get_inst_idx_to_tensor_position_map
(
active_inst_idx_list
)
#
--
Decode
# Decode
for
len_dec_seq
in
range
(
1
,
25
):
src_enc_copy
=
src_enc
.
clone
()
active_inst_idx_list
=
beam_decode_step
(
...
...
@@ -358,10 +330,19 @@ class Transformer(nn.Layer):
batch_hyp
,
batch_scores
=
collect_hypothesis_and_scores
(
inst_dec_beams
,
1
)
result_hyp
=
[]
for
bs_hyp
in
batch_hyp
:
bs_hyp_pad
=
bs_hyp
[
0
]
+
[
3
]
*
(
25
-
len
(
bs_hyp
[
0
]))
hyp_scores
=
[]
for
bs_hyp
,
score
in
zip
(
batch_hyp
,
batch_scores
):
l
=
len
(
bs_hyp
[
0
])
bs_hyp_pad
=
bs_hyp
[
0
]
+
[
3
]
*
(
25
-
l
)
result_hyp
.
append
(
bs_hyp_pad
)
return
paddle
.
to_tensor
(
np
.
array
(
result_hyp
),
dtype
=
paddle
.
int64
)
score
=
float
(
score
)
/
l
hyp_score
=
[
score
for
_
in
range
(
25
)]
hyp_scores
.
append
(
hyp_score
)
return
[
paddle
.
to_tensor
(
np
.
array
(
result_hyp
),
dtype
=
paddle
.
int64
),
paddle
.
to_tensor
(
hyp_scores
)
]
def
generate_square_subsequent_mask
(
self
,
sz
):
"""Generate a square mask for the sequence. The masked positions are filled with float('-inf').
...
...
@@ -376,7 +357,7 @@ class Transformer(nn.Layer):
return
mask
def
generate_padding_mask
(
self
,
x
):
padding_mask
=
x
.
equal
(
paddle
.
to_tensor
(
0
,
dtype
=
x
.
dtype
))
padding_mask
=
paddle
.
equal
(
x
,
paddle
.
to_tensor
(
0
,
dtype
=
x
.
dtype
))
return
padding_mask
def
_reset_parameters
(
self
):
...
...
@@ -514,17 +495,17 @@ class TransformerEncoderLayer(nn.Layer):
src
,
src
,
attn_mask
=
src_mask
,
key_padding_mask
=
src_key_padding_mask
)
[
0
]
key_padding_mask
=
src_key_padding_mask
)
src
=
src
+
self
.
dropout1
(
src2
)
src
=
self
.
norm1
(
src
)
src
=
src
.
transpose
(
[
1
,
2
,
0
])
src
=
paddle
.
transpose
(
src
,
[
1
,
2
,
0
])
src
=
paddle
.
unsqueeze
(
src
,
2
)
src2
=
self
.
conv2
(
F
.
relu
(
self
.
conv1
(
src
)))
src2
=
paddle
.
squeeze
(
src2
,
2
)
src2
=
src2
.
transpose
(
[
2
,
0
,
1
])
src2
=
paddle
.
transpose
(
src2
,
[
2
,
0
,
1
])
src
=
paddle
.
squeeze
(
src
,
2
)
src
=
src
.
transpose
(
[
2
,
0
,
1
])
src
=
paddle
.
transpose
(
src
,
[
2
,
0
,
1
])
src
=
src
+
self
.
dropout2
(
src2
)
src
=
self
.
norm2
(
src
)
...
...
@@ -598,7 +579,7 @@ class TransformerDecoderLayer(nn.Layer):
tgt
,
tgt
,
attn_mask
=
tgt_mask
,
key_padding_mask
=
tgt_key_padding_mask
)
[
0
]
key_padding_mask
=
tgt_key_padding_mask
)
tgt
=
tgt
+
self
.
dropout1
(
tgt2
)
tgt
=
self
.
norm1
(
tgt
)
tgt2
=
self
.
multihead_attn
(
...
...
@@ -606,18 +587,18 @@ class TransformerDecoderLayer(nn.Layer):
memory
,
memory
,
attn_mask
=
memory_mask
,
key_padding_mask
=
memory_key_padding_mask
)
[
0
]
key_padding_mask
=
memory_key_padding_mask
)
tgt
=
tgt
+
self
.
dropout2
(
tgt2
)
tgt
=
self
.
norm2
(
tgt
)
# default
tgt
=
tgt
.
transpose
(
[
1
,
2
,
0
])
tgt
=
paddle
.
transpose
(
tgt
,
[
1
,
2
,
0
])
tgt
=
paddle
.
unsqueeze
(
tgt
,
2
)
tgt2
=
self
.
conv2
(
F
.
relu
(
self
.
conv1
(
tgt
)))
tgt2
=
paddle
.
squeeze
(
tgt2
,
2
)
tgt2
=
tgt2
.
transpose
(
[
2
,
0
,
1
])
tgt2
=
paddle
.
transpose
(
tgt2
,
[
2
,
0
,
1
])
tgt
=
paddle
.
squeeze
(
tgt
,
2
)
tgt
=
tgt
.
transpose
(
[
2
,
0
,
1
])
tgt
=
paddle
.
transpose
(
tgt
,
[
2
,
0
,
1
])
tgt
=
tgt
+
self
.
dropout3
(
tgt2
)
tgt
=
self
.
norm3
(
tgt
)
...
...
@@ -656,8 +637,8 @@ class PositionalEncoding(nn.Layer):
(
-
math
.
log
(
10000.0
)
/
dim
))
pe
[:,
0
::
2
]
=
paddle
.
sin
(
position
*
div_term
)
pe
[:,
1
::
2
]
=
paddle
.
cos
(
position
*
div_term
)
pe
=
p
e
.
unsqueeze
(
0
)
pe
=
p
e
.
transpose
(
[
1
,
0
,
2
])
pe
=
p
addle
.
unsqueeze
(
pe
,
0
)
pe
=
p
addle
.
transpose
(
pe
,
[
1
,
0
,
2
])
self
.
register_buffer
(
'pe'
,
pe
)
def
forward
(
self
,
x
):
...
...
@@ -670,7 +651,7 @@ class PositionalEncoding(nn.Layer):
Examples:
>>> output = pos_encoder(x)
"""
x
=
x
+
self
.
pe
[:
x
.
shape
[
0
],
:]
x
=
x
+
self
.
pe
[:
paddle
.
shape
(
x
)
[
0
],
:]
return
self
.
dropout
(
x
)
...
...
@@ -702,7 +683,7 @@ class PositionalEncoding_2d(nn.Layer):
(
-
math
.
log
(
10000.0
)
/
dim
))
pe
[:,
0
::
2
]
=
paddle
.
sin
(
position
*
div_term
)
pe
[:,
1
::
2
]
=
paddle
.
cos
(
position
*
div_term
)
pe
=
p
e
.
unsqueeze
(
0
).
transpose
(
[
1
,
0
,
2
])
pe
=
p
addle
.
transpose
(
paddle
.
unsqueeze
(
pe
,
0
),
[
1
,
0
,
2
])
self
.
register_buffer
(
'pe'
,
pe
)
self
.
avg_pool_1
=
nn
.
AdaptiveAvgPool2D
((
1
,
1
))
...
...
@@ -722,22 +703,23 @@ class PositionalEncoding_2d(nn.Layer):
Examples:
>>> output = pos_encoder(x)
"""
w_pe
=
self
.
pe
[:
x
.
shape
[
-
1
],
:]
w_pe
=
self
.
pe
[:
paddle
.
shape
(
x
)
[
-
1
],
:]
w1
=
self
.
linear1
(
self
.
avg_pool_1
(
x
).
squeeze
()).
unsqueeze
(
0
)
w_pe
=
w_pe
*
w1
w_pe
=
w_pe
.
transpose
(
[
1
,
2
,
0
])
w_pe
=
w_pe
.
unsqueeze
(
2
)
w_pe
=
paddle
.
transpose
(
w_pe
,
[
1
,
2
,
0
])
w_pe
=
paddle
.
unsqueeze
(
w_pe
,
2
)
h_pe
=
self
.
pe
[:
x
.
shape
[
-
2
],
:]
h_pe
=
self
.
pe
[:
paddle
.
shape
(
x
)
.
shape
[
-
2
],
:]
w2
=
self
.
linear2
(
self
.
avg_pool_2
(
x
).
squeeze
()).
unsqueeze
(
0
)
h_pe
=
h_pe
*
w2
h_pe
=
h_pe
.
transpose
(
[
1
,
2
,
0
])
h_pe
=
h_pe
.
unsqueeze
(
3
)
h_pe
=
paddle
.
transpose
(
h_pe
,
[
1
,
2
,
0
])
h_pe
=
paddle
.
unsqueeze
(
h_pe
,
3
)
x
=
x
+
w_pe
+
h_pe
x
=
x
.
reshape
(
[
x
.
shape
[
0
],
x
.
shape
[
1
],
x
.
shape
[
2
]
*
x
.
shape
[
3
]]).
transpose
(
[
2
,
0
,
1
])
x
=
paddle
.
transpose
(
paddle
.
reshape
(
x
,
[
x
.
shape
[
0
],
x
.
shape
[
1
],
x
.
shape
[
2
]
*
x
.
shape
[
3
]]),
[
2
,
0
,
1
])
return
self
.
dropout
(
x
)
...
...
@@ -817,7 +799,7 @@ class Beam():
def
sort_scores
(
self
):
"Sort the scores."
return
self
.
scores
,
paddle
.
to_tensor
(
[
i
for
i
in
range
(
self
.
scores
.
shape
[
0
]
)],
dtype
=
'int32'
)
[
i
for
i
in
range
(
int
(
self
.
scores
.
shape
[
0
])
)],
dtype
=
'int32'
)
def
get_the_best_score_and_idx
(
self
):
"Get the score of the best in the beam."
...
...
ppocr/postprocess/rec_postprocess.py
浏览文件 @
f688909b
...
...
@@ -169,15 +169,20 @@ class NRTRLabelDecode(BaseRecLabelDecode):
character_type
,
use_space_char
)
def
__call__
(
self
,
preds
,
label
=
None
,
*
args
,
**
kwargs
):
if
preds
.
dtype
==
paddle
.
int64
:
if
isinstance
(
preds
,
paddle
.
Tensor
):
preds
=
preds
.
numpy
()
if
preds
[
0
][
0
]
==
2
:
preds_idx
=
preds
[:,
1
:]
else
:
preds_idx
=
preds
text
=
self
.
decode
(
preds_idx
)
if
len
(
preds
)
==
2
:
preds_id
=
preds
[
0
]
preds_prob
=
preds
[
1
]
if
isinstance
(
preds_id
,
paddle
.
Tensor
):
preds_id
=
preds_id
.
numpy
()
if
isinstance
(
preds_prob
,
paddle
.
Tensor
):
preds_prob
=
preds_prob
.
numpy
()
if
preds_id
[
0
][
0
]
==
2
:
preds_idx
=
preds_id
[:,
1
:]
preds_prob
=
preds_prob
[:,
1
:]
else
:
preds_idx
=
preds_id
text
=
self
.
decode
(
preds_idx
,
preds_prob
,
is_remove_duplicate
=
False
)
if
label
is
None
:
return
text
label
=
self
.
decode
(
label
[:,
1
:])
...
...
tools/export_model.py
浏览文件 @
f688909b
...
...
@@ -60,6 +60,8 @@ def export_single_model(model, arch_config, save_path, logger):
"When there is tps in the network, variable length input is not supported, and the input size needs to be the same as during training"
)
infer_shape
[
-
1
]
=
100
if
arch_config
[
"algorithm"
]
==
"NRTR"
:
infer_shape
=
[
1
,
32
,
100
]
elif
arch_config
[
"model_type"
]
==
"table"
:
infer_shape
=
[
3
,
488
,
488
]
model
=
to_static
(
...
...
tools/infer/predict_rec.py
浏览文件 @
f688909b
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
import
os
import
sys
from
PIL
import
Image
__dir__
=
os
.
path
.
dirname
(
os
.
path
.
abspath
(
__file__
))
sys
.
path
.
append
(
__dir__
)
sys
.
path
.
append
(
os
.
path
.
abspath
(
os
.
path
.
join
(
__dir__
,
'../..'
)))
...
...
@@ -61,6 +61,13 @@ class TextRecognizer(object):
"character_dict_path"
:
args
.
rec_char_dict_path
,
"use_space_char"
:
args
.
use_space_char
}
elif
self
.
rec_algorithm
==
'NRTR'
:
postprocess_params
=
{
'name'
:
'NRTRLabelDecode'
,
"character_type"
:
args
.
rec_char_type
,
"character_dict_path"
:
args
.
rec_char_dict_path
,
"use_space_char"
:
args
.
use_space_char
}
self
.
postprocess_op
=
build_post_process
(
postprocess_params
)
self
.
predictor
,
self
.
input_tensor
,
self
.
output_tensors
,
self
.
config
=
\
utility
.
create_predictor
(
args
,
'rec'
,
logger
)
...
...
@@ -87,6 +94,30 @@ class TextRecognizer(object):
def
resize_norm_img
(
self
,
img
,
max_wh_ratio
):
imgC
,
imgH
,
imgW
=
self
.
rec_image_shape
if
imgC
==
1
:
img
=
cv2
.
cvtColor
(
img
,
cv2
.
COLOR_BGR2GRAY
)
# h = img.shape[0]
# w = img.shape[1]
# ratio = w / float(h)
# if math.ceil(imgH * ratio) > imgW:
# resized_w = imgW
# else:
# resized_w = int(math.ceil(imgH * ratio))
# resized_image = cv2.resize(img, (resized_w, imgH))
# #norm_img = np.expand_dims(resized_image, -1)
# #norm_img = norm_img.transpose((2, 0, 1))
# resized_image = resized_image.astype(np.float32) / 128. - 1.
# padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
# padding_im[0, :, 0:resized_w] = resized_image
# return padding_im
image_pil
=
Image
.
fromarray
(
np
.
uint8
(
img
))
img
=
image_pil
.
resize
([
100
,
32
],
Image
.
ANTIALIAS
)
img
=
np
.
array
(
img
)
norm_img
=
np
.
expand_dims
(
img
,
-
1
)
norm_img
=
norm_img
.
transpose
((
2
,
0
,
1
))
return
norm_img
.
astype
(
np
.
float32
)
/
128.
-
1.
assert
imgC
==
img
.
shape
[
2
]
max_wh_ratio
=
max
(
max_wh_ratio
,
imgW
/
imgH
)
imgW
=
int
((
32
*
max_wh_ratio
))
...
...
@@ -252,14 +283,16 @@ class TextRecognizer(object):
else
:
self
.
input_tensor
.
copy_from_cpu
(
norm_img_batch
)
self
.
predictor
.
run
()
outputs
=
[]
for
output_tensor
in
self
.
output_tensors
:
output
=
output_tensor
.
copy_to_cpu
()
outputs
.
append
(
output
)
if
self
.
benchmark
:
self
.
autolog
.
times
.
stamp
()
preds
=
outputs
[
0
]
if
len
(
outputs
)
!=
1
:
preds
=
outputs
else
:
preds
=
outputs
[
0
]
rec_result
=
self
.
postprocess_op
(
preds
)
for
rno
in
range
(
len
(
rec_result
)):
rec_res
[
indices
[
beg_img_no
+
rno
]]
=
rec_result
[
rno
]
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录