提交 f4b62551 编写于 作者: littletomatodonkey

add support for svtr static training (#6328)

上级 1bb03b4d
......@@ -40,11 +40,29 @@ def apply_to_static(model, config, logger):
return model
assert "image_shape" in config[
"Global"], "image_shape must be assigned for static training mode..."
supported_list = ["DB"]
assert config["Architecture"][
"algorithm"] in supported_list, f"algorithms that supports static training must in in {supported_list} but got {config['Architecture']['algorithm']}"
supported_list = ["DB", "SVTR"]
if config["Architecture"]["algorithm"] in ["Distillation"]:
algo = list(config["Architecture"]["Models"].values())[0]["algorithm"]
else:
algo = config["Architecture"]["algorithm"]
assert algo in supported_list, f"algorithms that supports static training must in in {supported_list} but got {algo}"
specs = [
InputSpec(
[None] + config["Global"]["image_shape"], dtype='float32')
]
if algo == "SVTR":
specs.append([
InputSpec(
[None, config["Global"]["max_text_length"]],
dtype='int64'), InputSpec(
[None, config["Global"]["max_text_length"]], dtype='int64'),
InputSpec(
[None], dtype='int64'), InputSpec(
[None], dtype='float64')
])
specs = [InputSpec([None] + config["Global"]["image_shape"])]
model = to_static(model, input_spec=specs)
logger.info("Successfully to apply @to_static with specs: {}".format(specs))
return model
......@@ -83,7 +83,7 @@ class SAREncoder(nn.Layer):
def forward(self, feat, img_metas=None):
if img_metas is not None:
assert len(img_metas[0]) == feat.shape[0]
assert len(img_metas[0]) == paddle.shape(feat)[0]
valid_ratios = None
if img_metas is not None and self.mask:
......@@ -98,9 +98,10 @@ class SAREncoder(nn.Layer):
if valid_ratios is not None:
valid_hf = []
T = holistic_feat.shape[1]
for i in range(len(valid_ratios)):
valid_step = min(T, math.ceil(T * valid_ratios[i])) - 1
T = paddle.shape(holistic_feat)[1]
for i in range(paddle.shape(valid_ratios)[0]):
valid_step = paddle.minimum(
T, paddle.ceil(valid_ratios[i] * T).astype('int32')) - 1
valid_hf.append(holistic_feat[i, valid_step, :])
valid_hf = paddle.stack(valid_hf, axis=0)
else:
......@@ -247,13 +248,14 @@ class ParallelSARDecoder(BaseDecoder):
# bsz * (seq_len + 1) * h * w * attn_size
attn_weight = self.conv1x1_2(attn_weight)
# bsz * (seq_len + 1) * h * w * 1
bsz, T, h, w, c = attn_weight.shape
bsz, T, h, w, c = paddle.shape(attn_weight)
assert c == 1
if valid_ratios is not None:
# cal mask of attention weight
for i in range(len(valid_ratios)):
valid_width = min(w, math.ceil(w * valid_ratios[i]))
for i in range(paddle.shape(valid_ratios)[0]):
valid_width = paddle.minimum(
w, paddle.ceil(valid_ratios[i] * w).astype("int32"))
if valid_width < w:
attn_weight[i, :, :, valid_width:, :] = float('-inf')
......@@ -288,7 +290,7 @@ class ParallelSARDecoder(BaseDecoder):
img_metas: [label, valid_ratio]
'''
if img_metas is not None:
assert len(img_metas[0]) == feat.shape[0]
assert paddle.shape(img_metas[0])[0] == paddle.shape(feat)[0]
valid_ratios = None
if img_metas is not None and self.mask:
......@@ -302,7 +304,6 @@ class ParallelSARDecoder(BaseDecoder):
# bsz * (seq_len + 1) * C
out_dec = self._2d_attention(
in_dec, feat, out_enc, valid_ratios=valid_ratios)
# bsz * (seq_len + 1) * num_classes
return out_dec[:, 1:, :] # bsz * seq_len * num_classes
......@@ -395,7 +396,6 @@ class SARHead(nn.Layer):
if self.training:
label = targets[0] # label
label = paddle.to_tensor(label, dtype='int64')
final_out = self.decoder(
feat, holistic_feat, label, img_metas=targets)
else:
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册