diff --git a/configs/rec/rec_resnet_stn_bilstm_att.yml b/configs/rec/rec_resnet_stn_bilstm_att.yml
index 0bb90b35264b424c58a45685f5a2a066843298a6..5a37ba7541ef38be6ccde61bf64baba9dbd7a71b 100644
--- a/configs/rec/rec_resnet_stn_bilstm_att.yml
+++ b/configs/rec/rec_resnet_stn_bilstm_att.yml
@@ -50,7 +50,7 @@ Architecture:
     name: AsterHead # AttentionHead
     sDim: 512
     attDim: 512
-    max_len_labels: 100
+    max_len_labels: 20
 
 Loss:
   name: AsterLoss
diff --git a/doc/doc_ch/algorithm_rec_seed.md b/doc/doc_ch/algorithm_rec_seed.md
index 710e92272dc3169bf373d273534441a15c6be01c..ae0c529a666ac795172e3e260d2738ea461611a9 100644
--- a/doc/doc_ch/algorithm_rec_seed.md
+++ b/doc/doc_ch/algorithm_rec_seed.md
@@ -78,23 +78,34 @@ python3 tools/infer_rec.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Globa
 
 ### 4.1 Python Inference
 
-coming soon
+First, convert the model saved during SEED text recognition training into an inference model ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar)). The conversion can be done with the following command:
+
+```
+python3 tools/export_model.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=seed_infer
+```
+
+For SEED text recognition model inference, run the following command:
+
+```
+python3 tools/infer/predict_rec.py --rec_model_dir=seed_infer --image_dir=doc/imgs_words_en/word_10.png --rec_algorithm="SEED" --rec_char_dict_path=ppocr/utils/EN_symbol_dict.txt --rec_image_shape="3,64,256" --use_space_char=False
+```
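+
+To run inference over several images at once, `--image_dir` can also point to a directory. This is an illustrative addition rather than part of the original patch; it assumes `tools/infer/predict_rec.py` keeps its usual behavior of collecting all image files under the given directory:
+
+```
+python3 tools/infer/predict_rec.py --rec_model_dir=seed_infer --image_dir=doc/imgs_words_en/ --rec_algorithm="SEED" --rec_char_dict_path=ppocr/utils/EN_symbol_dict.txt --rec_image_shape="3,64,256" --use_space_char=False
+```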
+
 
 ### 4.2 C++ Inference
 
-coming soon
+Not supported yet
 
 ### 4.3 Serving Deployment
 
-coming soon
+Not supported yet
 
 ### 4.4 More Inference Deployment
 
-coming soon
+Not supported yet
 
 ## 5. FAQ
diff --git a/doc/doc_en/algorithm_rec_seed_en.md b/doc/doc_en/algorithm_rec_seed_en.md
index f8d7ae6d3f34ab8a4f510c88002b22dbce7a10e8..a71d105e0c1f5b3cbe210d7cc78c2b07725ab978 100644
--- a/doc/doc_en/algorithm_rec_seed_en.md
+++ b/doc/doc_en/algorithm_rec_seed_en.md
@@ -77,7 +77,17 @@ python3 tools/infer_rec.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Globa
 
 ### 4.1 Python Inference
 
-Not support
+First, convert the model saved during SEED text recognition training into an inference model ([model download link](https://paddleocr.bj.bcebos.com/dygraph_v2.1/rec/rec_resnet_stn_bilstm_att.tar)). You can use the following command for the conversion:
+
+```
+python3 tools/export_model.py -c configs/rec/rec_resnet_stn_bilstm_att.yml -o Global.pretrained_model={path/to/weights}/best_accuracy Global.save_inference_dir=seed_infer
+```
+
+For SEED text recognition model inference, run the following command:
+
+```
+python3 tools/infer/predict_rec.py --rec_model_dir=seed_infer --image_dir=doc/imgs_words_en/word_10.png --rec_algorithm="SEED" --rec_char_dict_path=ppocr/utils/EN_symbol_dict.txt --rec_image_shape="3,64,256" --use_space_char=False
+```
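+
+The exported model can also be driven directly through the Paddle Inference Python API. The sketch below is illustrative rather than part of the official tooling: it assumes the export step wrote `inference.pdmodel` and `inference.pdiparams` into `seed_infer/`, and that preprocessing mirrors `--rec_image_shape="3,64,256"` with the usual `(x / 255 - 0.5) / 0.5` normalization:
+
+```python
+import cv2
+import numpy as np
+from paddle.inference import Config, create_predictor
+
+# Assumed file names produced by tools/export_model.py above.
+config = Config("seed_infer/inference.pdmodel", "seed_infer/inference.pdiparams")
+predictor = create_predictor(config)
+
+# Resize to 3x64x256 and normalize; assumed to match the training transforms.
+img = cv2.imread("doc/imgs_words_en/word_10.png")
+img = cv2.resize(img, (256, 64)).astype("float32").transpose((2, 0, 1)) / 255.0
+img = (img - 0.5) / 0.5
+batch = np.expand_dims(img, axis=0)
+
+input_handle = predictor.get_input_handle(predictor.get_input_names()[0])
+input_handle.copy_from_cpu(batch)
+predictor.run()
+
+# Raw beam-search outputs; SEEDLabelDecode turns them into text and scores.
+outputs = [
+    predictor.get_output_handle(name).copy_to_cpu()
+    for name in predictor.get_output_names()
+]
+print([o.shape for o in outputs])
+```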
 
 ### 4.2 C++ Inference
 
diff --git a/ppocr/modeling/heads/rec_aster_head.py b/ppocr/modeling/heads/rec_aster_head.py
index c95e8fd31f84c26cf58f7fbbdaab6c825b10eea8..1febc320ea835db31d11bba2fe5f0b4ef0c1f7f9 100644
--- a/ppocr/modeling/heads/rec_aster_head.py
+++ b/ppocr/modeling/heads/rec_aster_head.py
@@ -62,10 +62,11 @@ class AsterHead(nn.Layer):
         else:
             rec_pred, rec_pred_scores = self.decoder.beam_search(
                 x, self.beam_width, self.eos, embedding_vectors)
+            rec_pred_scores.stop_gradient = True
+            rec_pred.stop_gradient = True
             return_dict['rec_pred'] = rec_pred
             return_dict['rec_pred_scores'] = rec_pred_scores
             return_dict['embedding_vectors'] = embedding_vectors
-
         return return_dict
@@ -114,37 +115,13 @@ class AttentionRecognitionHead(nn.Layer):
                 y_prev = paddle.full(
                     shape=[batch_size], fill_value=self.num_classes)
             else:
+                y_prev = targets[:, i - 1]
             output, state = self.decoder(x, state, y_prev)
             outputs.append(output)
         outputs = paddle.concat([_.unsqueeze(1) for _ in outputs], 1)
         return outputs
 
-    # inference stage.
-    def sample(self, x):
-        x, _, _ = x
-        batch_size = x.size(0)
-        # Decoder
-        state = paddle.zeros([1, batch_size, self.sDim])
-
-        predicted_ids, predicted_scores = [], []
-        for i in range(self.max_len_labels):
-            if i == 0:
-                y_prev = paddle.full(
-                    shape=[batch_size], fill_value=self.num_classes)
-            else:
-                y_prev = predicted
-
-            output, state = self.decoder(x, state, y_prev)
-            output = F.softmax(output, axis=1)
-            score, predicted = output.max(1)
-            predicted_ids.append(predicted.unsqueeze(1))
-            predicted_scores.append(score.unsqueeze(1))
-        predicted_ids = paddle.concat([predicted_ids, 1])
-        predicted_scores = paddle.concat([predicted_scores, 1])
-        # return predicted_ids.squeeze(), predicted_scores.squeeze()
-        return predicted_ids, predicted_scores
-
     def beam_search(self, x, beam_width, eos, embed):
         def _inflate(tensor, times, dim):
             repeat_dims = [1] * tensor.dim()
@@ -153,7 +130,7 @@ class AttentionRecognitionHead(nn.Layer):
             return output
 
         # https://github.com/IBM/pytorch-seq2seq/blob/fede87655ddce6c94b38886089e05321dc9802af/seq2seq/models/TopKDecoder.py
-        batch_size, l, d = x.shape
+        batch_size, l, d = paddle.shape(x)
         x = paddle.tile(
             paddle.transpose(
                 x.unsqueeze(1), perm=[1, 0, 2, 3]), [beam_width, 1, 1, 1])
@@ -166,21 +143,22 @@ class AttentionRecognitionHead(nn.Layer):
         pos_index = paddle.reshape(
             paddle.arange(batch_size) * beam_width, shape=[-1, 1])
 
-        # Initialize the scores
+        # Initialize the scores; only the first beam of each sample starts live
         sequence_scores = paddle.full(
-            shape=[batch_size * beam_width, 1], fill_value=-float('Inf'))
-        index = [i * beam_width for i in range(0, batch_size)]
-        sequence_scores[index] = 0.0
+            shape=[batch_size, beam_width], fill_value=-float('Inf'))
+        sequence_scores[:, 0] = 0.0
+        sequence_scores = paddle.reshape(
+            sequence_scores, shape=[batch_size * beam_width, 1])
 
         # Initialize the input vector
         y_prev = paddle.full(
             shape=[batch_size * beam_width],
             fill_value=self.num_classes)
 
         # Store decisions for backtracking
-        stored_scores = list()
-        stored_predecessors = list()
-        stored_emitted_symbols = list()
+        stored_scores = []
+        stored_predecessors = []
+        stored_emitted_symbols = []
 
         for i in range(self.max_len_labels):
             output, state = self.decoder(inflated_encoder_feats, state, y_prev)
@@ -194,15 +172,16 @@ class AttentionRecognitionHead(nn.Layer):
                 paddle.reshape(sequence_scores, [batch_size, -1]),
                 beam_width,
                 axis=1)
-            # Reshape input = (bk, 1) and sequence_scores = (bk, 1)
             y_prev = paddle.reshape(
-                candidates % self.num_classes, shape=[batch_size * beam_width])
+                candidates % self.num_classes, shape=[batch_size, beam_width])
+            y_prev = paddle.reshape(y_prev, shape=[batch_size * beam_width])
             sequence_scores = paddle.reshape(
                 scores, shape=[batch_size * beam_width, 1])
 
             # Update fields for next timestep
-            pos_index = paddle.expand_as(pos_index, candidates)
+            pos_index = paddle.expand(pos_index, paddle.shape(candidates))
+
             predecessors = paddle.cast(
                 candidates / self.num_classes + pos_index, dtype='int64')
             predecessors = paddle.reshape(
@@ -213,13 +192,13 @@ class AttentionRecognitionHead(nn.Layer):
             # Update sequence scores and erase scores for symbols so that they aren't expanded
             stored_scores.append(sequence_scores.clone())
             y_prev = paddle.reshape(y_prev, shape=[-1, 1])
-            eos_prev = paddle.full_like(y_prev, fill_value=eos)
-            mask = eos_prev == y_prev
-            mask = paddle.nonzero(mask)
-            if mask.dim() > 0:
-                sequence_scores = sequence_scores.numpy()
-                mask = mask.numpy()
-                sequence_scores[mask] = -float('inf')
-            sequence_scores = paddle.to_tensor(sequence_scores)
+
+            # Erase only the scores of finished (EOS) beams so they are not
+            # expanded further (blanking every score would kill live beams).
+            eos_mask = y_prev == eos
+            sequence_scores = paddle.where(
+                eos_mask,
+                paddle.full_like(sequence_scores, -float('inf')),
+                sequence_scores)
 
             # Cache results for backtracking
@@ -228,11 +207,12 @@ class AttentionRecognitionHead(nn.Layer):
             stored_emitted_symbols.append(y_prev)
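 
+        # Review note (illustrative): stored_predecessors[t][i] records which
+        # beam slot at step t - 1 produced beam slot i at step t; the
+        # backtracking pass below walks these pointers from the sorted
+        # last-step beams to rebuild the top-k sequences in reverse order.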
         # Do backtracking to return the optimal values
-        #====== backtrak ======#
+        # ====== backtrack ======#
 
         # Initialize return variables given different types
-        p = list()
-        l = [[self.max_len_labels] * beam_width for _ in range(batch_size)
-             ]  # Placeholder for lengths of top-k sequences
+        p = []
+
+        # Placeholder for lengths of top-k sequences
+        l = paddle.full([batch_size, beam_width], self.max_len_labels)
 
         # the last step output of the beams are not sorted
         # thus they are sorted here
@@ -244,14 +224,18 @@ class AttentionRecognitionHead(nn.Layer):
         # initialize the sequence scores with the sorted last step beam scores
         s = sorted_score.clone()
 
-        batch_eos_found = [0] * batch_size  # the number of EOS found
+        batch_eos_found = paddle.zeros(
+            [batch_size], dtype='int32')  # the number of EOS found
         # in the backward loop below for each batch
 
         t = self.max_len_labels - 1
+
         # initialize the back pointer with the sorted order of the last step beams.
         # add pos_index for indexing variable with b*k as the first dimension.
         t_predecessors = paddle.reshape(
-            sorted_idx + pos_index.expand_as(sorted_idx),
+            sorted_idx + pos_index.expand(paddle.shape(sorted_idx)),
             shape=[batch_size * beam_width])
+
+        tmp_beam_width = beam_width
         while t >= 0:
             # Re-order the variables with the back pointer
             current_symbol = paddle.index_select(
@@ -261,26 +245,32 @@ class AttentionRecognitionHead(nn.Layer):
             eos_indices = stored_emitted_symbols[t] == eos
             eos_indices = paddle.nonzero(eos_indices)
 
+            stored_predecessors_t = stored_predecessors[t]
+            stored_emitted_symbols_t = stored_emitted_symbols[t]
+            stored_scores_t = stored_scores[t]
+            t_plus = t + 1
+
             if eos_indices.dim() > 0:
-                for i in range(eos_indices.shape[0] - 1, -1, -1):
+                for j in range(eos_indices.shape[0] - 1, -1, -1):
                     # Indices of the EOS symbol for both variables
                     # with b*k as the first dimension, and b, k for
                     # the first two dimensions
-                    idx = eos_indices[i]
-                    b_idx = int(idx[0] / beam_width)
+                    idx = eos_indices[j]
+                    b_idx = int(idx[0] / tmp_beam_width)
                     # The indices of the replacing position
                     # according to the replacement strategy noted above
-                    res_k_idx = beam_width - (batch_eos_found[b_idx] %
-                                              beam_width) - 1
+                    res_k_idx = tmp_beam_width - (batch_eos_found[b_idx] %
+                                                  tmp_beam_width) - 1
                     batch_eos_found[b_idx] += 1
-                    res_idx = b_idx * beam_width + res_k_idx
+                    res_idx = b_idx * tmp_beam_width + res_k_idx
 
                     # Replace the old information in return variables
                     # with the new ended sequence information
-                    t_predecessors[res_idx] = stored_predecessors[t][idx[0]]
-                    current_symbol[res_idx] = stored_emitted_symbols[t][idx[0]]
-                    s[b_idx, res_k_idx] = stored_scores[t][idx[0], 0]
-                    l[b_idx][res_k_idx] = t + 1
+
+                    t_predecessors[res_idx] = stored_predecessors_t[idx[0]]
+                    current_symbol[res_idx] = stored_emitted_symbols_t[idx[0]]
+                    s[b_idx, res_k_idx] = stored_scores_t[idx[0], 0]
+                    # Multi-axis indexing so the assignment writes into l
+                    # (chained l[b_idx][res_k_idx] = ... updates a temporary).
+                    l[b_idx, res_k_idx] = t_plus
 
             # record the back tracked results
             p.append(current_symbol)
@@ -289,24 +279,30 @@ class AttentionRecognitionHead(nn.Layer):
         # Sort and re-order again as the added ended sequences may change
         # the order (very unlikely)
         s, re_sorted_idx = s.topk(beam_width)
+
         for b_idx in range(batch_size):
-            l[b_idx] = [
-                l[b_idx][k_idx.item()] for k_idx in re_sorted_idx[b_idx, :]
-            ]
+            tmp_tensor = paddle.full_like(l[b_idx], 0)
+            for j, k_idx in enumerate(re_sorted_idx[b_idx]):
+                # Position j takes the length of the beam that moved there,
+                # mirroring the list comprehension this replaces.
+                tmp_tensor[j] = l[b_idx][k_idx]
+            l[b_idx] = tmp_tensor
 
         re_sorted_idx = paddle.reshape(
-            re_sorted_idx + pos_index.expand_as(re_sorted_idx),
+            re_sorted_idx + pos_index.expand(paddle.shape(re_sorted_idx)),
             [batch_size * beam_width])
 
         # Reverse the sequences and re-order at the same time
         # It is reversed because the backtracking happens in reverse time order
-        p = [
-            paddle.reshape(
-                paddle.index_select(step, re_sorted_idx, 0),
-                shape=[batch_size, beam_width, -1]) for step in reversed(p)
-        ]
-        p = paddle.concat(p, -1)[:, 0, :]
-        return p, paddle.ones_like(p)
+        reversed_p = p[::-1]
+
+        q = []
+        for step in reversed_p:
+            q.append(
+                paddle.reshape(
+                    paddle.index_select(step, re_sorted_idx, 0),
+                    shape=[batch_size, beam_width, -1]))
+
+        q = paddle.concat(q, -1)[:, 0, :]
+        return q, paddle.ones_like(q)
 
 
 class AttentionUnit(nn.Layer):
@@ -385,9 +381,9 @@ class DecoderUnit(nn.Layer):
         yProj = self.tgt_embedding(yPrev)
 
         concat_context = paddle.concat([yProj, context], 1)
-        concat_context = paddle.squeeze(concat_context, 1)
         sPrev = paddle.squeeze(sPrev, 0)
+
         output, state = self.gru(concat_context, sPrev)
         output = paddle.squeeze(output, axis=1)
         output = self.fc(output)
-        return output, state
\ No newline at end of file
+        return output, state
diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py
index 749060a053f1442f4bf5df6c5f4b56205e893be8..99e7d0c90f24373c97247bcd407d30af2b48a430 100644
--- a/ppocr/postprocess/rec_postprocess.py
+++ b/ppocr/postprocess/rec_postprocess.py
@@ -307,6 +307,11 @@ class SEEDLabelDecode(BaseRecLabelDecode):
             label = self.decode(label, is_remove_duplicate=False)
             return text, label
         """
+        # The exported predictor returns a plain list of output tensors; map
+        # it back to the dict layout used in training (index 0 carries the
+        # scores, index 1 the predicted ids).
+        if isinstance(preds, list):
+            preds = {"rec_pred": preds[1], "rec_pred_scores": preds[0]}
         preds_idx = preds["rec_pred"]
         if isinstance(preds_idx, paddle.Tensor):
             preds_idx = preds_idx.numpy()
diff --git a/tools/export_model.py b/tools/export_model.py
index 193988cc1b62a6c4536a8d2ec640e3e5fc81a79c..4b91462a3b40bb6db73d0ad91d6b3cf96673f2a6 100755
--- a/tools/export_model.py
+++ b/tools/export_model.py
@@ -97,7 +97,6 @@ def export_single_model(model,
             paddle.static.InputSpec(
                 shape=[None, 3, 32, 128], dtype="float32"),
         ]
-        # print([None, 3, 32, 128])
         model = to_static(model, input_spec=other_shape)
     elif arch_config["algorithm"] in ["NRTR", "SPIN"]:
         other_shape = [
@@ -115,16 +114,18 @@ def export_single_model(model,
         max_text_length = arch_config["Head"]["max_text_length"]
         other_shape = [
             paddle.static.InputSpec(
-                shape=[None, 3, 48, 160], dtype="float32"),
-
-            [
-                paddle.static.InputSpec(
-                    shape=[None, ],
-                    dtype="float32"),
+                shape=[None, 3, 48, 160], dtype="float32"), [
+                    paddle.static.InputSpec(
+                        shape=[None, ], dtype="float32"),
+                    paddle.static.InputSpec(
+                        shape=[None, max_text_length], dtype="int64")
+            ]
+        ]
+        model = to_static(model, input_spec=other_shape)
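+    # SEED is exported with a fixed 3x64x256 input; keep this in sync with
+    # the --rec_image_shape="3,64,256" flag used by tools/infer/predict_rec.py.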
+    elif arch_config["algorithm"] == "SEED":
+        other_shape = [
             paddle.static.InputSpec(
-                shape=[None, max_text_length],
-                dtype="int64")
-            ]
+                shape=[None, 3, 64, 256], dtype="float32")
         ]
         model = to_static(model, input_spec=other_shape)
     elif arch_config["algorithm"] in ["LayoutLM", "LayoutLMv2", "LayoutXLM"]:
diff --git a/tools/infer/predict_rec.py b/tools/infer/predict_rec.py
index 176e2c68e2c9b2e08f9b56378c45a57733faf8cd..5a7564e122c7173658c4cc2a57caef13276a304f 100755
--- a/tools/infer/predict_rec.py
+++ b/tools/infer/predict_rec.py
@@ -100,6 +100,12 @@ class TextRecognizer(object):
                 "use_space_char": args.use_space_char,
                 "rm_symbol": True
             }
+        elif self.rec_algorithm == "SEED":
+            postprocess_params = {
+                "name": "SEEDLabelDecode",
+                "character_dict_path": args.rec_char_dict_path,
+                "use_space_char": args.use_space_char
+            }
         self.postprocess_op = build_post_process(postprocess_params)
         self.predictor, self.input_tensor, self.output_tensors, self.config = \
             utility.create_predictor(args, 'rec', logger)
@@ -161,6 +167,7 @@ class TextRecognizer(object):
         if resized_w > self.rec_image_shape[2]:
             resized_w = self.rec_image_shape[2]
             imgW = self.rec_image_shape[2]
+
         resized_image = cv2.resize(img, (resized_w, imgH))
         resized_image = resized_image.astype('float32')
         resized_image = resized_image.transpose((2, 0, 1)) / 255
@@ -398,6 +405,11 @@ class TextRecognizer(object):
                     img_list[indices[ino]], self.rec_image_shape)
                 norm_img = norm_img[np.newaxis, :]
                 norm_img_batch.append(norm_img)
+            elif self.rec_algorithm == "SEED":
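+                # SEED reuses the SVTR-style resize: the crop is warped
+                # straight to the 64x256 target with no aspect-ratio padding,
+                # matching the [None, 3, 64, 256] InputSpec used at export.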
+                norm_img = self.resize_norm_img_svtr(img_list[indices[ino]],
+                                                     self.rec_image_shape)
+                norm_img = norm_img[np.newaxis, :]
+                norm_img_batch.append(norm_img)
             elif self.rec_algorithm == "RobustScanner":
                 norm_img, _, _, valid_ratio = self.resize_norm_img_sar(
                     img_list[indices[ino]],
diff --git a/tools/infer_rec.py b/tools/infer_rec.py
index 14b14544eb11e9fb0a0c2cdf92aff9d7cb4b5ba7..2d2e09a2840da1f73cac613bdd01090050729893 100755
--- a/tools/infer_rec.py
+++ b/tools/infer_rec.py
@@ -75,7 +75,6 @@ def main():
                 'out_channels_list'] = out_channels_list
         else:  # base rec model
             config['Architecture']["Head"]['out_channels'] = char_num
-
     model = build_model(config['Architecture'])
     load_model(config, model)
 
@@ -97,7 +96,8 @@ def main():
         elif config['Architecture']['algorithm'] == "SAR":
             op[op_name]['keep_keys'] = ['image', 'valid_ratio']
         elif config['Architecture']['algorithm'] == "RobustScanner":
-            op[op_name]['keep_keys'] = ['image', 'valid_ratio', 'word_positons']
+            op[op_name][
+                'keep_keys'] = ['image', 'valid_ratio', 'word_positons']
         else:
             op[op_name]['keep_keys'] = ['image']
         transforms.append(op)
@@ -136,9 +136,10 @@ def main():
         if config['Architecture']['algorithm'] == "RobustScanner":
             valid_ratio = np.expand_dims(batch[1], axis=0)
             word_positons = np.expand_dims(batch[2], axis=0)
-            img_metas = [paddle.to_tensor(valid_ratio),
-                         paddle.to_tensor(word_positons),
-                         ]
+            img_metas = [
+                paddle.to_tensor(valid_ratio),
+                paddle.to_tensor(word_positons),
+            ]
         images = np.expand_dims(batch[0], axis=0)
         images = paddle.to_tensor(images)
         if config['Architecture']['algorithm'] == "SRN":