rename en_sensitive to EN_symbol

edeb12b1 · tink2123 · d9ae86f4 · edeb12b1 · edeb12b1 · edeb12b1
5 changed files
--- a/configs/rec/multi_language/rec_en_number_lite_train.yml
+++ b/configs/rec/multi_language/rec_en_number_lite_train.yml
 Global:
-  use_gpu: True
+  use_gpu: False
  epoch_num: 500
  log_smooth_window: 20
  print_batch_step: 10
@@ -16,7 +16,7 @@ Global:
  infer_img:
  # for data or label process
  character_dict_path: ppocr/utils/dict/en_dict.txt
-  character_type: En
+  character_type: EN
  max_text_length: 25
  infer_mode: False
  use_space_char: False
@@ -63,8 +63,8 @@ Metric:
 Train:
  dataset:
    name: SimpleDataSet
-    data_dir: ./train_data/
+    data_dir: ./train_data/ic15_data/
-    label_file_list: ["./train_data/train_list.txt"]
+    label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR
@@ -77,15 +77,15 @@ Train:
          keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order
  loader:
    shuffle: True
-    batch_size_per_card: 256
+    batch_size_per_card: 1
    drop_last: True
-    num_workers: 8
+    num_workers: 1
 Eval:
  dataset:
    name: SimpleDataSet
-    data_dir: ./train_data/
+    data_dir: ./train_data/ic15_data/
-    label_file_list: ["./train_data/eval_list.txt"]
+    label_file_list: ["./train_data/ic15_data/rec_gt_test.txt"]
    transforms:
      - DecodeImage: # load image
          img_mode: BGR

--- a/doc/doc_ch/recognition.md
+++ b/doc/doc_ch/recognition.md
@@ -348,7 +348,7 @@ PaddleOCR目前已支持26种（除中文外）语种识别，`configs/rec/multi
 | 配置文件 |  算法名称 |   backbone |   trans   |   seq      |     pred     |  language | character_type |
 | :--------: |  :-------:   | :-------:  |   :-------:   |   :-----:   |  :-----:   | :-----:  | :-----:  |
 | rec_chinese_cht_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | 中文繁体  | chinese_cht|
-| rec_en_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | 英语   | En |
+| rec_en_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | 英语（区分大小写）   | EN |
 | rec_french_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | 法语 |  french |
 | rec_ger_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | 德语   | german |
 | rec_japan_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | 日语  | japan |

--- a/doc/doc_en/recognition_en.md
+++ b/doc/doc_en/recognition_en.md
@@ -350,7 +350,7 @@ Currently, the multi-language algorithms supported by PaddleOCR are:
 | Configuration file |  Algorithm name |   backbone |   trans   |   seq      |     pred     |  language | character_type |
 | :--------: |  :-------:   | :-------:  |   :-------:   |   :-----:   |  :-----:   | :-----:  | :-----:  |
 | rec_chinese_cht_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | chinese traditional  | chinese_cht|
-| rec_en_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | English   | En |
+| rec_en_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | English (case-sensitive)   | EN |
 | rec_french_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | French |  french |
 | rec_ger_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | German   | german |
 | rec_japan_lite_train.yml |  CRNN |   Mobilenet_v3 small 0.5 |  None   |  BiLSTM |  ctc  | Japanese | japan |

--- a/ppocr/data/imaug/label_ops.py
+++ b/ppocr/data/imaug/label_ops.py
@@ -18,6 +18,7 @@ from __future__ import print_function
 from __future__ import unicode_literals
 import numpy as np
+import string
 class ClsLabelEncode(object):
@@ -92,8 +93,8 @@ class BaseRecLabelEncode(object):
                 character_type='ch',
                 use_space_char=False):
        support_character_type = [
-            'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean',
+            'ch', 'en', 'EN_symbol', 'french', 'german', 'japan', 'korean',
-            'En', 'it', 'xi', 'pu', 'ru', 'ar', 'ta', 'ug', 'fa', 'ur', 'rs',
+            'EN', 'it', 'xi', 'pu', 'ru', 'ar', 'ta', 'ug', 'fa', 'ur', 'rs',
            'oc', 'rsc', 'bg', 'uk', 'be', 'te', 'ka', 'chinese_cht', 'hi',
            'mr', 'ne'
        ]
@@ -104,9 +105,8 @@ class BaseRecLabelEncode(object):
        if character_type == "en":
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
-        elif character_type == "en_sensitive":
+        elif character_type == "EN_symbol":
            # same with ASTER setting (use 94 char).
-            import string
            self.character_str = string.printable[:-6]
            dict_character = list(self.character_str)
        elif character_type in support_character_type:

--- a/ppocr/postprocess/rec_postprocess.py
+++ b/ppocr/postprocess/rec_postprocess.py
@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import numpy as np
+import string
 import paddle
 from paddle.nn import functional as F
@@ -24,10 +25,10 @@ class BaseRecLabelDecode(object):
                 character_type='ch',
                 use_space_char=False):
        support_character_type = [
-            'ch', 'en', 'en_sensitive', 'french', 'german', 'japan', 'korean',
+            'ch', 'en', 'EN_symbol', 'french', 'german', 'japan', 'korean',
            'it', 'xi', 'pu', 'ru', 'ar', 'ta', 'ug', 'fa', 'ur', 'rs', 'oc',
            'rsc', 'bg', 'uk', 'be', 'te', 'ka', 'chinese_cht', 'hi', 'mr',
-            'ne', 'En'
+            'ne', 'EN'
        ]
        assert character_type in support_character_type, "Only {} are supported now but get {}".format(
            support_character_type, character_type)
@@ -35,9 +36,8 @@ class BaseRecLabelDecode(object):
        if character_type == "en":
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
            dict_character = list(self.character_str)
-        elif character_type == "en_sensitive":
+        elif character_type == "EN_symbol":
            # same with ASTER setting (use 94 char).
-            import string
            self.character_str = string.printable[:-6]
            dict_character = list(self.character_str)
        elif character_type in support_character_type: