From 06430c9359b6ee042d2b5d022120ac04de7e3f4f Mon Sep 17 00:00:00 2001
From: WenmuZhou <zjwenmu@gmail.com>
Date: Fri, 18 Sep 2020 11:29:39 +0800
Subject: [PATCH] =?UTF-8?q?ppocr=E6=94=AF=E6=8C=81=E5=A4=9A=E8=AF=AD?=
 =?UTF-8?q?=E8=A8=80=E5=88=87=E6=8D=A2?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 doc/doc_ch/whl.md    |  5 +++--
 doc/doc_en/whl_en.md | 21 +++++++++++----------
 paddleocr.py         | 40 ++++++++++++++++++++++++++++------------
 3 files changed, 42 insertions(+), 24 deletions(-)

diff --git a/doc/doc_ch/whl.md b/doc/doc_ch/whl.md
index 61ad26a9..657f9837 100644
--- a/doc/doc_ch/whl.md
+++ b/doc/doc_ch/whl.md
@@ -236,7 +236,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs_words/ch/word_1.jpg --use_angle_cls tru
 ```python
 from paddleocr import PaddleOCR, draw_ocr
 # 模型路径下必须含有model和params文件
-ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True)
+ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True)
 img_path = 'PaddleOCR/doc/imgs/11.jpg'
 result = ocr.ocr(img_path, cls=True)
 for line in result:
@@ -256,7 +256,7 @@ im_show.save('result.jpg')
 ### 通过命令行使用
 
 ```bash
-paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true
+paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true
 ```
 
 ## 参数说明
@@ -290,6 +290,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_
 | cls_batch_num          | 进行分类时，同时前向的图片数                                                                          |30                 |
 | enable_mkldnn           | 是否启用mkldnn                                                                                                                                                                                                       | FALSE                   |
 | use_zero_copy_run           | 是否通过zero_copy_run的方式进行前向                                                                                                                                                                               | FALSE                   |
+| lang                     | 模型语言类型,目前支持 中文(ch)和英文(en)                                                                                                                                                                                                  | ch                    |
 | det                     | 前向时使用启动检测                                                                                                                                                                                                   | TRUE                    |
 | rec                     | 前向时是否启动识别                                                                                                                                                                                                   | TRUE                    |
 | cls                     | 前向时是否启动分类                                                                                                                                                                                                 | FALSE                    |
diff --git a/doc/doc_en/whl_en.md b/doc/doc_en/whl_en.md
index 49a97653..b62e5454 100644
--- a/doc/doc_en/whl_en.md
+++ b/doc/doc_en/whl_en.md
@@ -17,7 +17,7 @@ pip install dist/paddleocr-x.x.x-py3-none-any.whl # x.x.x is the version of padd
 * detection classification and recognition
 ```python
 from paddleocr import PaddleOCR,draw_ocr
-ocr = PaddleOCR(use_angle_cls=True) # need to run only once to download and load model into memory
+ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
 for line in result:
@@ -51,7 +51,7 @@ Visualization of results
 * detection and recognition
 ```python
 from paddleocr import PaddleOCR,draw_ocr
-ocr = PaddleOCR() # need to run only once to download and load model into memory
+ocr = PaddleOCR(lang='en') # need to run only once to download and load model into memory
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path)
 for line in result:
@@ -85,7 +85,7 @@ Visualization of results
 * classification and recognition
 ```python
 from paddleocr import PaddleOCR
-ocr = PaddleOCR(use_angle_cls=True) # need to run only once to load model into memory
+ocr = PaddleOCR(use_angle_cls=True, lang='en') # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, cls=True)
 for line in result:
@@ -132,7 +132,7 @@ Visualization of results
 * only recognition
 ```python
 from paddleocr import PaddleOCR
-ocr = PaddleOCR() # need to run only once to load model into memory
+ocr = PaddleOCR(lang='en') # need to run only once to load model into memory
 img_path = 'PaddleOCR/doc/imgs_words_en/word_10.png'
 result = ocr.ocr(img_path, det=False, cls=False)
 for line in result:
@@ -168,7 +168,7 @@ paddleocr -h
 
 * detection classification and recognition
 ```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true -cls true
+paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --use_angle_cls true --cls true --lang en
 ```
 
 Output will be a list, each item contains bounding box, text and recognition confidence
@@ -181,7 +181,7 @@ Output will be a list, each item contains bounding box, text and recognition con
 
 * detection and recognition
 ```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg
+paddleocr --image_dir PaddleOCR/doc/imgs_en/img_12.jpg --lang en
 ```
 
 Output will be a list, each item contains bounding box, text and recognition confidence
@@ -194,7 +194,7 @@ Output will be a list, each item contains bounding box, text and recognition con
 
 * classification and recognition
 ```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true -cls true --det false
+paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --use_angle_cls true --cls true --det false --lang en
 ```
 
 Output will be a list, each item contains text and recognition confidence
@@ -217,7 +217,7 @@ Output will be a list, each item only contains bounding box
 
 * only recognition
 ```bash
-paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --cls false
+paddleocr --image_dir PaddleOCR/doc/imgs_words_en/word_10.png --det false --cls false --lang en
 ```
 
 Output will be a list, each item contains text and recognition confidence
@@ -244,7 +244,7 @@ First, refer to the first section of [inference_en.md](./inference_en.md) to con
 ```python
 from paddleocr import PaddleOCR,draw_ocr
 # The path of detection and recognition model must contain model and params files
-ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True)
+ocr = PaddleOCR(det_model_dir='{your_det_model_dir}', rec_model_dir='{your_rec_model_dir}', rec_char_dict_path='{your_rec_char_dict_path}', cls_model_dir='{your_cls_model_dir}', use_angle_cls=True)
 img_path = 'PaddleOCR/doc/imgs_en/img_12.jpg'
 result = ocr.ocr(img_path, cls=True)
 for line in result:
@@ -264,7 +264,7 @@ im_show.save('result.jpg')
 ### Use by command line
 
 ```bash
-paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true
+paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_dir} --rec_model_dir {your_rec_model_dir} --rec_char_dict_path {your_rec_char_dict_path} --cls_model_dir {your_cls_model_dir} --use_angle_cls true --cls true
 ```
 
 ## Parameter Description
@@ -298,6 +298,7 @@ paddleocr --image_dir PaddleOCR/doc/imgs/11.jpg --det_model_dir {your_det_model_
 | cls_batch_num           | When performing classification, the batchsize of forward images                                                                                                                                                                                         | 30                      |
 | enable_mkldnn           | Whether to enable mkldnn                                                                                                                                                                                                       | FALSE                   |
 | use_zero_copy_run           | Whether to forward by zero_copy_run                                                                                                                                                                               | FALSE                   |
+| lang                     | The support language, now only chinese(ch) and english(en) are supported                                                                                                                                                                                                  | ch                    |
 | det                     | Enable detction when `ppocr.ocr` func exec                                                                                                                                                                                                   | TRUE                    |
 | rec                     | Enable recognition when `ppocr.ocr` func exec                                                                                                                                                                                                   | TRUE                    |
 | cls                     | Enable classification when `ppocr.ocr` func exec                                                                                                                                                                                                   | FALSE                    |
diff --git a/paddleocr.py b/paddleocr.py
index cf497dc2..55ca87ac 100644
--- a/paddleocr.py
+++ b/paddleocr.py
@@ -33,11 +33,21 @@ from ppocr.utils.utility import check_and_read_gif, get_image_file_list
 
 __all__ = ['PaddleOCR']
 
-model_params = {
+model_urls = {
     'det':
     'https://paddleocr.bj.bcebos.com/20-09-22/mobile/det/ch_ppocr_mobile_v1.1_det_infer.tar',
-    'rec':
-    'https://paddleocr.bj.bcebos.com/20-09-22/mobile/rec/ch_ppocr_mobile_v1.1_rec_infer.tar',
+    'rec': {
+        'ch': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/20-09-22/mobile/rec/ch_ppocr_mobile_v1.1_rec_infer.tar',
+            'dict_path': './ppocr/utils/ppocr_keys_v1.txt'
+        },
+        'en': {
+            'url':
+            'https://paddleocr.bj.bcebos.com/20-09-22/mobile/en/en_ppocr_mobile_v1.1_rec_infer.tar',
+            'dict_path': './ppocr/utils/ic15_dict.txt'
+        }
+    },
     'cls':
     'https://paddleocr.bj.bcebos.com/20-09-22/cls/ch_ppocr_mobile_v1.1_cls_infer.tar'
 }
@@ -123,10 +133,7 @@ def parse_args():
     parser.add_argument("--rec_char_type", type=str, default='ch')
     parser.add_argument("--rec_batch_num", type=int, default=30)
     parser.add_argument("--max_text_length", type=int, default=25)
-    parser.add_argument(
-        "--rec_char_dict_path",
-        type=str,
-        default="./ppocr/utils/ppocr_keys_v1.txt")
+    parser.add_argument("--rec_char_dict_path", type=str, default=None)
     parser.add_argument("--use_space_char", type=bool, default=True)
 
     # params for text classifier
@@ -135,10 +142,12 @@ def parse_args():
     parser.add_argument("--cls_image_shape", type=str, default="3, 48, 192")
     parser.add_argument("--label_list", type=list, default=['0', '180'])
     parser.add_argument("--cls_batch_num", type=int, default=30)
+    parser.add_argument("--cls_thresh", type=float, default=0.9)
 
     parser.add_argument("--enable_mkldnn", type=bool, default=False)
     parser.add_argument("--use_zero_copy_run", type=bool, default=False)
 
+    parser.add_argument("--lang", type=str, default='ch')
     parser.add_argument("--det", type=str2bool, default=True)
     parser.add_argument("--rec", type=str2bool, default=True)
     parser.add_argument("--cls", type=str2bool, default=False)
@@ -155,21 +164,28 @@ class PaddleOCR(predict_system.TextSystem):
         postprocess_params = parse_args()
         postprocess_params.__dict__.update(**kwargs)
         self.use_angle_cls = postprocess_params.use_angle_cls
+        lang = postprocess_params.lang
+        assert lang in model_urls['rec'], 'param lang must in {}'.format(
+            model_urls['rec'].keys())
+        if postprocess_params.rec_char_dict_path is None:
+            postprocess_params.rec_char_dict_path = model_urls['rec'][lang][
+                'dict_path']
 
         # init model dir
         if postprocess_params.det_model_dir is None:
             postprocess_params.det_model_dir = os.path.join(BASE_DIR, 'det')
         if postprocess_params.rec_model_dir is None:
-            postprocess_params.rec_model_dir = os.path.join(BASE_DIR, 'rec')
+            postprocess_params.rec_model_dir = os.path.join(
+                BASE_DIR, 'rec/{}'.format(lang))
         if postprocess_params.cls_model_dir is None:
             postprocess_params.cls_model_dir = os.path.join(BASE_DIR, 'cls')
         print(postprocess_params)
         # download model
-        maybe_download(postprocess_params.det_model_dir, model_params['det'])
-        maybe_download(postprocess_params.rec_model_dir, model_params['rec'])
+        maybe_download(postprocess_params.det_model_dir, model_urls['det'])
+        maybe_download(postprocess_params.rec_model_dir,
+                       model_urls['rec'][lang]['url'])
         if self.use_angle_cls:
-            maybe_download(postprocess_params.cls_model_dir,
-                           model_params['cls'])
+            maybe_download(postprocess_params.cls_model_dir, model_urls['cls'])
 
         if postprocess_params.det_algorithm not in SUPPORT_DET_MODEL:
             logger.error('det_algorithm must in {}'.format(SUPPORT_DET_MODEL))
-- 
GitLab