From 446243af1648092e1ed4abc21abc9659d037206a Mon Sep 17 00:00:00 2001
From: Hongyu Liu <43953930+phlrain@users.noreply.github.com>
Date: Fri, 31 May 2019 16:03:40 +0800
Subject: [PATCH] Add orc readme (#2341)

* add pbt lm; test=develop

* add dynamic ocr recognition; test=develop

* add unility.py; test=develop

* add readme; test=develop

* fix bug; test=develop

* fix bug; test=develop

* fix bug; test=develop
---
 dygraph/ocr_recognition/README.md      |  33 +++++
 dygraph/ocr_recognition/data_reader.py | 164 +++++--------------------
 dygraph/ocr_recognition/train.py       |  23 +---
 3 files changed, 66 insertions(+), 154 deletions(-)
 create mode 100644 dygraph/ocr_recognition/README.md

diff --git a/dygraph/ocr_recognition/README.md b/dygraph/ocr_recognition/README.md
new file mode 100644
index 00000000..69c89fdc
--- /dev/null
+++ b/dygraph/ocr_recognition/README.md
@@ -0,0 +1,33 @@
+OCR recognition in DyGraph mode
+========
+
+Introduction
+--------
+The OCR task recognizes the characters in a single-line text image. This example uses an attention-based seq2seq model in dynamic graph (DyGraph) mode; for the static graph implementation, see [ocr recognition](https://github.com/PaddlePaddle/models/tree/develop/PaddleCV/ocr_recognition).
+Running the example in this directory requires the latest PaddlePaddle develop version.
+
+
+## Code structure
+```
+└── train.py        # Training script.
+└── data_reader.py  # Data reading.
+└── utility.py      # Helper functions.
+```
+
+## Data
+
+The tutorial trains on the `ocr attention` dataset, which is downloaded automatically through the `paddle.dataset` module.
+
+## Training and testing ocr recognition
+
+Train ocr recognition on a single GPU:
+
+```
+env CUDA_VISIBLE_DEVICES=0 python train.py
+```
+
+Here `CUDA_VISIBLE_DEVICES=0` runs the job on GPU card 0; change this value to match your environment.
+
+## Results
+
+The best accuracy on the test set is 82.0%.
diff --git a/dygraph/ocr_recognition/data_reader.py b/dygraph/ocr_recognition/data_reader.py
index 00e98d12..41618ae1 100644
--- a/dygraph/ocr_recognition/data_reader.py
+++ b/dygraph/ocr_recognition/data_reader.py
@@ -17,7 +17,6 @@ DATA_SHAPE = [1, 48, 512]
 
 DATA_MD5 = "7256b1d5420d8c3e74815196e58cdad5"
 DATA_URL = "http://paddle-ocr-data.bj.bcebos.com/data.tar.gz"
-CACHE_DIR_NAME = "ctc_data"
 SAVED_FILE_NAME = "data.tar.gz"
 DATA_DIR_NAME = "data"
 TRAIN_DATA_DIR_NAME = "train_images"
@@ -27,15 +26,14 @@ TEST_LIST_FILE_NAME = "test.list"
 
 
 class DataGenerator(object):
-    def __init__(self, model="crnn_ctc"):
-        self.model = model
+    def __init__(self):
+        pass
 
     def train_reader(self,
                      img_root_dir,
                      img_label_list,
                      batchsize,
                      cycle,
-                     max_length,
                      shuffle=True):
         '''
         Reader interface for training.
@@ -89,11 +87,6 @@ class DataGenerator(object):
                         label = [int(c) for c in items[-1].split(',')]
                         max_len = max(max_len, len(label))
 
-                    #print( "max len", max_len, i)
-                    max_length = max_len
-
-                    #mask = np.zeros( (batchsize, max_length)).astype('float32')
-
                     for j in range(batchsize):
                         line = img_label_lines[i * batchsize + j]
                         items = line.split(' ')
@@ -102,11 +95,11 @@ class DataGenerator(object):
                         mask = np.zeros((max_len)).astype('float32')
                         mask[:len(label) + 1] = 1.0
                         #mask[ j, :len(label) + 1] = 1.0
-                        if max_length > len(label) + 1:
-                            extend_label = [EOS] * (max_length - len(label) - 1)
+                        if max_len > len(label) + 1:
+                            extend_label = [EOS] * (max_len - len(label) - 1)
                             label.extend(extend_label)
                         else:
-                            label = label[0:max_length - 1]
+                            label = label[0:max_len - 1]
                         img = Image.open(os.path.join(img_root_dir, items[
                             2])).convert('L')
                         if j == 0:
@@ -121,85 +114,6 @@ class DataGenerator(object):
 
         return reader
 
-    def test_reader(self, img_root_dir, img_label_list):
-        '''
-        Reader interface for inference.
-
-        :param img_root_dir: The root path of the images for training.
-        :type img_root_dir: str
-
-        :param img_label_list: The path of the file for testing.
- :type img_label_list: str - ''' - - def reader(): - for line in open(img_label_list): - # h, w, img_name, labels - items = line.split(' ') - - label = [int(c) for c in items[-1].split(',')] - img = Image.open(os.path.join(img_root_dir, items[2])).convert( - 'L') - img = np.array(img) - 127.5 - img = img[np.newaxis, ...] - if self.model == "crnn_ctc": - yield img, label - else: - yield img, [SOS] + label, label + [EOS] - - return reader - - def infer_reader(self, img_root_dir=None, img_label_list=None, cycle=False): - '''A reader interface for inference. - - :param img_root_dir: The root path of the images for training. - :type img_root_dir: str - - :param img_label_list: The path of the file for - inference. It should be the path of file if img_root_dir - was None. If img_label_list was set to None, it will read image path - from stdin. - :type img_root_dir: str - - :param cycle: If number of iterations is greater than dataset_size / - batch_size it reiterates dataset over as many times as necessary. - :type cycle: bool - ''' - - def reader(): - def yield_img_and_label(lines): - for line in lines: - if img_root_dir is not None: - # h, w, img_name, labels - img_name = line.split(' ')[2] - img_path = os.path.join(img_root_dir, img_name) - else: - img_path = line.strip("\t\n\r") - img = Image.open(img_path).convert('L') - img = np.array(img) - 127.5 - img = img[np.newaxis, ...] - label = [int(c) for c in line.split(' ')[3].split(',')] - yield img, label - - if img_label_list is not None: - lines = [] - with open(img_label_list) as f: - lines = f.readlines() - for img, label in yield_img_and_label(lines): - yield img, label - while cycle: - for img, label in yield_img_and_label(lines): - yield img, label - else: - while True: - img_path = input("Please input the path of image: ") - img = Image.open(img_path).convert('L') - img = np.array(img) - 127.5 - img = img[np.newaxis, ...] - yield img, [[0]] - - return reader - def num_classes(): '''Get classes number of this dataset. 
@@ -213,51 +127,31 @@ def data_shape(): return DATA_SHAPE -def train(batch_size, - max_length, - train_images_dir=None, - train_list_file=None, - cycle=False, - shuffle=False, - model="crnn_ctc"): - generator = DataGenerator(model) - if train_images_dir is None: - data_dir = download_data() - train_images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME) - if train_list_file is None: - train_list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME) +def data_reader(batch_size, + images_dir=None, + list_file=None, + cycle=False, + shuffle=False, + data_type="train"): + generator = DataGenerator() + + if data_type == "train": + if images_dir is None: + data_dir = download_data() + images_dir = path.join(data_dir, TRAIN_DATA_DIR_NAME) + if list_file is None: + list_file = path.join(data_dir, TRAIN_LIST_FILE_NAME) + elif data_type == "test": + if images_dir is None: + data_dir = download_data() + images_dir = path.join(data_dir, TEST_DATA_DIR_NAME) + if list_file is None: + list_file = path.join(data_dir, TEST_LIST_FILE_NAME) + else: + print("data type only support train | test") + raise Exception("data type only support train | test") return generator.train_reader( - train_images_dir, - train_list_file, - batch_size, - cycle, - max_length, - shuffle=shuffle) - - -def test(batch_size=1, - test_images_dir=None, - test_list_file=None, - model="crnn_ctc"): - generator = DataGenerator(model) - if test_images_dir is None: - data_dir = download_data() - test_images_dir = path.join(data_dir, TEST_DATA_DIR_NAME) - if test_list_file is None: - test_list_file = path.join(data_dir, TEST_LIST_FILE_NAME) - return paddle.batch( - generator.test_reader(test_images_dir, test_list_file), batch_size) - - -def inference(batch_size=1, - infer_images_dir=None, - infer_list_file=None, - cycle=False, - model="crnn_ctc"): - generator = DataGenerator(model) - return paddle.batch( - generator.infer_reader(infer_images_dir, infer_list_file, cycle), - batch_size) + images_dir, list_file, batch_size, cycle, shuffle=shuffle) def download_data(): diff --git a/dygraph/ocr_recognition/train.py b/dygraph/ocr_recognition/train.py index 954612af..f333d3e2 100644 --- a/dygraph/ocr_recognition/train.py +++ b/dygraph/ocr_recognition/train.py @@ -42,7 +42,6 @@ add_arg('train_images', str, None, "The directory of images to be u add_arg('train_list', str, None, "The list file of images to be used for training.") add_arg('test_images', str, None, "The directory of images to be used for test.") add_arg('test_list', str, None, "The list file of images to be used for training.") -add_arg('model', str, "attention", "Which type of network to be used. 
'crnn_ctc' or 'attention'") add_arg('init_model', str, None, "The init model file of directory.") add_arg('use_gpu', bool, True, "Whether use GPU to train.") add_arg('min_average_window',int, 10000, "Min average window.") @@ -78,10 +77,6 @@ class Config(object): # special label for start and end SOS = 0 EOS = 1 - # settings for ctc data, not use in unittest - DATA_DIR_NAME = "./dataset/ctc_data/data" - TRAIN_DATA_DIR_NAME = "train_images" - TRAIN_LIST_FILE_NAME = "train.list" # data shape for input image DATA_SHAPE = [1, 48, 512] @@ -478,24 +473,18 @@ def train(args): grad_clip = fluid.dygraph_grad_clip.GradClipByGlobalNorm(5.0 ) - train_reader = data_reader.train( + train_reader = data_reader.data_reader( Config.batch_size, - max_length=Config.max_length, - train_images_dir=args.train_images, - train_list_file=args.train_list, cycle=args.total_step > 0, shuffle=True, - model=args.model) + data_type='train') infer_image= './data/data/test_images/' infer_files = './data/data/test.list' - test_reader = data_reader.train( + test_reader = data_reader.data_reader( Config.batch_size, - 1000, - train_images_dir= infer_image, - train_list_file= infer_files, cycle=False, - model=args.model) + data_type="test") def eval(): ocr_attention.eval() total_loss = 0.0 @@ -578,10 +567,6 @@ def train(args): total_loss = 0.0 if total_step > 0 and total_step % 2000 == 0: - - model_value = ocr_attention.state_dict() - np.savez( "model/" + str(total_step), **model_value ) - ocr_attention.eval() eval() ocr_attention.train() -- GitLab
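
Below is a minimal, hypothetical usage sketch, not part of the patch, showing how the unified `data_reader.data_reader` entry point introduced above might be exercised outside of `train.py`. The batch size is an assumed placeholder (`train.py` passes `Config.batch_size`), and the exact fields of each yielded batch follow the generator in `data_reader.py`.

```
# Hypothetical usage sketch of the refactored reader API; values marked as
# assumptions are illustrative only.
import data_reader

# data_type selects the train or test split. With images_dir/list_file left
# as None, the "ocr attention" dataset is downloaded automatically.
reader = data_reader.data_reader(
    batch_size=32,        # assumed value; train.py uses Config.batch_size
    cycle=False,
    shuffle=True,
    data_type="train")

# The return value is a reader factory in the usual paddle style:
# calling it produces a generator that yields one batch at a time.
for step, batch in enumerate(reader()):
    print("step %d: batch with %d fields" % (step, len(batch)))
    if step >= 2:         # peek at a few batches only
        break
```

Any `data_type` other than "train" or "test" raises an exception, as shown in the diff above.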