From 0fa990bbc9de9d2afa94b87cd38987389712d285 Mon Sep 17 00:00:00 2001
From: peterzhang2029
Date: Wed, 25 Oct 2017 19:09:31 +0800
Subject: [PATCH] add config and refine doc

---
 scene_text_recognition/README.md        | 116 ++++++++++---------
 scene_text_recognition/config.py        |  75 ++++++++++++
 scene_text_recognition/data_provider.py | 100 ----------------
 scene_text_recognition/index.html       | 116 ++++++++++---------
 scene_text_recognition/infer.py         |  51 +++++---
 scene_text_recognition/model.py         |  74 +++++++-----
 scene_text_recognition/reader.py        |  62 ++++++++++
 scene_text_recognition/requirements.txt |   2 +
 scene_text_recognition/train.py         | 148 +++++++++++-------------
 scene_text_recognition/utils.py         |  59 ++++++++++
 10 files changed, 461 insertions(+), 342 deletions(-)
 create mode 100644 scene_text_recognition/config.py
 delete mode 100644 scene_text_recognition/data_provider.py
 create mode 100644 scene_text_recognition/reader.py
 create mode 100644 scene_text_recognition/requirements.txt
 create mode 100644 scene_text_recognition/utils.py

diff --git a/scene_text_recognition/README.md b/scene_text_recognition/README.md
index de1418dd..5e83a68e 100644
--- a/scene_text_recognition/README.md
+++ b/scene_text_recognition/README.md
@@ -4,7 +4,7 @@
 
 In real life, text appears in many scenes, including road signs, menus, and banners on buildings. The text in photos of such scenes provides extra information for understanding the image; \[[1](#references)\] uses a deep learning model to automatically recognize the text on road signs, helping street-view applications obtain more accurate address information.
 
-This example demonstrates how to complete a **Scene Text Recognition (STR)** task with PaddlePaddle. Taking the figure below as an example, given a scene image, STR needs to recognize the corresponding text "keep":
+This example demonstrates how to complete a **Scene Text Recognition (STR)** task with PaddlePaddle. Taking the figure below as an example, given a scene image, STR needs to recognize the corresponding text "keep".
 <p align="center">
 [Figure: an example input image containing the text "keep"]
 </p>
@@ -14,70 +14,66 @@
 
 ## Training and Inference with PaddlePaddle
 
+### Install the dependencies
+```bash
+pip install -r requirements.txt
+```
+
+### Set the training configuration
+
+The training and model configuration parameters are set in the script `config.py`, which documents every configurable parameter in detail. An excerpt:
+```python
+class TrainerConfig(object):
+
+    # Whether to use GPU in training or not.
+    use_gpu = True
+    # The number of computing threads.
+    trainer_count = 1
+
+    # The training batch size.
+    batch_size = 10
+
+    ...
+
+
+class ModelConfig(object):
+
+    # Number of the filters for convolution group.
+    filter_num = 8
+
+    ...
+```
+Edit `config.py` to adjust the parameters; for example, set `use_gpu` to choose whether to train on a GPU.
+
 ### Model training
 
 The training script [./train.py](./train.py) accepts the following command-line options:
 
 ```
-usage: train.py [-h] --image_shape IMAGE_SHAPE --train_file_list
-                TRAIN_FILE_LIST --test_file_list TEST_FILE_LIST
-                [--batch_size BATCH_SIZE]
-                [--model_output_prefix MODEL_OUTPUT_PREFIX]
-                [--trainer_count TRAINER_COUNT]
-                [--save_period_by_batch SAVE_PERIOD_BY_BATCH]
-                [--num_passes NUM_PASSES]
-
-PaddlePaddle CTC example
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --image_shape IMAGE_SHAPE
-                        image's shape, format is like '173,46'
-  --train_file_list TRAIN_FILE_LIST
-                        path of the file which contains path list of train
-                        image files
-  --test_file_list TEST_FILE_LIST
-                        path of the file which contains path list of test
-                        image files
-  --batch_size BATCH_SIZE
-                        size of a mini-batch
-  --model_output_prefix MODEL_OUTPUT_PREFIX
-                        prefix of path for model to store (default:
-                        ./model.ctc)
-  --trainer_count TRAINER_COUNT
-                        number of training threads
-  --save_period_by_batch SAVE_PERIOD_BY_BATCH
-                        save model to disk every N batches
-  --num_passes NUM_PASSES
-                        number of passes to train (default: 1)
-```
+Options:
+  --train_file_list_path TEXT  The path of the file which contains path list
+                               of train image files.  [required]
+  --test_file_list_path TEXT   The path of the file which contains path list
+                               of test image files.  [required]
+  --model_save_dir TEXT        The path to save the trained models (default:
+                               'models').
+  --help                       Show this message and exit.
 
-The important options include:
+```
 
-- `image_shape` The size of the images.
-- `train_file_list` The list file of the training data; each line holds an image path plus the corresponding text, in the format:
+- `train_file_list_path` The list file of the training data. Each line holds an image path and the corresponding text (a parsing sketch follows this list), in the format:
 ```
 word_1.png, "PROPER"
 word_2.png, "FOOD"
 ```
-- `test_file_list` The list file of the testing data, in the same format
-
-### Inference
-Inference is done by infer.py with the best-path decoding algorithm: at each time step, the character with the highest probability is chosen. Before using it, set the model path, the fixed image size, the batch_size, and the list file of the image files in infer.py. For example:
-```python
-model_path = "model.ctc-pass-9-batch-150-test.tar.gz"
-image_shape = "173,46"
-batch_size = 50
-infer_file_list = 'data/test_data/Challenge2_Test_Task3_GT.txt'
-```
-Then run ```python infer.py```
-
+- `test_file_list_path` The list file of the testing data, in the same format.
+- `model_save_dir` The directory in which model parameters are saved; defaults to the `models` directory under the current directory.
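+
+The list file can be parsed in a few lines. A minimal sketch (it mirrors `get_file_list` in the new `utils.py`):
+
+```python
+import os
+
+
+def parse_file_list(list_path):
+    # Each line looks like: word_1.png, "PROPER"
+    samples = []
+    dirname = os.path.dirname(list_path)
+    with open(list_path) as f:
+        for line in f:
+            image_name, label = line.strip().split(',', 1)
+            # Drop the leading space and the surrounding double quotes.
+            label = label.strip().strip('"')
+            samples.append((os.path.join(dirname, image_name.strip()), label))
+    return samples
+
+
+# parse_file_list('data/train_data/gt.txt')
+# -> [('data/train_data/word_1.png', 'PROPER'), ...]
+```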
 
 ### Concrete steps
 
 1. Download the data from the official website \[[2](#references)\] (Task 2.3: Word Recognition (2013 edition)). There are three files: Challenge2_Training_Task3_Images_GT.zip, Challenge2_Test_Task3_Images.zip, and Challenge2_Test_Task3_GT.txt,
 which hold, respectively, the training images with their words, the test images, and the words of the test data. Then run the following commands to unpack the data and move it into the target folders:
 
-```
+```bash
 mkdir -p data/train_data
 mkdir -p data/test_data
 unzip Challenge2_Training_Task3_Images_GT.zip -d data/train_data
@@ -85,16 +81,26 @@ unzip Challenge2_Test_Task3_Images.zip -d data/test_data
 mv Challenge2_Test_Task3_GT.txt data/test_data
 ```
 
-2. Note the path of `gt.txt` in the training data folder (data/train_data) and the path of `Challenge2_Test_Task3_GT.txt` in the test data folder (data/test_data)
+2. Note the path of `gt.txt` in the training data folder (data/train_data) and the path of `Challenge2_Test_Task3_GT.txt` in the test data folder (data/test_data).
 
-3. Run the command
+3. Run the following command to start training:
+```bash
+python train.py \
+--train_file_list_path 'data/train_data/gt.txt' \
+--test_file_list_path 'data/test_data/Challenge2_Test_Task3_GT.txt'
 ```
-python train.py --train_file_list data/train_data/gt.txt --test_file_list data/test_data/Challenge2_Test_Task3_GT.txt --image_shape '173,46'
-```
 
-4. During training, model parameters are automatically backed up to the specified directory, ./model.ctc by default
+4. During training, model parameters are automatically backed up to the specified directory, the `./models` directory by default.
 
-5. Set the relevant parameters in infer.py (the path of the model) and run ```python infer.py``` for inference
+### Inference
+Inference is done by `infer.py` using best-path decoding: at every time step, the character with the highest probability is chosen. Specify the model path, the fixed image size, the batch_size (10 by default), and the list file of the image files, then run:
+```bash
+python infer.py \
+--model_path 'models/params_pass_00000.tar.gz' \
+--image_shape '173,46' \
+--infer_file_list_path 'data/test_data/Challenge2_Test_Task3_GT.txt'
+```
+The decoded text of every image is printed together with its ground-truth label.
 
 ### Other datasets
 
@@ -104,7 +110,7 @@ python train.py --train_file_list data/train_data/gt.txt --test_file_list data/t
 ### Notes
 
 - Since `warp CTC`, on which the model depends, only has a CUDA implementation, this model can only run on GPU.
-- The model has many parameters and uses a lot of GPU memory; adjust batch_size to control the memory footprint when running.
+- The model has many parameters and uses a lot of GPU memory; adjust `batch_size` to control the memory footprint when running.
 - The dataset used in this example is small; other, larger datasets \[[3](#references)\] can be used to train the model you need.
 
 ## References
diff --git a/scene_text_recognition/config.py b/scene_text_recognition/config.py
new file mode 100644
index 00000000..9cc56354
--- /dev/null
+++ b/scene_text_recognition/config.py
@@ -0,0 +1,75 @@
+__all__ = ["TrainerConfig", "ModelConfig"]
+
+
+class TrainerConfig(object):
+
+    # Whether to use GPU in training or not.
+    use_gpu = True
+
+    # The number of computing threads.
+    trainer_count = 1
+
+    # The training batch size.
+    batch_size = 10
+
+    # The number of training passes (epochs).
+    num_passes = 10
+
+    # The momentum for parameter updates.
+    momentum = 0
+
+    # The fixed shape of the input images.
+    image_shape = (173, 46)
+
+    # The buffer size of the data reader:
+    # up to buf_size samples are shuffled in training.
+    buf_size = 1000
+
+    # Controls the logging period: the training log
+    # is printed every log_period batches.
+    log_period = 50
+
+
+class ModelConfig(object):
+
+    # Number of the filters for convolution group.
+    filter_num = 8
+
+    # Use batch normalization or not in image convolution group.
+    with_bn = True
+
+    # The number of channels for block expand layer.
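+    # Note: it must equal the channel number produced by the last
+    # convolution group, i.e. filter_num_list[-1] below.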
+    num_channels = 128
+
+    # The parameter stride_x in block expand layer.
+    stride_x = 1
+
+    # The parameter stride_y in block expand layer.
+    stride_y = 1
+
+    # The parameter block_x in block expand layer.
+    block_x = 1
+
+    # The parameter block_y in block expand layer.
+    block_y = 11
+
+    # The hidden size of the GRU.
+    hidden_size = num_channels
+
+    # Use norm_by_times or not in warp ctc layer.
+    norm_by_times = True
+
+    # The list of filter numbers for the image convolution group layers.
+    filter_num_list = [16, 32, 64, 128]
+
+    # The parameter conv_padding in image convolution group layer.
+    conv_padding = 1
+
+    # The parameter conv_filter_size in image convolution group layer.
+    conv_filter_size = 3
+
+    # The parameter pool_size in image convolution group layer.
+    pool_size = 2
+
+    # The parameter pool_stride in image convolution group layer.
+    pool_stride = 2
diff --git a/scene_text_recognition/data_provider.py b/scene_text_recognition/data_provider.py
deleted file mode 100644
index f33a102e..00000000
--- a/scene_text_recognition/data_provider.py
+++ /dev/null
@@ -1,100 +0,0 @@
-from __future__ import absolute_import
-from __future__ import division
-from __future__ import print_function
-
-import os
-import cv2
-
-from paddle.v2.image import load_image
-
-
-class AsciiDic(object):
-    UNK = 0
-
-    def __init__(self):
-        self.dic = {
-            '': self.UNK,
-        }
-        self.chars = [chr(i) for i in range(40, 171)]
-        for id, c in enumerate(self.chars):
-            self.dic[c] = id + 1
-
-    def lookup(self, w):
-        return self.dic.get(w, self.UNK)
-
-    def id2word(self):
-        self.id2word = {}
-        for key, value in self.dic.items():
-            self.id2word[value] = key
-
-        return self.id2word
-
-    def word2ids(self, sent):
-        '''
-        transform a word to a list of ids.
-        '''
-        return [self.lookup(c) for c in list(sent)]
-
-    def size(self):
-        return len(self.dic)
-
-
-class ImageDataset(object):
-    def __init__(self,
-                 train_image_paths_generator,
-                 test_image_paths_generator,
-                 infer_image_paths_generator,
-                 fixed_shape=None,
-                 is_infer=False):
-        '''
-        :param train_image_paths_generator:
-            return list of train images' paths.
-        :type train_image_paths_generator: function
-        :param fixed_shape: fixed shape of images.
-        :type fixed_shape: tuple
-        '''
-        if is_infer == False:
-            self.train_filelist = [p for p in train_image_paths_generator]
-            self.test_filelist = [p for p in test_image_paths_generator]
-        else:
-            self.infer_filelist = [p for p in infer_image_paths_generator]
-
-        self.fixed_shape = fixed_shape
-        self.ascii_dic = AsciiDic()
-
-    def train(self):
-        for i, (image, label) in enumerate(self.train_filelist):
-            yield self.load_image(image), self.ascii_dic.word2ids(label)
-
-    def test(self):
-        for i, (image, label) in enumerate(self.test_filelist):
-            yield self.load_image(image), self.ascii_dic.word2ids(label)
-
-    def infer(self):
-        for i, (image, label) in enumerate(self.infer_filelist):
-            yield self.load_image(image), label
-
-    def load_image(self, path):
-        '''
-        load image and transform to 1-dimention vector
-        '''
-        image = load_image(path)
-        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
-        # resize all images to a fixed shape
-
-        if self.fixed_shape:
-            image = cv2.resize(
-                image, self.fixed_shape, interpolation=cv2.INTER_CUBIC)
-
-        image = image.flatten() / 255.
-        return image
-
-
-def get_file_list(image_file_list):
-    pwd = os.path.dirname(image_file_list)
-    with open(image_file_list) as f:
-        for line in f:
-            fs = line.strip().split(',', 1)
-            file = fs[0].strip()
-            path = os.path.join(pwd, file)
-            yield path, fs[1][2:-1]
diff --git a/scene_text_recognition/index.html b/scene_text_recognition/index.html
index 46996528..64a1160a 100644
--- a/scene_text_recognition/index.html
+++ b/scene_text_recognition/index.html
@@ -46,7 +46,7 @@
 
 In real life, text appears in many scenes, including road signs, menus, and banners on buildings. The text in photos of such scenes provides extra information for understanding the image; \[[1](#references)\] uses a deep learning model to automatically recognize the text on road signs, helping street-view applications obtain more accurate address information.
 
-This example demonstrates how to complete a **Scene Text Recognition (STR)** task with PaddlePaddle. Taking the figure below as an example, given a scene image, STR needs to recognize the corresponding text "keep":
+This example demonstrates how to complete a **Scene Text Recognition (STR)** task with PaddlePaddle. Taking the figure below as an example, given a scene image, STR needs to recognize the corresponding text "keep".
 <p align="center">
 [Figure: an example input image containing the text "keep"]
 </p>
@@ -56,70 +56,66 @@
 
 ## Training and Inference with PaddlePaddle
 
+### Install the dependencies
+```bash
+pip install -r requirements.txt
+```
+
+### Set the training configuration
+
+The training and model configuration parameters are set in the script `config.py`, which documents every configurable parameter in detail. An excerpt:
+```python
+class TrainerConfig(object):
+
+    # Whether to use GPU in training or not.
+    use_gpu = True
+    # The number of computing threads.
+    trainer_count = 1
+
+    # The training batch size.
+    batch_size = 10
+
+    ...
+
+
+class ModelConfig(object):
+
+    # Number of the filters for convolution group.
+    filter_num = 8
+
+    ...
+```
+Edit `config.py` to adjust the parameters; for example, set `use_gpu` to choose whether to train on a GPU.
+
 ### Model training
 
 The training script [./train.py](./train.py) accepts the following command-line options:
 
 ```
-usage: train.py [-h] --image_shape IMAGE_SHAPE --train_file_list
-                TRAIN_FILE_LIST --test_file_list TEST_FILE_LIST
-                [--batch_size BATCH_SIZE]
-                [--model_output_prefix MODEL_OUTPUT_PREFIX]
-                [--trainer_count TRAINER_COUNT]
-                [--save_period_by_batch SAVE_PERIOD_BY_BATCH]
-                [--num_passes NUM_PASSES]
-
-PaddlePaddle CTC example
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --image_shape IMAGE_SHAPE
-                        image's shape, format is like '173,46'
-  --train_file_list TRAIN_FILE_LIST
-                        path of the file which contains path list of train
-                        image files
-  --test_file_list TEST_FILE_LIST
-                        path of the file which contains path list of test
-                        image files
-  --batch_size BATCH_SIZE
-                        size of a mini-batch
-  --model_output_prefix MODEL_OUTPUT_PREFIX
-                        prefix of path for model to store (default:
-                        ./model.ctc)
-  --trainer_count TRAINER_COUNT
-                        number of training threads
-  --save_period_by_batch SAVE_PERIOD_BY_BATCH
-                        save model to disk every N batches
-  --num_passes NUM_PASSES
-                        number of passes to train (default: 1)
-```
+Options:
+  --train_file_list_path TEXT  The path of the file which contains path list
+                               of train image files.  [required]
+  --test_file_list_path TEXT   The path of the file which contains path list
+                               of test image files.  [required]
+  --model_save_dir TEXT        The path to save the trained models (default:
+                               'models').
+  --help                       Show this message and exit.
 
-The important options include:
+```
 
-- `image_shape` The size of the images.
-- `train_file_list` The list file of the training data; each line holds an image path plus the corresponding text, in the format:
+- `train_file_list_path` The list file of the training data. Each line holds an image path and the corresponding text, in the format:
 ```
 word_1.png, "PROPER"
 word_2.png, "FOOD"
 ```
-- `test_file_list` The list file of the testing data, in the same format
-
-### Inference
-Inference is done by infer.py with the best-path decoding algorithm: at each time step, the character with the highest probability is chosen. Before using it, set the model path, the fixed image size, the batch_size, and the list file of the image files in infer.py. For example:
-```python
-model_path = "model.ctc-pass-9-batch-150-test.tar.gz"
-image_shape = "173,46"
-batch_size = 50
-infer_file_list = 'data/test_data/Challenge2_Test_Task3_GT.txt'
-```
-Then run ```python infer.py```
-
+- `test_file_list_path` The list file of the testing data, in the same format.
+- `model_save_dir` The directory in which model parameters are saved; defaults to the `models` directory under the current directory.
 
 ### Concrete steps
 
 1. Download the data from the official website \[[2](#references)\] (Task 2.3: Word Recognition (2013 edition)). There are three files: Challenge2_Training_Task3_Images_GT.zip, Challenge2_Test_Task3_Images.zip, and Challenge2_Test_Task3_GT.txt,
 which hold, respectively, the training images with their words, the test images, and the words of the test data. Then run the following commands to unpack the data and move it into the target folders:
 
-```
+```bash
 mkdir -p data/train_data
 mkdir -p data/test_data
 unzip Challenge2_Training_Task3_Images_GT.zip -d data/train_data
@@ -127,16 +123,26 @@ unzip Challenge2_Test_Task3_Images.zip -d data/test_data
 mv Challenge2_Test_Task3_GT.txt data/test_data
 ```
 
-2. Note the path of `gt.txt` in the training data folder (data/train_data) and the path of `Challenge2_Test_Task3_GT.txt` in the test data folder (data/test_data)
+2. Note the path of `gt.txt` in the training data folder (data/train_data) and the path of `Challenge2_Test_Task3_GT.txt` in the test data folder (data/test_data).
 
-3. Run the command
+3. Run the following command to start training:
+```bash
+python train.py \
+--train_file_list_path 'data/train_data/gt.txt' \
+--test_file_list_path 'data/test_data/Challenge2_Test_Task3_GT.txt'
 ```
-python train.py --train_file_list data/train_data/gt.txt --test_file_list data/test_data/Challenge2_Test_Task3_GT.txt --image_shape '173,46'
-```
 
-4. During training, model parameters are automatically backed up to the specified directory, ./model.ctc by default
+4. During training, model parameters are automatically backed up to the specified directory, the `./models` directory by default.
 
-5. Set the relevant parameters in infer.py (the path of the model) and run ```python infer.py``` for inference
+### Inference
+Inference is done by `infer.py` using best-path decoding: at every time step, the character with the highest probability is chosen (a minimal decoding sketch follows below). Specify the model path, the fixed image size, the batch_size (10 by default), and the list file of the image files, then run:
+```bash
+python infer.py \
+--model_path 'models/params_pass_00000.tar.gz' \
+--image_shape '173,46' \
+--infer_file_list_path 'data/test_data/Challenge2_Test_Task3_GT.txt'
+```
+The decoded text of every image is printed together with its ground-truth label.
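+
+Best-path decoding itself takes only a few lines. Below is a minimal NumPy sketch of what `decoder.ctc_greedy_decoder` computes, assuming (as in `model.py`) that the CTC blank label takes the last index and that `vocabulary` maps ids to characters (e.g. `AsciiDic().id2word()`):
+
+```python
+import numpy as np
+
+
+def greedy_ctc_decode(probs_seq, vocabulary):
+    # probs_seq: a num_steps x (len(vocabulary) + 1) matrix of per-step
+    # probabilities; the last column is the CTC blank.
+    blank = len(vocabulary)
+    best_path = np.argmax(probs_seq, axis=1)
+    decoded = []
+    prev = None
+    for idx in best_path:
+        # Merge consecutive repeats, then drop blanks.
+        if idx != prev and idx != blank:
+            decoded.append(vocabulary[idx])
+        prev = idx
+    return ''.join(decoded)
+```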
 
 ### Other datasets
 
@@ -146,7 +152,7 @@ python train.py --train_file_list data/train_data/gt.txt --test_file_list data/t
 ### Notes
 
 - Since `warp CTC`, on which the model depends, only has a CUDA implementation, this model can only run on GPU.
-- The model has many parameters and uses a lot of GPU memory; adjust batch_size to control the memory footprint when running.
+- The model has many parameters and uses a lot of GPU memory; adjust `batch_size` to control the memory footprint when running.
 - The dataset used in this example is small; other, larger datasets \[[3](#references)\] can be used to train the model you need.
 
 ## References
diff --git a/scene_text_recognition/infer.py b/scene_text_recognition/infer.py
index ff1f43be..b53c600b 100644
--- a/scene_text_recognition/infer.py
+++ b/scene_text_recognition/infer.py
@@ -1,11 +1,11 @@
-import logging
-import argparse
+import click
 import gzip
 import paddle.v2 as paddle
 
 from model import Model
-from data_provider import get_file_list, AsciiDic, ImageDataset
+from reader import DataGenerator
 from decoder import ctc_greedy_decoder
+from utils import AsciiDic, get_file_list
 
 
 def infer_batch(inferer, test_batch, labels):
@@ -15,9 +15,8 @@ def infer_batch(inferer, test_batch, labels):
         infer_results[i * num_steps:(i + 1) * num_steps]
         for i in xrange(0, len(test_batch))
     ]
-    results = []
 
-    # best path decode
+    # Best path decode.
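+    # Decode each sample with the best-path rule and print the decoded
+    # text together with its ground-truth label.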
     for i, probs in enumerate(probs_split):
         output_transcription = ctc_greedy_decoder(
             probs_seq=probs, vocabulary=AsciiDic().id2word())
@@ -28,21 +27,42 @@ def infer_batch(inferer, test_batch, labels):
             (result, label))
 
 
-def infer(model_path, image_shape, batch_size, infer_file_list):
+@click.command('infer')
+@click.option(
+    "--model_path",
+    type=str,
+    required=True,
+    help=("The path of the saved model."))
+@click.option(
+    "--image_shape",
+    type=str,
+    required=True,
+    help=("The fixed size for image dataset (format is like: '173,46')."))
+@click.option(
+    "--batch_size",
+    type=int,
+    default=10,
+    help=("The number of examples in one batch (default: 10)."))
+@click.option(
+    "--infer_file_list_path",
+    type=str,
+    required=True,
+    help=("The path of the file which contains "
+          "path list of image files for inference."))
+def infer(model_path, image_shape, batch_size, infer_file_list_path):
     image_shape = tuple(map(int, image_shape.split(',')))
-    infer_generator = get_file_list(infer_file_list)
-
-    dataset = ImageDataset(None, None, infer_generator, image_shape, True)
+    infer_file_list = get_file_list(infer_file_list_path)
+    char_dict = AsciiDic()
+    dict_size = char_dict.size()
+    data_generator = DataGenerator(char_dict=char_dict, image_shape=image_shape)
 
-    paddle.init(use_gpu=True, trainer_count=4)
+    paddle.init(use_gpu=True, trainer_count=1)
     parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
-    model = Model(AsciiDic().size(), image_shape, is_infer=True)
+    model = Model(dict_size, image_shape, is_infer=True)
     inferer = paddle.inference.Inference(
         output_layer=model.log_probs, parameters=parameters)
 
     test_batch = []
     labels = []
-    for i, (image, label) in enumerate(dataset.infer()):
+    for i, (image,
+            label) in enumerate(data_generator.infer_reader(infer_file_list)()):
         test_batch.append([image])
         labels.append(label)
         if len(test_batch) == batch_size:
@@ -54,9 +74,4 @@ def infer(model_path, image_shape, batch_size, infer_file_list):
 
 
 if __name__ == "__main__":
-    model_path = "model.ctc-pass-9-batch-150-test.tar.gz"
-    image_shape = "173,46"
-    batch_size = 50
-    infer_file_list = 'data/test_data/Challenge2_Test_Task3_GT.txt'
-
-    infer(model_path, image_shape, batch_size, infer_file_list)
+    infer()
diff --git a/scene_text_recognition/model.py b/scene_text_recognition/model.py
index 2ea1240d..86dd852c 100644
--- a/scene_text_recognition/model.py
+++ b/scene_text_recognition/model.py
@@ -3,16 +3,17 @@ from paddle.v2 import layer
 from paddle.v2 import evaluator
 from paddle.v2.activation import Relu, Linear
 from paddle.v2.networks import img_conv_group, simple_gru
+from config import ModelConfig as conf
 
 
 class Model(object):
     def __init__(self, num_classes, shape, is_infer=False):
         '''
-        :param num_classes: size of the character dict.
+        :param num_classes: The size of the character dict.
         :type num_classes: int
-        :param shape: size of the input images.
+        :param shape: The size of the input images.
         :type shape: tuple of 2 int
-        :param is_infer: infer mode or not
-        :type shape: bool
+        :param is_infer: Whether the model is built for inference.
+        :type is_infer: bool
         '''
         self.num_classes = num_classes
@@ -24,39 +25,50 @@ class Model(object):
         self.__build_nn__()
 
     def __declare_input_layers__(self):
-        # image input as a float vector
+        '''
+        Define the input layers.
+        '''
+        # Image input as a float vector.
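+        # The height and width arguments let PaddlePaddle recover the
+        # 2-D layout of the image from the flattened dense vector.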
self.image = layer.data( name='image', type=paddle.data_type.dense_vector(self.image_vector_size), height=self.shape[0], width=self.shape[1]) - # label input as a ID list - if self.is_infer == False: + # Label input as an ID list + if not self.is_infer: self.label = layer.data( name='label', type=paddle.data_type.integer_value_sequence(self.num_classes)) def __build_nn__(self): - # CNN output image features, 128 float matrixes - conv_features = self.conv_groups(self.image, 8, True) + ''' + Build the network topology. + ''' + # CNN output image features. + conv_features = self.conv_groups(self.image, conf.filter_num, + conf.with_bn) - # cutting CNN output into a sequence of feature vectors, which are + # Cut CNN output into a sequence of feature vectors, which are # 1 pixel wide and 11 pixel high. sliced_feature = layer.block_expand( input=conv_features, - num_channels=128, - stride_x=1, - stride_y=1, - block_x=1, - block_y=11) + num_channels=conf.num_channels, + stride_x=conf.stride_x, + stride_y=conf.stride_y, + block_x=conf.block_x, + block_y=conf.block_y) # RNNs to capture sequence information forwards and backwards. - gru_forward = simple_gru(input=sliced_feature, size=128, act=Relu()) + gru_forward = simple_gru( + input=sliced_feature, size=conf.hidden_size, act=Relu()) gru_backward = simple_gru( - input=sliced_feature, size=128, act=Relu(), reverse=True) + input=sliced_feature, + size=conf.hidden_size, + act=Relu(), + reverse=True) - # map each step of RNN to character distribution. + # Map each step of RNN to character distribution. self.output = layer.fc( input=[gru_forward, gru_backward], size=self.num_classes + 1, @@ -66,31 +78,31 @@ class Model(object): input=paddle.layer.identity_projection(input=self.output), act=paddle.activation.Softmax()) - # warp CTC to calculate cost for a CTC task. - if self.is_infer == False: + # Use warp CTC to calculate cost for a CTC task. + if not self.is_infer: self.cost = layer.warp_ctc( input=self.output, label=self.label, size=self.num_classes + 1, - norm_by_times=True, + norm_by_times=conf.norm_by_times, blank=self.num_classes) self.eval = evaluator.ctc_error(input=self.output, label=self.label) - def conv_groups(self, input_image, num, with_bn): + def conv_groups(self, input, num, with_bn): ''' - :param input_image: input image. - :type input_image: LayerOutput - :param num: number of CONV filters. + :param input: Input layer. + :type input: LayerOutput + :param num: Number of the filters. :type num: int - :param with_bn: whether with batch normal. + :param with_bn: Whether with batch normalization. 
        :type with_bn: bool
         '''
         assert num % 4 == 0
 
-        filter_num_list = [16, 32, 64, 128]
+        filter_num_list = conf.filter_num_list
 
         is_input_image = True
-        tmp = input_image
+        tmp = input
 
         for num_filter in filter_num_list:
 
@@ -103,12 +115,12 @@ class Model(object):
             tmp = img_conv_group(
                 input=tmp,
                 num_channels=num_channels,
-                conv_padding=1,
+                conv_padding=conf.conv_padding,
                 conv_num_filter=[num_filter] * (num / 4),
-                conv_filter_size=3,
+                conv_filter_size=conf.conv_filter_size,
                 conv_act=Relu(),
                 conv_with_batchnorm=with_bn,
-                pool_size=2,
-                pool_stride=2, )
+                pool_size=conf.pool_size,
+                pool_stride=conf.pool_stride, )
 
         return tmp
diff --git a/scene_text_recognition/reader.py b/scene_text_recognition/reader.py
new file mode 100644
index 00000000..013477ad
--- /dev/null
+++ b/scene_text_recognition/reader.py
@@ -0,0 +1,62 @@
+import os
+import cv2
+
+from paddle.v2.image import load_image
+
+
+class DataGenerator(object):
+    def __init__(self, char_dict, image_shape):
+        '''
+        :param char_dict: The dictionary class for labels.
+        :type char_dict: class
+        :param image_shape: The fixed shape of images.
+        :type image_shape: tuple
+        '''
+        self.image_shape = image_shape
+        self.char_dict = char_dict
+
+    def train_reader(self, file_list):
+        '''
+        Reader interface for training.
+
+        :param file_list: The list of image paths and labels for training.
+        :type file_list: list
+        '''
+
+        def reader():
+            for i, (image, label) in enumerate(file_list):
+                yield self.load_image(image), self.char_dict.word2ids(label)
+
+        return reader
+
+    def infer_reader(self, file_list):
+        '''
+        Reader interface for inference.
+
+        :param file_list: The list of image paths and labels for inference.
+        :type file_list: list
+        '''
+
+        def reader():
+            for i, (image, label) in enumerate(file_list):
+                yield self.load_image(image), label
+
+        return reader
+
+    def load_image(self, path):
+        '''
+        Load an image and transform it into a 1-dimension vector.
+
+        :param path: The path of the image data.
+        :type path: str
+        '''
+        image = load_image(path)
+        image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
+
+        # Resize all images to a fixed shape.
+        if self.image_shape:
+            image = cv2.resize(
+                image, self.image_shape, interpolation=cv2.INTER_CUBIC)
+
+        image = image.flatten() / 255.
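+        # image is now a flat float vector with
+        # image_shape[0] * image_shape[1] entries, scaled to [0, 1].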
+ return image diff --git a/scene_text_recognition/requirements.txt b/scene_text_recognition/requirements.txt new file mode 100644 index 00000000..eb8ed79b --- /dev/null +++ b/scene_text_recognition/requirements.txt @@ -0,0 +1,2 @@ +click +opencv-python \ No newline at end of file diff --git a/scene_text_recognition/train.py b/scene_text_recognition/train.py index 212102c5..557f1ba5 100644 --- a/scene_text_recognition/train.py +++ b/scene_text_recognition/train.py @@ -1,109 +1,91 @@ -import logging -import argparse import gzip +import os +import click import paddle.v2 as paddle +from config import TrainerConfig as conf from model import Model -from data_provider import get_file_list, AsciiDic, ImageDataset +from reader import DataGenerator +from utils import get_file_list, AsciiDic -parser = argparse.ArgumentParser(description="PaddlePaddle CTC example") -parser.add_argument( - '--image_shape', - type=str, - required=True, - help="image's shape, format is like '173,46'") -parser.add_argument( - '--train_file_list', + +@click.command('train') +@click.option( + "--train_file_list_path", type=str, required=True, - help='path of the file which contains path list of train image files') -parser.add_argument( - '--test_file_list', + help=("The path of the file which contains " + "path list of train image files.")) +@click.option( + "--test_file_list_path", type=str, required=True, - help='path of the file which contains path list of test image files') -parser.add_argument( - '--batch_size', type=int, default=5, help='size of a mini-batch') -parser.add_argument( - '--model_output_prefix', + help=("The path of the file which contains " + "path list of test image files.")) +@click.option( + "--model_save_dir", type=str, - default='model.ctc', - help='prefix of path for model to store (default: ./model.ctc)') -parser.add_argument( - '--trainer_count', type=int, default=4, help='number of training threads') -parser.add_argument( - '--save_period_by_batch', - type=int, - default=150, - help='save model to disk every N batches') -parser.add_argument( - '--num_passes', - type=int, - default=10, - help='number of passes to train (default: 1)') - -args = parser.parse_args() - - -def main(): - image_shape = tuple(map(int, args.image_shape.split(','))) - - print 'image_shape', image_shape - print 'batch_size', args.batch_size - print 'train_file_list', args.train_file_list - print 'test_file_list', args.test_file_list - - train_generator = get_file_list(args.train_file_list) - test_generator = get_file_list(args.test_file_list) - infer_generator = None - - dataset = ImageDataset( - train_generator, - test_generator, - infer_generator, - fixed_shape=image_shape, - is_infer=False) - - paddle.init(use_gpu=True, trainer_count=args.trainer_count) - - model = Model(AsciiDic().size(), image_shape, is_infer=False) + default="models", + help="The path to save the trained models (default: 'models').") +def train(train_file_list_path, test_file_list_path, model_save_dir): + + if not os.path.exists(model_save_dir): + os.mkdir(model_save_dir) + train_file_list = get_file_list(train_file_list_path) + test_file_list = get_file_list(test_file_list_path) + char_dict = AsciiDic() + dict_size = char_dict.size() + data_generator = DataGenerator( + char_dict=char_dict, image_shape=conf.image_shape) + + paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count) + # Create optimizer. + optimizer = paddle.optimizer.Momentum(momentum=conf.momentum) + # Define network topology. 
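+    # dict_size does not count the CTC blank label; Model appends it
+    # internally (its output size is num_classes + 1, with the blank at
+    # the last index).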
+    model = Model(dict_size, conf.image_shape, is_infer=False)
+    # Create all the trainable parameters.
     params = paddle.parameters.create(model.cost)
-    optimizer = paddle.optimizer.Momentum(momentum=0)
 
     trainer = paddle.trainer.SGD(
         cost=model.cost,
         parameters=params,
         update_equation=optimizer,
         extra_layers=model.eval)
+    # Feeding dictionary.
+    feeding = {'image': 0, 'label': 1}
 
     def event_handler(event):
         if isinstance(event, paddle.event.EndIteration):
-            if event.batch_id % 100 == 0:
-                print "Pass %d, batch %d, Samples %d, Cost %f, Eval %s" % (
-                    event.pass_id, event.batch_id,
-                    event.batch_id * args.batch_size, event.cost, event.metrics)
-
-            if event.batch_id > 0 and event.batch_id % args.save_period_by_batch == 0:
-                result = trainer.test(
-                    reader=paddle.batch(dataset.test, batch_size=10),
-                    feeding={'image': 0,
-                             'label': 1})
-                print "Test %d-%d, Cost %f, Eval %s" % (
-                    event.pass_id, event.batch_id, result.cost, result.metrics)
-
-                path = "{}-pass-{}-batch-{}-test.tar.gz".format(
-                    args.model_output_prefix, event.pass_id, event.batch_id)
-                with gzip.open(path, 'w') as f:
-                    params.to_tar(f)
+            if event.batch_id % conf.log_period == 0:
+                print("Pass %d, batch %d, Samples %d, Cost %f, Eval %s" %
+                      (event.pass_id, event.batch_id, event.batch_id *
+                       conf.batch_size, event.cost, event.metrics))
+
+        if isinstance(event, paddle.event.EndPass):
+            # Because the training and testing data share the same format,
+            # we still use reader.train_reader to read the testing data.
+            result = trainer.test(
+                reader=paddle.batch(
+                    data_generator.train_reader(test_file_list),
+                    batch_size=conf.batch_size),
+                feeding=feeding)
+            print("Test %d, Cost %f, Eval %s" %
+                  (event.pass_id, result.cost, result.metrics))
+            with gzip.open(
+                    os.path.join(model_save_dir, "params_pass_%05d.tar.gz" %
+                                 event.pass_id), "w") as f:
+                trainer.save_parameter_to_tar(f)
 
     trainer.train(
         reader=paddle.batch(
-            paddle.reader.shuffle(dataset.train, buf_size=500),
-            batch_size=args.batch_size),
-        feeding={'image': 0,
-                 'label': 1},
+            paddle.reader.shuffle(
+                data_generator.train_reader(train_file_list),
+                buf_size=conf.buf_size),
+            batch_size=conf.batch_size),
+        feeding=feeding,
         event_handler=event_handler,
-        num_passes=args.num_passes)
+        num_passes=conf.num_passes)
 
 
 if __name__ == "__main__":
-    main()
+    train()
diff --git a/scene_text_recognition/utils.py b/scene_text_recognition/utils.py
new file mode 100644
index 00000000..dd43113a
--- /dev/null
+++ b/scene_text_recognition/utils.py
@@ -0,0 +1,59 @@
+import os
+
+
+class AsciiDic(object):
+    UNK_ID = 0
+
+    def __init__(self):
+        self.dic = {
+            '': self.UNK_ID,
+        }
+        self.chars = [chr(i) for i in range(40, 171)]
+        for id, c in enumerate(self.chars):
+            self.dic[c] = id + 1
+
+    def lookup(self, w):
+        return self.dic.get(w, self.UNK_ID)
+
+    def id2word(self):
+        '''
+        Return a reversed char dict.
+        '''
+        self.id2word = {}
+        for key, value in self.dic.items():
+            self.id2word[value] = key
+
+        return self.id2word
+
+    def word2ids(self, word):
+        '''
+        Transform a word into a list of ids.
+
+        :param word: The word that appears in the image data.
+        :type word: str
+        '''
+        return [self.lookup(c) for c in list(word)]
+
+    def size(self):
+        return len(self.dic)
+
+
+def get_file_list(image_file_list):
+    '''
+    Generate the file list for the training and testing data.
+
+    :param image_file_list: The path of the file which contains
+                            the path list of image files.
+    :type image_file_list: str
+    '''
+    dirname = os.path.dirname(image_file_list)
+    path_list = []
+    with open(image_file_list) as f:
+        for line in f:
+            line_split = line.strip().split(',', 1)
+            filename = line_split[0].strip()
+            path = os.path.join(dirname, filename)
+            label = line_split[1][2:-1]
+            path_list.append((path, label))
+
+    return path_list
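+
+
+# Example usage (paths are illustrative):
+#   file_list = get_file_list('data/train_data/gt.txt')
+#   image_path, label = file_list[0]
+#   label_ids = AsciiDic().word2ids(label)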
-- GitLab