From 13cd4dc0a13959b0e433cffb83a7312e8f66b2ec Mon Sep 17 00:00:00 2001 From: peterzhang2029 Date: Sun, 15 Oct 2017 21:00:21 +0800 Subject: [PATCH] refine docstring and notation --- nested_sequence/text_classification/README.md | 19 +++++++---- nested_sequence/text_classification/config.py | 29 +++++++++------- .../text_classification/index.html | 19 +++++++---- nested_sequence/text_classification/infer.py | 4 ++- .../text_classification/network_conf.py | 6 ++-- nested_sequence/text_classification/train.py | 10 +++--- nested_sequence/text_classification/utils.py | 33 +++++++++++++++++++ 7 files changed, 88 insertions(+), 32 deletions(-) diff --git a/nested_sequence/text_classification/README.md b/nested_sequence/text_classification/README.md index ce019662..dbc1b4a5 100644 --- a/nested_sequence/text_classification/README.md +++ b/nested_sequence/text_classification/README.md @@ -76,8 +76,8 @@ pip install -r requirements.txt ## 指定训练配置参数 -`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下: -``` +通过 `config.py` 脚本修改训练和模型配置参数,脚本中有对可配置参数的详细解释,示例如下: +```python class TrainerConfig(object): # whether to use GPU for training @@ -98,8 +98,7 @@ class ModelConfig(object): ... ``` -用户可以对具体参数进行设置实现训练, 例如通过设置 `use_gpu` 参数来指定是否使用 GPU - 进行训练。 +修改 `config.py` 对参数进行调整。例如,通过修改 `use_gpu` 参数来指定是否使用 GPU 进行训练。 ## 使用 PaddlePaddle 内置数据运行 @@ -200,7 +199,11 @@ Options: 修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行: ```bash -python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' +python train.py \ + --train_data_dir 'data/train_data' \ + --test_data_dir 'data/test_data' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' ``` 即可对样例数据进行训练。 @@ -226,7 +229,11 @@ Options: 2.以`data`目录下的示例数据为例,在终端执行: ```bash -python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz' +python infer.py \ + --data_path 'data/infer.txt' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' \ + --model_path 'models/params_pass_00000.tar.gz' ``` 即可对样例数据进行预测。 diff --git a/nested_sequence/text_classification/config.py b/nested_sequence/text_classification/config.py index 4461626f..1a6e4681 100644 --- a/nested_sequence/text_classification/config.py +++ b/nested_sequence/text_classification/config.py @@ -3,37 +3,44 @@ __all__ = ["TrainerConfig", "ModelConfig"] class TrainerConfig(object): - # whether to use GPU for training + # Whether to use GPU in training or not. use_gpu = False - # the number of threads used in one machine + # The number of computing threads. trainer_count = 1 - # train batch size + # The training batch size. batch_size = 32 - # number of pass during training + # The epoch number. num_passes = 10 - # learning rate for optimizer + # The global learning rate. learning_rate = 1e-3 - # learning rate for L2Regularization + # The decay rate for L2Regularization l2_learning_rate = 1e-3 - # average_window for ModelAverage + # This parameter is used for the averaged SGD. + # About the average_window * (number of the processed batch) parameters + # are used for average. + # To be accurate, between average_window *(number of the processed batch) + # and 2 * average_window * (number of the processed batch) parameters + # are used for average. average_window = 0.5 - # buffer size for shuffling + # The buffer size of the data reader. + # The number of buffer size samples will be shuffled in training. 
buf_size = 1000 - # log progress every log_period batches + # The parameter is used to control logging period. + # Training log will be printed every log_period. log_period = 100 class ModelConfig(object): - # embedding vector dimension + # The dimension of embedding vector. emb_size = 28 - # size of sentence vector representation and fc layer in cnn + # The hidden size of sentence vectors. hidden_size = 128 diff --git a/nested_sequence/text_classification/index.html b/nested_sequence/text_classification/index.html index 2b2c9dee..005de924 100644 --- a/nested_sequence/text_classification/index.html +++ b/nested_sequence/text_classification/index.html @@ -118,8 +118,8 @@ pip install -r requirements.txt ## 指定训练配置参数 -`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下: -``` +通过 `config.py` 脚本修改训练和模型配置参数,脚本中有对可配置参数的详细解释,示例如下: +```python class TrainerConfig(object): # whether to use GPU for training @@ -140,8 +140,7 @@ class ModelConfig(object): ... ``` -用户可以对具体参数进行设置实现训练, 例如通过设置 `use_gpu` 参数来指定是否使用 GPU - 进行训练。 +修改 `config.py` 对参数进行调整。例如,通过修改 `use_gpu` 参数来指定是否使用 GPU 进行训练。 ## 使用 PaddlePaddle 内置数据运行 @@ -242,7 +241,11 @@ Options: 修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行: ```bash -python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' +python train.py \ + --train_data_dir 'data/train_data' \ + --test_data_dir 'data/test_data' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' ``` 即可对样例数据进行训练。 @@ -268,7 +271,11 @@ Options: 2.以`data`目录下的示例数据为例,在终端执行: ```bash -python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz' +python infer.py \ + --data_path 'data/infer.txt' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' \ + --model_path 'models/params_pass_00000.tar.gz' ``` 即可对样例数据进行预测。 diff --git a/nested_sequence/text_classification/infer.py b/nested_sequence/text_classification/infer.py index 00204c96..461eba49 100644 --- a/nested_sequence/text_classification/infer.py +++ b/nested_sequence/text_classification/infer.py @@ -58,6 +58,7 @@ def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path): word_reverse_dict = dict((value, key) for key, value in word_dict.iteritems()) + # The reversed label dict of the imdb dataset label_reverse_dict = {0: "positive", 1: "negative"} test_reader = reader.imdb_test(word_dict) class_num = 2 @@ -75,11 +76,12 @@ def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path): test_reader = reader.infer_reader(data_path, word_dict)() dict_dim = len(word_dict) - prob_layer = nested_net(dict_dim, class_num, is_infer=True) # initialize PaddlePaddle. paddle.init(use_gpu=False, trainer_count=1) + prob_layer = nested_net(dict_dim, class_num, is_infer=True) + # load the trained models. parameters = paddle.parameters.Parameters.from_tar( gzip.open(model_path, "r")) diff --git a/nested_sequence/text_classification/network_conf.py b/nested_sequence/text_classification/network_conf.py index bee2c083..b4c40669 100644 --- a/nested_sequence/text_classification/network_conf.py +++ b/nested_sequence/text_classification/network_conf.py @@ -4,10 +4,10 @@ from config import ModelConfig as conf def cnn_cov_group(group_input, hidden_size): """ - Covolution group definition + Convolution group definition. :param group_input: The input of this layer. 
:type group_input: LayerOutput
-    :params hidden_size: Size of FC layer.
+    :params hidden_size: The size of the fully connected layer.
     :type hidden_size: int
     """
     conv3 = paddle.networks.sequence_conv_pool(
@@ -32,7 +32,7 @@ def nested_net(dict_dim, class_num, is_infer=False):
     :type dict_dim: int
     :params class_num: Number of instance class.
     :type class_num: int
-    :params is_infer: The boolean parameter
+    :params is_infer: The boolean parameter indicating inferring or training.
     :type is_infer: bool
     """
diff --git a/nested_sequence/text_classification/train.py b/nested_sequence/text_classification/train.py
index 863a0b47..a0da1ad0 100644
--- a/nested_sequence/text_classification/train.py
+++ b/nested_sequence/text_classification/train.py
@@ -37,7 +37,7 @@ from config import TrainerConfig as conf
         "--label_dict_path",
         type=str,
         default=None,
-        help=("The path of label dictionary (default: None)."
+        help=("The path of label dictionary (default: None). "
              "If this parameter is not set, imdb dataset will be used. "
              "If this parameter is set, but the file does not exist, "
              "label dictionary will be built from "
@@ -50,16 +50,16 @@ from config import TrainerConfig as conf
 def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path,
           model_save_dir):
     """
-    :params train_data_path: path of training data, if this parameter
+    :params train_data_path: The path of training data, if this parameter
         is not specified, imdb dataset will be used to run this example
     :type train_data_path: str
-    :params test_data_path: path of testing data, if this parameter
+    :params test_data_path: The path of testing data, if this parameter
         is not specified, imdb dataset will be used to run this example
     :type test_data_path: str
-    :params word_dict_path: path of word dictionary, if this parameter
+    :params word_dict_path: The path of word dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
     :type word_dict_path: str
-    :params label_dict_path: path of label dictionary, if this parameter
+    :params label_dict_path: The path of label dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
     :type label_dict_path: str
     :params model_save_dir: dir where models saved
diff --git a/nested_sequence/text_classification/utils.py b/nested_sequence/text_classification/utils.py
index b83fef17..1535e31f 100644
--- a/nested_sequence/text_classification/utils.py
+++ b/nested_sequence/text_classification/utils.py
@@ -7,6 +7,18 @@ logger.setLevel(logging.INFO)
 
 
 def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
+    """
+    Build word dictionary from training data.
+    :param data_dir: The directory of training dataset.
+    :type data_dir: str
+    :params save_path: The path where the word dictionary will be saved.
+    :type save_path: str
+    :params use_col: The column index of the text after splitting each line.
+    :type use_col: int
+    :params cutoff_fre: The word will not be added to the dictionary if its
+        frequency is less than cutoff_fre.
+    :type cutoff_fre: int
+    """
     values = defaultdict(int)
 
     for file_name in os.listdir(data_dir):
@@ -33,6 +45,15 @@ def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
 
 
 def build_label_dict(data_dir, save_path, use_col=0):
+    """
+    Build label dictionary from training data.
+    :param data_dir: The directory of training dataset.
+    :type data_dir: str
+    :params save_path: The path where the label dictionary will be saved.
+    :type save_path: str
+    :params use_col: The column index of the label after splitting each line.
+    :type use_col: int
+    """
     values = defaultdict(int)
 
     for file_name in os.listdir(data_dir):
@@ -53,10 +74,22 @@ def build_label_dict(data_dir, save_path, use_col=0):
 
 
 def load_dict(dict_path):
+    """
+    Load the word dictionary from the dictionary path.
+    :param dict_path: The path of the word dictionary.
+    :type dict_path: str
+    """
     return dict((line.strip().split("\t")[0], idx)
                 for idx, line in enumerate(open(dict_path, "r").readlines()))
 
 
 def load_reverse_dict(dict_path):
+    """
+    Load the reversed word dictionary from the dictionary path.
+    The index of each word is saved as the key of the dictionary and the
+    corresponding word as the value.
+    :param dict_path: The path of the word dictionary.
+    :type dict_path: str
+    """
     return dict((idx, line.strip().split("\t")[0])
                 for idx, line in enumerate(open(dict_path, "r").readlines()))
--
GitLab
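
As a quick illustration of the `utils.py` helpers whose docstrings this patch refines, the sketch below builds a word dictionary from the example data and loads it back. This is a minimal sketch, not part of the patch: the paths `data/train_data` and `word_dict.txt` follow the example in the README, and the script is assumed to run from `nested_sequence/text_classification` so that `utils` is importable.

```python
# Minimal usage sketch of the dictionary helpers in utils.py.
# Assumes the example layout from the README: training files live under
# data/train_data and each line splits into a label column and a text column.
from utils import build_word_dict, load_dict, load_reverse_dict

# Build the word dictionary from the text column (use_col=1), keeping every
# word that occurs at least once (cutoff_fre=1), and save it to disk.
build_word_dict(
    data_dir="data/train_data",
    save_path="word_dict.txt",
    use_col=1,
    cutoff_fre=1)

# load_dict maps word -> index; load_reverse_dict maps index -> word.
word_dict = load_dict("word_dict.txt")
reverse_dict = load_reverse_dict("word_dict.txt")

# The two mappings are inverses of each other, as the new docstrings describe.
assert all(reverse_dict[idx] == word for word, idx in word_dict.items())
```

The same pattern applies to `build_label_dict` with `use_col=0`, since the label is taken from the first column when each line is split.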