refine docstring and notation

13cd4dc0 · peterzhang2029 · 0096515a · 13cd4dc0 · 13cd4dc0 · 13cd4dc0
7 changed files
--- a/nested_sequence/text_classification/README.md
+++ b/nested_sequence/text_classification/README.md
@@ -76,8 +76,8 @@ pip install -r requirements.txt

 ## 指定训练配置参数

-`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下：
-```
+通过 `config.py` 脚本修改训练和模型配置参数，脚本中有对可配置参数的详细解释，示例如下：
+```python
 class TrainerConfig(object):

    # whether to use GPU for training
@@ -98,8 +98,7 @@ class ModelConfig(object):

    ...
 ```
-用户可以对具体参数进行设置实现训练， 例如通过设置 `use_gpu` 参数来指定是否使用 GPU
- 进行训练。
+修改 `config.py` 对参数进行调整。例如，通过修改 `use_gpu` 参数来指定是否使用 GPU 进行训练。

 ## 使用 PaddlePaddle 内置数据运行

@@ -200,7 +199,11 @@ Options:

 修改`train.py`脚本中的启动参数，可以直接运行本例。 以`data`目录下的示例数据为例，在终端执行：
 ```bash
-python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt'
+python train.py \
+  --train_data_dir 'data/train_data' \
+  --test_data_dir 'data/test_data' \
+  --word_dict_path 'word_dict.txt' \
+  --label_dict_path 'label_dict.txt'
 ```
 即可对样例数据进行训练。

@@ -226,7 +229,11 @@ Options:

 2.以`data`目录下的示例数据为例，在终端执行：
 ```bash
-python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz'
+python infer.py \
+  --data_path 'data/infer.txt' \
+  --word_dict_path 'word_dict.txt' \
+  --label_dict_path 'label_dict.txt' \
+  --model_path 'models/params_pass_00000.tar.gz'
 ```

 即可对样例数据进行预测。
--- a/nested_sequence/text_classification/config.py
+++ b/nested_sequence/text_classification/config.py
@@ -3,37 +3,44 @@ __all__ = ["TrainerConfig", "ModelConfig"]

 class TrainerConfig(object):

-    # whether to use GPU for training
+    # Whether to use GPU in training or not.
    use_gpu = False
-    # the number of threads used in one machine
+    # The number of computing threads.
    trainer_count = 1

-    # train batch size
+    # The training batch size.
    batch_size = 32

-    # number of pass during training
+    # The epoch number.
    num_passes = 10

-    # learning rate for optimizer
+    # The global learning rate.
    learning_rate = 1e-3

-    # learning rate for L2Regularization
+    # The decay rate for L2Regularization.
    l2_learning_rate = 1e-3

-    # average_window for ModelAverage
+    # The averaging window ratio used by the averaged SGD.
+    # Roughly average_window * (number of processed batches) parameters
+    # are used for the average.
+    # More precisely, the number of averaged parameters lies between
+    # average_window * (number of processed batches) and
+    # 2 * average_window * (number of processed batches).
    average_window = 0.5

-    # buffer size for shuffling
+    # The buffer size of the data reader.
+    # Samples are shuffled within a buffer of buf_size samples in training.
    buf_size = 1000

-    # log progress every log_period batches
+    # This parameter controls the logging period.
+    # The training log is printed every log_period batches.
    log_period = 100


 class ModelConfig(object):

-    # embedding vector dimension
+    # The dimension of embedding vector.
    emb_size = 28

-    # size of sentence vector representation and fc layer in cnn
+    # The hidden size of sentence vectors.
    hidden_size = 128
--- a/nested_sequence/text_classification/index.html
+++ b/nested_sequence/text_classification/index.html
@@ -118,8 +118,8 @@ pip install -r requirements.txt

 ## 指定训练配置参数

-`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下：
-```
+通过 `config.py` 脚本修改训练和模型配置参数，脚本中有对可配置参数的详细解释，示例如下：
+```python
 class TrainerConfig(object):

    # whether to use GPU for training
@@ -140,8 +140,7 @@ class ModelConfig(object):

    ...
 ```
-用户可以对具体参数进行设置实现训练， 例如通过设置 `use_gpu` 参数来指定是否使用 GPU
- 进行训练。
+修改 `config.py` 对参数进行调整。例如，通过修改 `use_gpu` 参数来指定是否使用 GPU 进行训练。

 ## 使用 PaddlePaddle 内置数据运行

@@ -242,7 +241,11 @@ Options:

 修改`train.py`脚本中的启动参数，可以直接运行本例。 以`data`目录下的示例数据为例，在终端执行：
 ```bash
-python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt'
+python train.py \
+  --train_data_dir 'data/train_data' \
+  --test_data_dir 'data/test_data' \
+  --word_dict_path 'word_dict.txt' \
+  --label_dict_path 'label_dict.txt'
 ```
 即可对样例数据进行训练。

@@ -268,7 +271,11 @@ Options:

 2.以`data`目录下的示例数据为例，在终端执行：
 ```bash
-python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz'
+python infer.py \
+  --data_path 'data/infer.txt' \
+  --word_dict_path 'word_dict.txt' \
+  --label_dict_path 'label_dict.txt' \
+  --model_path 'models/params_pass_00000.tar.gz'
 ```

 即可对样例数据进行预测。

--- a/nested_sequence/text_classification/infer.py
+++ b/nested_sequence/text_classification/infer.py
@@ -58,6 +58,7 @@ def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path):
        word_reverse_dict = dict((value, key)
                                 for key, value in word_dict.iteritems())

+        # The reversed label dict of the imdb dataset.
        label_reverse_dict = {0: "positive", 1: "negative"}
        test_reader = reader.imdb_test(word_dict)
        class_num = 2
@@ -75,11 +76,12 @@ def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path):
        test_reader = reader.infer_reader(data_path, word_dict)()

    dict_dim = len(word_dict)
-    prob_layer = nested_net(dict_dim, class_num, is_infer=True)

    # initialize PaddlePaddle.
    paddle.init(use_gpu=False, trainer_count=1)

+    prob_layer = nested_net(dict_dim, class_num, is_infer=True)
+
    # load the trained models.
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(model_path, "r"))

--- a/nested_sequence/text_classification/network_conf.py
+++ b/nested_sequence/text_classification/network_conf.py
@@ -4,10 +4,10 @@ from config import ModelConfig as conf

 def cnn_cov_group(group_input, hidden_size):
    """
-    Covolution group definition
+    Convolution group definition.
    :param group_input: The input of this layer.
    :type group_input: LayerOutput
-    :params hidden_size: Size of FC layer.
+    :param hidden_size: The size of the fully connected layer.
    :type hidden_size: int
    """
    conv3 = paddle.networks.sequence_conv_pool(
@@ -32,7 +32,7 @@ def nested_net(dict_dim, class_num, is_infer=False):
    :type dict_dim: int
    :params class_num: Number of instance class.
    :type class_num: int
-    :params is_infer: The boolean parameter 
+    :params is_infer: The boolean parameter
                        indicating inferring or training.
    :type is_infer: bool
    """

--- a/nested_sequence/text_classification/train.py
+++ b/nested_sequence/text_classification/train.py
@@ -37,7 +37,7 @@ from config import TrainerConfig as conf
    "--label_dict_path",
    type=str,
    default=None,
-    help=("The path of label dictionary (default: None)."
+    help=("The path of label dictionary (default: None). "
          "If this parameter is not set, imdb dataset will be used. "
          "If this parameter is set, but the file does not exist, "
          "label dictionay will be built from "
@@ -50,16 +50,16 @@ from config import TrainerConfig as conf
 def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path,
          model_save_dir):
    """
-    :params train_data_path: path of training data, if this parameter
+    :params train_data_path: The path of training data, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type train_data_path: str
-    :params test_data_path: path of testing data, if this parameter
+    :params test_data_path: The path of testing data, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type test_data_path: str
-    :params word_dict_path: path of word dictionary, if this parameter
+    :params word_dict_path: The path of word dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type word_dict_path: str
-    :params label_dict_path: path of label dictionary, if this parameter
+    :params label_dict_path: The path of label dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
    :type label_dict_path: str
    :params model_save_dir: dir where models saved

--- a/nested_sequence/text_classification/utils.py
+++ b/nested_sequence/text_classification/utils.py
@@ -7,6 +7,18 @@ logger.setLevel(logging.INFO)


 def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
+    """
+    Build word dictionary from training data.
+    :param data_dir: The directory of training dataset.
+    :type data_dir: str
+    :param save_path: The path where the word dictionary will be saved.
+    :type save_path: str
+    :param use_col: The index of the text column after each line is split.
+    :type use_col: int
+    :param cutoff_fre: A word will not be added to the dictionary if its
+                    frequency is less than cutoff_fre.
+    :type cutoff_fre: int
+    """
    values = defaultdict(int)

    for file_name in os.listdir(data_dir):
@@ -33,6 +45,15 @@ def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):


 def build_label_dict(data_dir, save_path, use_col=0):
+    """
+    Build label dictionary from training data.
+    :param data_dir: The directory of training dataset.
+    :type data_dir: str
+    :param save_path: The path where the label dictionary will be saved.
+    :type save_path: str
+    :param use_col: The index of the label column after each line is split.
+    :type use_col: int
+    """
    values = defaultdict(int)

    for file_name in os.listdir(data_dir):
@@ -53,10 +74,22 @@ def build_label_dict(data_dir, save_path, use_col=0):


 def load_dict(dict_path):
+    """
+    Load word dictionary from dictionary path.
+    :param dict_path: The path of word dictionary.
+    :type dict_path: str
+    """
    return dict((line.strip().split("\t")[0], idx)
                for idx, line in enumerate(open(dict_path, "r").readlines()))


 def load_reverse_dict(dict_path):
+    """
+    Load the reversed word dictionary from dictionary path.
+    The keys of the returned dictionary are word indices and the values
+    are the corresponding words.
+    :param dict_path: The path of word dictionary.
+    :type dict_path: str
+    """
    return dict((idx, line.strip().split("\t")[0])
                for idx, line in enumerate(open(dict_path, "r").readlines()))