From 13cd4dc0a13959b0e433cffb83a7312e8f66b2ec Mon Sep 17 00:00:00 2001 From: peterzhang2029 Date: Sun, 15 Oct 2017 21:00:21 +0800 Subject: [PATCH] refine docstring and notation --- nested_sequence/text_classification/README.md | 19 +++++++---- nested_sequence/text_classification/config.py | 29 +++++++++------- .../text_classification/index.html | 19 +++++++---- nested_sequence/text_classification/infer.py | 4 ++- .../text_classification/network_conf.py | 6 ++-- nested_sequence/text_classification/train.py | 10 +++--- nested_sequence/text_classification/utils.py | 33 +++++++++++++++++++ 7 files changed, 88 insertions(+), 32 deletions(-) diff --git a/nested_sequence/text_classification/README.md b/nested_sequence/text_classification/README.md index ce019662..dbc1b4a5 100644 --- a/nested_sequence/text_classification/README.md +++ b/nested_sequence/text_classification/README.md @@ -76,8 +76,8 @@ pip install -r requirements.txt ## 指定训练配置参数 -`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下: -``` +通过 `config.py` 脚本修改训练和模型配置参数,脚本中有对可配置参数的详细解释,示例如下: +```python class TrainerConfig(object): # whether to use GPU for training @@ -98,8 +98,7 @@ class ModelConfig(object): ... ``` -用户可以对具体参数进行设置实现训练, 例如通过设置 `use_gpu` 参数来指定是否使用 GPU - 进行训练。 +修改 `config.py` 对参数进行调整。例如,通过修改 `use_gpu` 参数来指定是否使用 GPU 进行训练。 ## 使用 PaddlePaddle 内置数据运行 @@ -200,7 +199,11 @@ Options: 修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行: ```bash -python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' +python train.py \ + --train_data_dir 'data/train_data' \ + --test_data_dir 'data/test_data' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' ``` 即可对样例数据进行训练。 @@ -226,7 +229,11 @@ Options: 2.以`data`目录下的示例数据为例,在终端执行: ```bash -python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz' +python infer.py \ + --data_path 'data/infer.txt' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' \ + --model_path 'models/params_pass_00000.tar.gz' ``` 即可对样例数据进行预测。 diff --git a/nested_sequence/text_classification/config.py b/nested_sequence/text_classification/config.py index 4461626f..1a6e4681 100644 --- a/nested_sequence/text_classification/config.py +++ b/nested_sequence/text_classification/config.py @@ -3,37 +3,44 @@ __all__ = ["TrainerConfig", "ModelConfig"] class TrainerConfig(object): - # whether to use GPU for training + # Whether to use GPU in training or not. use_gpu = False - # the number of threads used in one machine + # The number of computing threads. trainer_count = 1 - # train batch size + # The training batch size. batch_size = 32 - # number of pass during training + # The epoch number. num_passes = 10 - # learning rate for optimizer + # The global learning rate. learning_rate = 1e-3 - # learning rate for L2Regularization + # The decay rate for L2Regularization l2_learning_rate = 1e-3 - # average_window for ModelAverage + # This parameter is used for the averaged SGD. + # About the average_window * (number of the processed batch) parameters + # are used for average. + # To be accurate, between average_window *(number of the processed batch) + # and 2 * average_window * (number of the processed batch) parameters + # are used for average. average_window = 0.5 - # buffer size for shuffling + # The buffer size of the data reader. + # The number of buffer size samples will be shuffled in training. 
buf_size = 1000 - # log progress every log_period batches + # The parameter is used to control logging period. + # Training log will be printed every log_period. log_period = 100 class ModelConfig(object): - # embedding vector dimension + # The dimension of embedding vector. emb_size = 28 - # size of sentence vector representation and fc layer in cnn + # The hidden size of sentence vectors. hidden_size = 128 diff --git a/nested_sequence/text_classification/index.html b/nested_sequence/text_classification/index.html index 2b2c9dee..005de924 100644 --- a/nested_sequence/text_classification/index.html +++ b/nested_sequence/text_classification/index.html @@ -118,8 +118,8 @@ pip install -r requirements.txt ## 指定训练配置参数 -`config.py`脚本中包含训练配置和模型配置的参数设置, 示例代码如下: -``` +通过 `config.py` 脚本修改训练和模型配置参数,脚本中有对可配置参数的详细解释,示例如下: +```python class TrainerConfig(object): # whether to use GPU for training @@ -140,8 +140,7 @@ class ModelConfig(object): ... ``` -用户可以对具体参数进行设置实现训练, 例如通过设置 `use_gpu` 参数来指定是否使用 GPU - 进行训练。 +修改 `config.py` 对参数进行调整。例如,通过修改 `use_gpu` 参数来指定是否使用 GPU 进行训练。 ## 使用 PaddlePaddle 内置数据运行 @@ -242,7 +241,11 @@ Options: 修改`train.py`脚本中的启动参数,可以直接运行本例。 以`data`目录下的示例数据为例,在终端执行: ```bash -python train.py --train_data_dir 'data/train_data' --test_data_dir 'data/test_data' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' +python train.py \ + --train_data_dir 'data/train_data' \ + --test_data_dir 'data/test_data' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' ``` 即可对样例数据进行训练。 @@ -268,7 +271,11 @@ Options: 2.以`data`目录下的示例数据为例,在终端执行: ```bash -python infer.py --data_path 'data/infer.txt' --word_dict_path 'word_dict.txt' --label_dict_path 'label_dict.txt' --model_path 'models/params_pass_00000.tar.gz' +python infer.py \ + --data_path 'data/infer.txt' \ + --word_dict_path 'word_dict.txt' \ + --label_dict_path 'label_dict.txt' \ + --model_path 'models/params_pass_00000.tar.gz' ``` 即可对样例数据进行预测。 diff --git a/nested_sequence/text_classification/infer.py b/nested_sequence/text_classification/infer.py index 00204c96..461eba49 100644 --- a/nested_sequence/text_classification/infer.py +++ b/nested_sequence/text_classification/infer.py @@ -58,6 +58,7 @@ def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path): word_reverse_dict = dict((value, key) for key, value in word_dict.iteritems()) + # The reversed label dict of the imdb dataset label_reverse_dict = {0: "positive", 1: "negative"} test_reader = reader.imdb_test(word_dict) class_num = 2 @@ -75,11 +76,12 @@ def infer(data_path, model_path, word_dict_path, batch_size, label_dict_path): test_reader = reader.infer_reader(data_path, word_dict)() dict_dim = len(word_dict) - prob_layer = nested_net(dict_dim, class_num, is_infer=True) # initialize PaddlePaddle. paddle.init(use_gpu=False, trainer_count=1) + prob_layer = nested_net(dict_dim, class_num, is_infer=True) + # load the trained models. parameters = paddle.parameters.Parameters.from_tar( gzip.open(model_path, "r")) diff --git a/nested_sequence/text_classification/network_conf.py b/nested_sequence/text_classification/network_conf.py index bee2c083..b4c40669 100644 --- a/nested_sequence/text_classification/network_conf.py +++ b/nested_sequence/text_classification/network_conf.py @@ -4,10 +4,10 @@ from config import ModelConfig as conf def cnn_cov_group(group_input, hidden_size): """ - Covolution group definition + Convolution group definition. :param group_input: The input of this layer. 
:type group_input: LayerOutput
-    :params hidden_size: Size of FC layer.
+    :params hidden_size: The size of the fully connected layer.
     :type hidden_size: int
     """
     conv3 = paddle.networks.sequence_conv_pool(
@@ -32,7 +32,7 @@ def nested_net(dict_dim, class_num, is_infer=False):
     :type dict_dim: int
     :params class_num: Number of instance class.
     :type class_num: int
-    :params is_infer: The boolean parameter
+    :params is_infer: The boolean parameter indicating inferring or training.
     :type is_infer: bool
     """
diff --git a/nested_sequence/text_classification/train.py b/nested_sequence/text_classification/train.py
index 863a0b47..a0da1ad0 100644
--- a/nested_sequence/text_classification/train.py
+++ b/nested_sequence/text_classification/train.py
@@ -37,7 +37,7 @@ from config import TrainerConfig as conf
         "--label_dict_path",
         type=str,
         default=None,
-        help=("The path of label dictionary (default: None)."
+        help=("The path of label dictionary (default: None). "
              "If this parameter is not set, imdb dataset will be used. "
              "If this parameter is set, but the file does not exist, "
              "label dictionary will be built from "
@@ -50,16 +50,16 @@ from config import TrainerConfig as conf
 def train(train_data_dir, test_data_dir, word_dict_path, label_dict_path,
           model_save_dir):
     """
-    :params train_data_path: path of training data, if this parameter
+    :params train_data_path: The path of training data, if this parameter
         is not specified, imdb dataset will be used to run this example
     :type train_data_path: str
-    :params test_data_path: path of testing data, if this parameter
+    :params test_data_path: The path of testing data, if this parameter
         is not specified, imdb dataset will be used to run this example
     :type test_data_path: str
-    :params word_dict_path: path of word dictionary, if this parameter
+    :params word_dict_path: The path of word dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
     :type word_dict_path: str
-    :params label_dict_path: path of label dictionary, if this parameter
+    :params label_dict_path: The path of label dictionary, if this parameter
        is not specified, imdb dataset will be used to run this example
     :type label_dict_path: str
     :params model_save_dir: dir where models saved
diff --git a/nested_sequence/text_classification/utils.py b/nested_sequence/text_classification/utils.py
index b83fef17..1535e31f 100644
--- a/nested_sequence/text_classification/utils.py
+++ b/nested_sequence/text_classification/utils.py
@@ -7,6 +7,18 @@ logger.setLevel(logging.INFO)
 
 
 def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
+    """
+    Build word dictionary from training data.
+    :param data_dir: The directory of training dataset.
+    :type data_dir: str
+    :params save_path: The path where the word dictionary will be saved.
+    :type save_path: str
+    :params use_col: The column index of the text after splitting each line.
+    :type use_col: int
+    :params cutoff_fre: The word will not be added to the dictionary if its
+        frequency is less than cutoff_fre.
+    :type cutoff_fre: int
+    """
     values = defaultdict(int)
 
     for file_name in os.listdir(data_dir):
@@ -33,6 +45,15 @@ def build_word_dict(data_dir, save_path, use_col=1, cutoff_fre=1):
 
 
 def build_label_dict(data_dir, save_path, use_col=0):
+    """
+    Build label dictionary from training data.
+    :param data_dir: The directory of training dataset.
+    :type data_dir: str
+    :params save_path: The path where the label dictionary will be saved.
+    :type save_path: str
+    :params use_col: The column index of the label after splitting each line.
+    :type use_col: int
+    """
     values = defaultdict(int)
 
     for file_name in os.listdir(data_dir):
@@ -53,10 +74,22 @@ def build_label_dict(data_dir, save_path, use_col=0):
 
 
 def load_dict(dict_path):
+    """
+    Load the word dictionary from the dictionary path.
+    :param dict_path: The path of the word dictionary.
+    :type dict_path: str
+    """
     return dict((line.strip().split("\t")[0], idx)
                 for idx, line in enumerate(open(dict_path, "r").readlines()))
 
 
 def load_reverse_dict(dict_path):
+    """
+    Load the reversed word dictionary from the dictionary path.
+    The index of each word is saved as the key of the dictionary and the
+    corresponding word as the value.
+    :param dict_path: The path of the word dictionary.
+    :type dict_path: str
+    """
     return dict((idx, line.strip().split("\t")[0])
                 for idx, line in enumerate(open(dict_path, "r").readlines()))
--
GitLab
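
As a quick illustration of the `utils.py` helpers whose docstrings this patch refines, the sketch below builds a word dictionary from the example data and loads it back. This is a minimal sketch, not part of the patch: the paths `data/train_data` and `word_dict.txt` follow the example in the README, and the script is assumed to run from `nested_sequence/text_classification` so that `utils` is importable.

```python
# Minimal usage sketch of the dictionary helpers in utils.py.
# Assumes the example layout from the README: training files live under
# data/train_data and each line splits into a label column and a text column.
from utils import build_word_dict, load_dict, load_reverse_dict

# Build the word dictionary from the text column (use_col=1), keeping every
# word that occurs at least once (cutoff_fre=1), and save it to disk.
build_word_dict(
    data_dir="data/train_data",
    save_path="word_dict.txt",
    use_col=1,
    cutoff_fre=1)

# load_dict maps word -> index; load_reverse_dict maps index -> word.
word_dict = load_dict("word_dict.txt")
reverse_dict = load_reverse_dict("word_dict.txt")

# The two mappings are inverses of each other, as the new docstrings describe.
assert all(reverse_dict[idx] == word for word, idx in word_dict.items())
```

The same pattern applies to `build_label_dict` with `use_col=0`, since the label is taken from the first column when each line is split.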