clean codes and add comments.

44a72d3d · caoying03 · 431f46fb · 44a72d3d · 44a72d3d · 44a72d3d
11 changed file
--- a/globally_normalized_reader/.gitignore
+++ b/globally_normalized_reader/.gitignore
-data
 *.txt
 *.pyc
--- a/globally_normalized_reader/README.md
+++ b/globally_normalized_reader/README.md
-TBD
+# Globally Normalized Reader
+This model implements the work in the following paper:
+Jonathan Raiman and John Miller. Globally Normalized Reader. Empirical Methods in Natural Language Processing (EMNLP), 2017.
+If you use the dataset/code in your research, please cite the above paper:
+```text
+@inproceedings{raiman2015gnr,
+    author={Raiman, Jonathan and Miller, John},
+    booktitle={Empirical Methods in Natural Language Processing (EMNLP)},
+    title={Globally Normalized Reader},
+    year={2017},
+}
+```
+You can also visit https://github.com/baidu-research/GloballyNormalizedReader to get more information.
+# Installation
+1. Please use [docker image](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) to install the latest PaddlePaddle, by running:
+    ```bash
+    docker pull paddledev/paddle
+    ```
+2. Download all necessary data by running:
+   ```bash
+   cd data && ./download.sh
+   ```
+3. Featurize the data by running:
+   ```
+   python featurize.py --datadir data --outdir featurized
+   ```
+# Training a Model
+- Configurate the model by modifying `config.py` if needed, and then run:
+    ```bash
+    python train.py 2>&1 | tee train.log
+    ```
+# Inferring by a Trained Model
+- Infer by a trained model by running:
+   ```bash
+   python infer.py \
+     --model_path models/pass_00000.tar.gz \
+     --data_dir data/featurized/ \
+     --batch_size 2 \
+     --use_gpu 0 \
+     --trainer_count 1 \
+     2>&1 | tee infer.log
+   ```
--- a/globally_normalized_reader/basic_modules.py
+++ b/globally_normalized_reader/basic_modules.py
 #!/usr/bin/env python
 #coding=utf-8
 import collections
 import paddle.v2 as paddle
@@ -7,11 +8,43 @@ from paddle.v2.layer import parse_network
 __all__ = [
    "stacked_bidirectional_lstm",
+    "stacked_bidirectional_lstm_by_nested_seq",
    "lstm_by_nested_sequence",
 ]
-def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
+def stacked_bidirectional_lstm(inputs,
+                               hidden_dim,
+                               depth,
+                               drop_rate=0.,
+                               prefix=""):
+    """ The stacked bi-directional LSTM.
+    In PaddlePaddle recurrent layers have two different implementations:
+    1. recurrent layer implemented by recurrent_group: any intermedia states a
+       recurent unit computes during one time step, such as hidden states,
+       input-to-hidden mapping, memory cells and so on, is accessable.
+    2. recurrent layer as a whole: only outputs of the recurrent layer are
+       accessable.
+    The second type (recurrent layer as a whole) is more computation efficient,
+    because recurrent_group is made up of many basic layers (including add,
+    element-wise multiplications, matrix multiplication and so on).
+    This function uses the second type to implement the stacked bi-directional
+    LSTM.
+    Arguments:
+        - inputs:      The input layer to the bi-directional LSTM.
+        - hidden_dim:  The dimension of the hidden state of the LSTM.
+        - depth:       Depth of the stacked bi-directional LSTM.
+        - drop_rate:   The drop rate to drop the LSTM output states.
+        - prefix:      A string which will be appended to name of each layer
+                       created in this function. Each layer in a network should
+                       has a unique name. The prefix makes this fucntion can be
+                       called multiple times.
+    """
    if not isinstance(inputs, collections.Sequence):
        inputs = [inputs]
@@ -20,7 +53,7 @@ def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
        for i in range(depth):
            input_proj = paddle.layer.mixed(
                name="%s_in_proj_%0d_%s__" % (prefix, i, dirt),
-                size=size * 4,
+                size=hidden_dim * 4,
                bias_attr=paddle.attr.Param(initial_std=0.),
                input=[paddle.layer.full_matrix_projection(lstm)] if i else [
                    paddle.layer.full_matrix_projection(in_layer)
@@ -45,8 +78,8 @@ def stacked_bidirectional_lstm(inputs, size, depth, drop_rate=0., prefix=""):
 def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
-    '''
+    """This is a LSTM implemended by nested recurrent_group.
-    This is a LSTM implemended by nested recurrent_group.
    Paragraph is a nature nested sequence:
    1. each paragraph is a sequence of sentence.
    2. each sentence is a sequence of words.
@@ -60,7 +93,14 @@ def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
    5. Consequently, this function is just equivalent to concatenate all
       sentences in a paragraph into one (long) sentence, and use one LSTM to
       encode this new long sentence.
-    '''
+    Arguments:
+        - input_layer:    The input layer to the bi-directional LSTM.
+        - hidden_dim:     The dimension of the hidden state of the LSTM.
+        - name:           The name of the bi-directional LSTM.
+        - reverse:        The boolean parameter indicating whether to prcess
+                          the input sequence by the reverse order.
+    """
    def lstm_outer_step(lstm_group_input, hidden_dim, reverse, name=''):
        outer_memory = paddle.layer.memory(
@@ -71,9 +111,8 @@ def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
                name="__inner_state_%s__" % name,
                size=hidden_dim,
                boot_layer=outer_memory)
-            input_proj = paddle.layer.fc(size=hidden_dim * 4,
+            input_proj = paddle.layer.fc(
-                                         bias_attr=False,
+                size=hidden_dim * 4, bias_attr=False, input=input_layer)
-                                         input=input_layer)
            return paddle.networks.lstmemory_unit(
                input=input_proj,
                name="__inner_state_%s__" % name,
@@ -111,7 +150,27 @@ def lstm_by_nested_sequence(input_layer, hidden_dim, name="", reverse=False):
        reverse=reverse)
-def stacked_bi_lstm_by_nested_seq(input_layer, depth, hidden_dim, prefix=""):
+def stacked_bidirectional_lstm_by_nested_seq(input_layer,
+                                             depth,
+                                             hidden_dim,
+                                             prefix=""):
+    """ The stacked bi-directional LSTM to process a nested sequence.
+    The modules defined in this function is exactly equivalent to
+    that defined in stacked_bidirectional_lstm, the only difference is the
+    bi-directional LSTM defined in this function implemented by recurrent_group
+    in PaddlePaddle, and receive a nested sequence as its input.
+    Arguments:
+        - inputs:      The input layer to the bi-directional LSTM.
+        - hidden_dim:  The dimension of the hidden state of the LSTM.
+        - depth:       Depth of the stacked bi-directional LSTM.
+        - prefix:      A string which will be appended to name of each layer
+                       created in this function. Each layer in a network should
+                       has a unique name. The prefix makes this fucntion can be
+                       called multiple times.
+    """
    lstm_final_outs = []
    for dirt in ["fwd", "bwd"]:
        for i in range(depth):
@@ -122,16 +181,3 @@ def stacked_bi_lstm_by_nested_seq(input_layer, depth, hidden_dim, prefix=""):
                reverse=(dirt == "bwd"))
        lstm_final_outs.append(lstm_out)
    return paddle.layer.concat(input=lstm_final_outs)
-if __name__ == "__main__":
-    vocab_size = 1024
-    emb_dim = 128
-    embedding = paddle.layer.embedding(
-        input=paddle.layer.data(
-            name="word",
-            type=paddle.data_type.integer_value_sub_sequence(vocab_size)),
-        size=emb_dim)
-    print(parse_network(
-        stacked_bi_lstm_by_nested_seq(
-            input_layer=embedding, depth=3, hidden_dim=128, prefix="__lstm")))
--- a/globally_normalized_reader/beam_decoding.py
+++ b/globally_normalized_reader/beam_decoding.py
 #!/usr/bin/env python
 #coding=utf-8
-import pdb
 import numpy as np
 __all__ = ["BeamDecoding"]
 class BeamDecoding(object):
+    """
+    Decode outputs of the PaddlePaddle layers into readable answers.
+    """
    def __init__(self, documents, sentence_scores, selected_sentences,
                 start_scores, selected_starts, end_scores, selected_ends):
+        """ The constructor.
+        Arguments:
+            - documents:          The one-hot input of the document words.
+            - sentence_scores:    The score for each sentece in a document.
+            - selected_sentences: The top k seleceted sentence. This is the
+                                  output of the paddle.layer.kmax_seq_score
+                                  layer in the model.
+            - start_scores:       The score for each words in the selected
+                                  sentence indicating whether the it is start
+                                  of the answer.
+            - selected_starts:    The top k selected start spans. This is the
+                                  output of the paddle.layer.kmax_seq_score
+                                  layer in the model.
+            - end_scores:         The score for each words in the sub-sequence
+                                  which is from the selecetd starts till end of
+                                  the selected sentence.
+            - selected_ends:      The top k selected end spans. This is the
+                                  output of the paddle.layer.kmax_seq_score
+                                  layer in the model.
+        """
        self.documents = documents
        self.sentence_scores = sentence_scores
@@ -19,13 +46,14 @@ class BeamDecoding(object):
        self.end_scores = end_scores
        self.selected_ends = selected_ends
+        """
-        # sequence start position information for the three step search
+        sequence start position information for the three step search
-        # beam1 is to search the sequence index
+        beam1 is to search the sequence index
+        """
        self.beam1_seq_start_positions = []
-        # beam2 is to search the start answer span
+        """beam2 is to search the start answer span"""
        self.beam2_seq_start_positions = []
-        # beam3 is to search the end answer span
+        """beam3 is to search the end answer span """
        self.beam3_seq_start_positions = []
        self.ans_per_sample_in_a_batch = [0]
@@ -34,6 +62,11 @@ class BeamDecoding(object):
        self.final_ans = [[] for i in range(len(documents))]
    def _build_beam1_seq_info(self):
+        """
+        The internal function to calculate the offset of each test sequence
+        in a batch for the first beam in searching the answer sentence.
+        """
        self.beam1_seq_start_positions.append([0])
        for idx, one_doc in enumerate(self.documents):
            for sentence in one_doc:
@@ -45,15 +78,20 @@ class BeamDecoding(object):
                    [self.beam1_seq_start_positions[-1][-1]])
    def _build_beam2_seq_info(self):
+        """
+        The internal function to calculate the offset of each test sequence
+        in a batch for the second beam in searching the start spans.
+        """
        seq_num, beam_size = self.selected_sentences.shape
        self.beam2_seq_start_positions.append([0])
        for i in range(seq_num):
            for j in range(beam_size):
                selected_id = int(self.selected_sentences[i][j])
                if selected_id == -1: break
-                seq_len = self.beam1_seq_start_positions[i][
+                seq_len = self.beam1_seq_start_positions[
-                    selected_id + 1] - self.beam1_seq_start_positions[i][
+                    i][selected_id +
-                        selected_id]
+                       1] - self.beam1_seq_start_positions[i][selected_id]
                self.beam2_seq_start_positions[-1].append(
                    self.beam2_seq_start_positions[-1][-1] + seq_len)
@@ -62,6 +100,11 @@ class BeamDecoding(object):
                    [self.beam2_seq_start_positions[-1][-1]])
    def _build_beam3_seq_info(self):
+        """
+        The internal function to calculate the offset of each test sequence
+        in a batch for the third beam in searching the end spans.
+        """
        seq_num_in_a_batch = len(self.documents)
        seq_id = 0
@@ -71,9 +114,9 @@ class BeamDecoding(object):
        self.beam3_seq_start_positions.append([0])
        sub_seq_num, beam_size = self.selected_starts.shape
        for i in range(sub_seq_num):
-            seq_len = self.beam2_seq_start_positions[seq_id][
+            seq_len = self.beam2_seq_start_positions[
-                sub_seq_id + 1] - self.beam2_seq_start_positions[seq_id][
+                seq_id][sub_seq_id +
-                    sub_seq_id]
+                        1] - self.beam2_seq_start_positions[seq_id][sub_seq_id]
            for j in range(beam_size):
                start_id = int(self.selected_starts[i][j])
                if start_id == -1: break
@@ -88,17 +131,27 @@ class BeamDecoding(object):
                        [self.beam3_seq_start_positions[-1][-1]])
                    sub_seq_id = 0
                    seq_id += 1
-                    sub_seq_count = len(self.beam2_seq_start_positions[
+                    sub_seq_count = len(
-                        seq_id]) - 1
+                        self.beam2_seq_start_positions[seq_id]) - 1
        assert (
            self.beam3_seq_start_positions[-1][-1] == self.end_scores.shape[0])
    def _build_seq_info_for_each_beam(self):
+        """
+        The internal function to calculate the offset of each test sequence
+        in a batch for beams expanded at all the three search steps.
+        """
        self._build_beam1_seq_info()
        self._build_beam2_seq_info()
        self._build_beam3_seq_info()
    def _cal_ans_per_sample_in_a_batch(self):
+        """
+        The internal function to calculate there are how many candidate answers
+        for each of the test sequemce in a batch.
+        """
        start_row = 0
        for seq in self.beam3_seq_start_positions:
            end_row = start_row + len(seq) - 1
@@ -109,6 +162,13 @@ class BeamDecoding(object):
            start_row = end_row
    def _get_valid_seleceted_ids(slef, mat):
+        """
+        The internal function to post-process the output matrix of
+        paddle.layer.kmax_seq_score layer. This function takes off the special
+        dilimeter -1 away and flattens the original two-dimensional output
+        matrix into a python list.
+        """
        flattened = []
        height, width = mat.shape
        for i in range(height):
@@ -118,6 +178,11 @@ class BeamDecoding(object):
        return flattened
    def decoding(self):
+        """
+        The internal function to decode forward results of the GNR network into
+        readable answers.
+        """
        self._build_seq_info_for_each_beam()
        self._cal_ans_per_sample_in_a_batch()
@@ -134,11 +199,16 @@ class BeamDecoding(object):
                if end_pos == -1: break
                self.all_searched_ans.append({
-                    "score": self.end_scores[seq_offset_in_batch + end_pos],
+                    "score":
-                    "sentence_pos": -1,
+                    self.end_scores[seq_offset_in_batch + end_pos],
-                    "start_span_pos": -1,
+                    "sentence_pos":
-                    "end_span_pos": end_pos,
+                    -1,
-                    "parent_ids_in_prev_beam": i
+                    "start_span_pos":
+                    -1,
+                    "end_span_pos":
+                    end_pos,
+                    "parent_ids_in_prev_beam":
+                    i
                })
            sub_seq_id += 1
@@ -196,7 +266,8 @@ class BeamDecoding(object):
                    key=lambda x: x["score"],
                    reverse=True):
                self.final_ans[i].append({
-                    "score": ans["score"],
+                    "score":
+                    ans["score"],
                    "label": [
                        ans["sentence_pos"], ans["start_span_pos"],
                        ans["end_span_pos"]

--- a/globally_normalized_reader/config.py
+++ b/globally_normalized_reader/config.py
 #!/usr/bin/env python
 #coding=utf-8
-__all__ = ["ModelConfig"]
+__all__ = ["ModelConfig", "TrainerConfig"]
 class ModelConfig(object):
@@ -24,14 +24,15 @@ class ModelConfig(object):
 class TrainerConfig(object):
    learning_rate = 1e-3
+    l2_decay_rate = 5e-4
+    gradient_clipping_threshold = 20
    data_dir = "data/featurized"
    save_dir = "models"
-    use_gpu = True
+    use_gpu = False
-    trainer_count = 4
+    trainer_count = 1
-    train_batch_size = trainer_count * 10
+    train_batch_size = trainer_count * 8
-    test_batch_size = 4
    epochs = 20

--- a/globally_normalized_reader/data/download.sh
+++ b/globally_normalized_reader/data/download.sh
+#!/bin/bash
+wget --no-check-certificate https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json
+wget --no-check-certificate https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json
--- a/globally_normalized_reader/index.html
+++ b/globally_normalized_reader/index.html
+<html>
+<head>
+  <script type="text/x-mathjax-config">
+  MathJax.Hub.Config({
+    extensions: ["tex2jax.js", "TeX/AMSsymbols.js", "TeX/AMSmath.js"],
+    jax: ["input/TeX", "output/HTML-CSS"],
+    tex2jax: {
+      inlineMath: [ ['$','$'] ],
+      displayMath: [ ['$$','$$'] ],
+      processEscapes: true
+    },
+    "HTML-CSS": { availableFonts: ["TeX"] }
+  });
+  </script>
+  <script src="https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js" async></script>
+  <script type="text/javascript" src="../.tools/theme/marked.js">
+  </script>
+  <link href="http://cdn.bootcss.com/highlight.js/9.9.0/styles/darcula.min.css" rel="stylesheet">
+  <script src="http://cdn.bootcss.com/highlight.js/9.9.0/highlight.min.js"></script>
+  <link href="http://cdn.bootcss.com/bootstrap/4.0.0-alpha.6/css/bootstrap.min.css" rel="stylesheet">
+  <link href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" rel="stylesheet">
+  <link href="../.tools/theme/github-markdown.css" rel='stylesheet'>
+</head>
+<style type="text/css" >
+.markdown-body {
+    box-sizing: border-box;
+    min-width: 200px;
+    max-width: 980px;
+    margin: 0 auto;
+    padding: 45px;
+}
+</style>
+<body>
+<div id="context" class="container-fluid markdown-body">
+</div>
+<!-- This block will be replaced by each markdown file content. Please do not change lines below.-->
+<div id="markdown" style='display:none'>
+# Globally Normalized Reader
+This model implements the work in the following paper:
+Jonathan Raiman and John Miller. Globally Normalized Reader. Empirical Methods in Natural Language Processing (EMNLP), 2017.
+If you use the dataset/code in your research, please cite the above paper:
+```text
+@inproceedings{raiman2015gnr,
+    author={Raiman, Jonathan and Miller, John},
+    booktitle={Empirical Methods in Natural Language Processing (EMNLP)},
+    title={Globally Normalized Reader},
+    year={2017},
+}
+```
+You can also visit https://github.com/baidu-research/GloballyNormalizedReader to get more information.
+# Installation
+1. Please use [docker image](http://doc.paddlepaddle.org/develop/doc/getstarted/build_and_install/docker_install_en.html) to install the latest PaddlePaddle, by running:
+    ```bash
+    docker pull paddledev/paddle
+    ```
+2. Download all necessary data by running:
+   ```bash
+   cd data && ./download.sh
+   ```
+3. Featurize the data by running:
+   ```
+   python featurize.py --datadir data --outdir featurized
+   ```
+# Training a Model
+- Configurate the model by modifying `config.py` if needed, and then run:
+    ```bash
+    python train.py 2>&1 | tee train.log
+    ```
+# Inferring by a Trained Model
+- Infer by a trained model by running:
+   ```bash
+   python infer.py \
+     --model_path models/pass_00000.tar.gz \
+     --data_dir data/featurized/ \
+     --batch_size 2 \
+     --use_gpu 0 \
+     --trainer_count 1 \
+     2>&1 | tee infer.log
+   ```
+</div>
+<!-- You can change the lines below now. -->
+<script type="text/javascript">
+marked.setOptions({
+  renderer: new marked.Renderer(),
+  gfm: true,
+  breaks: false,
+  smartypants: true,
+  highlight: function(code, lang) {
+    code = code.replace(/&amp;/g, "&")
+    code = code.replace(/&gt;/g, ">")
+    code = code.replace(/&lt;/g, "<")
+    code = code.replace(/&nbsp;/g, " ")
+    return hljs.highlightAuto(code, [lang]).value;
+  }
+});
+document.getElementById("context").innerHTML = marked(
+        document.getElementById("markdown").innerHTML)
+</script>
+</body>
--- a/globally_normalized_reader/infer.py
+++ b/globally_normalized_reader/infer.py
 #!/usr/bin/env python
 #coding=utf-8
 import os
 import sys
+import argparse
 import gzip
 import logging
 import numpy as np
@@ -12,14 +14,62 @@ import reader
 from model import GNR
 from train import choose_samples
-from config import ModelConfig, TrainerConfig
+from config import ModelConfig
 from beam_decoding import BeamDecoding
 logger = logging.getLogger("paddle")
 logger.setLevel(logging.INFO)
+def parse_cmd():
+    """
+    Build the command line arguments parser for inferring task.
+    """
+    parser = argparse.ArgumentParser(
+        description="Globally Normalized Reader in PaddlePaddle.")
+    parser.add_argument(
+        "--model_path",
+        required=True,
+        type=str,
+        help="Path of the trained model to evaluate.",
+        default="")
+    parser.add_argument(
+        "--data_dir",
+        type=str,
+        required=True,
+        help="Path of the training and testing data.",
+        default="")
+    parser.add_argument(
+        "--batch_size",
+        type=int,
+        required=False,
+        help="The batch size for inferring.",
+        default=1)
+    parser.add_argument(
+        "--use_gpu",
+        type=int,
+        required=False,
+        help="Whether to run the inferring on GPU.",
+        default=0)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        required=False,
+        help=("The thread number used in inferring. When set "
+              "use_gpu=True, the trainer_count cannot excess "
+              "the gpu device number in your computer."),
+        default=1)
+    return parser.parse_args()
 def load_reverse_dict(dict_file):
+    """ Build the dict which is used to map the word index to word string.
+    The keys are word index and the values are word strings.
+    Arguments:
+        - dict_file:    The path of a word dictionary.
+    """
    word_dict = {}
    with open(dict_file, "r") as fin:
        for idx, line in enumerate(fin):
@@ -28,6 +78,34 @@ def load_reverse_dict(dict_file):
 def print_result(test_batch, predicted_ans, ids_2_word, print_top_k=1):
+    """ Print the readable predicted answers.
+    Format of the output:
+        query:\tthe input query.
+        documents:\n
+        0\tthe first sentence in the document.
+        1\tthe second sentence in the document.
+        ...
+        gold:\t[i j k] the answer words.
+            (i: the sentence index;
+             j: the start span index;
+             k: the end span index)
+        top answers:
+        score0\t[i j k] the answer with the highest score.
+        score1\t[i j k] the answer with the second highest score.
+            (i, j, k has a same meaning as in gold.)
+        ...
+        By default, top 10 answers will be printed.
+    Arguments:
+        - test_batch:     A test batch returned by reader.
+        - predicted_ans:  The beam decoding results.
+        - ids_2_word:     The dict whose key is word index and the values are
+                          word strings.
+        - print_top_k:    Indicating how many answers will be printed.
+    """
    for i, sample in enumerate(test_batch):
        query_words = [ids_2_word[ids] for ids in sample[0]]
        print("query:\t%s" % (" ".join(query_words)))
@@ -42,7 +120,7 @@ def print_result(test_batch, predicted_ans, ids_2_word, print_top_k=1):
            sample[3], sample[4], sample[5], " ".join(
                [ids_2_word[ids] for ids in sample[1][sample[3]][start:end]])))
-        print("predicted:")
+        print("top answers:")
        for k in range(print_top_k):
            label = predicted_ans[i][k]["label"]
            start = label[1]
@@ -57,14 +135,42 @@ def print_result(test_batch, predicted_ans, ids_2_word, print_top_k=1):
 def infer_a_batch(inferer, test_batch, ids_2_word, out_layer_count):
+    """ Call the PaddlePaddle's infer interface to infer by batch.
+    Arguments:
+        - inferer:         The PaddlePaddle Inference object.
+        - test_batch:      A test batch returned by reader.
+        - ids_2_word:      The dict whose key is word index and the values are
+                           word strings.
+        - out_layer_count: The number of output layers in the inferring process.
+    """
    outs = inferer.infer(input=test_batch, flatten_result=False, field="value")
    decoder = BeamDecoding([sample[1] for sample in test_batch], *outs)
    print_result(test_batch, decoder.decoding(), ids_2_word, print_top_k=10)
-def infer(model_path, data_dir, test_batch_size, config):
+def infer(model_path,
+          data_dir,
+          batch_size,
+          config,
+          use_gpu=False,
+          trainer_count=1):
+    """ The inferring process.
+    Arguments:
+        - model_path:      The path of trained model.
+        - data_dir:        The directory path of test data.
+        - batch_size:      The batch_size.
+        - config:          The model configuration.
+        - use_gpu:         Whether to run the inferring on GPU.
+        - trainer_count:   The thread number used in inferring. When set
+                           use_gpu=True, the trainer_count cannot excess
+                           the gpu device number in your computer.
+    """
    assert os.path.exists(model_path), "The model does not exist."
-    paddle.init(use_gpu=True, trainer_count=1)
+    paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)
    ids_2_word = load_reverse_dict(config.dict_path)
@@ -84,7 +190,7 @@ def infer(model_path, data_dir, test_batch_size, config):
    test_batch = []
    for i, item in enumerate(test_reader()):
        test_batch.append(item)
-        if len(test_batch) == test_batch_size:
+        if len(test_batch) == batch_size:
            infer_a_batch(inferer, test_batch, ids_2_word, len(outputs))
            test_batch = []
@@ -93,7 +199,16 @@ def infer(model_path, data_dir, test_batch_size, config):
        test_batch = []
+def main(args):
+    infer(
+        model_path=args.model_path,
+        data_dir=args.data_dir,
+        batch_size=args.batch_size,
+        config=ModelConfig,
+        use_gpu=args.use_gpu,
+        trainer_count=args.trainer_count)
 if __name__ == "__main__":
-    # infer("models/round1/pass_00000.tar.gz", TrainerConfig.data_dir,
+    args = parse_cmd()
-    infer("models/round2_on_cpu/pass_00000.tar.gz", TrainerConfig.data_dir,
+    main(args)
-          TrainerConfig.test_batch_size, ModelConfig)
--- a/globally_normalized_reader/model.py
+++ b/globally_normalized_reader/model.py
 #!/usr/bin/env python
 #coding=utf-8
 import paddle.v2 as paddle
 from paddle.v2.layer import parse_network
 import basic_modules
@@ -9,52 +10,115 @@ __all__ = ["GNR"]
 def build_pretrained_embedding(name, data_type, emb_dim, emb_drop=0.):
+    """create word a embedding layer which loads pre-trained embeddings.
+    Arguments:
+        - name:       The name of the data layer which accepts one-hot input.
+        - data_type:  PaddlePaddle's data type for data layer.
+        - emb_dim:    The path to the data files.
+    """
    return paddle.layer.embedding(
-        input=paddle.layer.data(
+        input=paddle.layer.data(name=name, type=data_type),
-            name=name, type=data_type),
        size=emb_dim,
-        param_attr=paddle.attr.Param(
+        param_attr=paddle.attr.Param(name="GloveVectors", is_static=True),
-            name="GloveVectors", is_static=True),
        layer_attr=paddle.attr.ExtraLayerAttribute(drop_rate=emb_drop), )
-def encode_question(input_embedding, config, prefix):
+def encode_question(input_embedding,
+                    lstm_hidden_dim,
+                    depth,
+                    passage_indep_embedding_dim,
+                    prefix=""):
+    """build question encoding by using bidirectional LSTM.
+    Each question word is encoded by runing a stack of bidirectional LSTM over
+    word embedding in question, producing hidden states. The hidden states are
+    used to compute a passage-independent question embedding.
+    The final question encoding is constructed by concatenating the final
+    hidden states of the forward and backward LSTMs and the passage-independent
+    embedding.
+    Arguments:
+        - input_embedding:    The question word embeddings.
+        - lstm_hidden_dim:  The dimension of bi-directional LSTM.
+        - depth:  The depth of stacked bi-directional LSTM.
+        - passage_indep_embedding_dim:  The dimension of passage-independent
+                                        embedding.
+        - prefix:    A string which will be appended to name of each layer
+                     created in this function. Each layer in a network should
+                     has a unique name. The prefix makes this fucntion can be
+                     called multiple times.
+    """
+    # stacked bi-directional LSTM to process question embeddings.
    lstm_final, lstm_outs = basic_modules.stacked_bidirectional_lstm(
-        inputs=input_embedding,
+        input_embedding, lstm_hidden_dim, depth, 0., prefix)
-        size=config.lstm_hidden_dim,
-        depth=config.lstm_depth,
+    # compute passage-independent embeddings.
-        drop_rate=config.lstm_hidden_droprate,
+    candidates = paddle.layer.fc(
-        prefix=prefix)
+        input=lstm_outs,
-    # passage-independent embeddings
-    candidates = paddle.layer.fc(input=lstm_outs,
        bias_attr=False,
-                                 size=config.passage_indep_embedding_dim,
+        size=passage_indep_embedding_dim,
        act=paddle.activation.Linear())
-    weights = paddle.layer.fc(input=lstm_outs,
+    weights = paddle.layer.fc(
+        input=lstm_outs,
        size=1,
        bias_attr=False,
        act=paddle.activation.SequenceSoftmax())
    weighted_candidates = paddle.layer.scaling(input=candidates, weight=weights)
    passage_indep_embedding = paddle.layer.pooling(
        input=weighted_candidates, pooling_type=paddle.pooling.Sum())
    return paddle.layer.concat(
        input=[lstm_final, passage_indep_embedding]), lstm_outs
 def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
-                                       config):
+                                       passage_aligned_embedding_dim):
-    def outer_sentence_step(document_embeddings, question_lstm_outs, config):
+    """create question aligned passage embedding.
-        '''
-        in this recurrent_group, document_embeddings has scattered into sequence,
+    Arguments:
-        '''
+        - question_lstm_outs:    The dimension of output of LSTM that process
+                                 question word embedding.
+        - document_embeddings:   The document embeddings.
+        - passage_aligned_embedding_dim:    The dimension of passage aligned
+                                            embedding.
+    """
+    def outer_sentence_step(document_embeddings, question_lstm_outs,
+                            passage_aligned_embedding_dim):
+        """step function for PaddlePaddle's recurrent_group.
+        In this function, the original input document_embeddings are scattered
+        from nested sequence into sequence by recurrent_group in PaddlePaddle.
+        The step function iterates over each sentence in the document.
+        Arguments:
+            - document_embeddings:   The word embeddings of the document.
+            - question_lstm_outs:    The dimension of output of LSTM that
+                                     process question word embedding.
+            - passage_aligned_embedding_dim:    The dimension of passage aligned
+                                                embedding.
+        """
        def inner_word_step(word_embedding, question_lstm_outs,
-                            question_outs_proj, config):
+                            question_outs_proj, passage_aligned_embedding_dim):
-            '''
+            """
-            in this recurrent_group, sentence embedding has scattered into word
+            In this recurrent_group, sentence embedding has been scattered into
-            embeddings.
+            word embeddings. The step function iterates over each word in one
-            '''
+            sentence in the document.
+            Arguments:
+                - word_embedding: The word embeddings of documents.
+                - question_lstm_outs:    The dimension of output of LSTM that
+                                         process question word embedding.
+                - question_outs_proj:    The projection of question_lstm_outs
+                                         into a new hidden space.
+                - passage_aligned_embedding_dim:    The dimension of passage
+                                                    aligned embedding.
+            """
            doc_word_expand = paddle.layer.expand(
                input=word_embedding,
                expand_as=question_lstm_outs,
@@ -62,10 +126,6 @@ def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
            weights = paddle.layer.fc(
                input=[question_lstm_outs, doc_word_expand],
-                param_attr=[
-                    paddle.attr.Param(initial_std=1e-3),
-                    paddle.attr.Param(initial_std=1e-3)
-                ],
                size=1,
                bias_attr=False,
                act=paddle.activation.SequenceSoftmax())
@@ -77,13 +137,13 @@ def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
        question_outs_proj = paddle.layer.fc(
            input=question_lstm_outs,
            bias_attr=False,
-            size=config.passage_aligned_embedding_dim)
+            size=passage_aligned_embedding_dim)
        return paddle.layer.recurrent_group(
            input=[
                paddle.layer.SubsequenceInput(document_embeddings),
                paddle.layer.StaticInput(question_lstm_outs),
                paddle.layer.StaticInput(question_outs_proj),
-                config,
+                passage_aligned_embedding_dim,
            ],
            step=inner_word_step,
            name="iter_over_word")
@@ -91,20 +151,39 @@ def question_aligned_passage_embedding(question_lstm_outs, document_embeddings,
    return paddle.layer.recurrent_group(
        input=[
            paddle.layer.SubsequenceInput(document_embeddings),
-            paddle.layer.StaticInput(question_lstm_outs), config
+            paddle.layer.StaticInput(question_lstm_outs),
+            passage_aligned_embedding_dim
        ],
        step=outer_sentence_step,
        name="iter_over_sen")
 def encode_documents(input_embedding, same_as_question, question_vector,
-                     question_lstm_outs, config, prefix):
+                     question_lstm_outs, passage_indep_embedding_dim, prefix):
+    """Build the final question-aware document embeddings.
+    Each word in the document is represented as concatenation of its word
+    vector, the question vector, boolean features indicating if a word appers
+    in the question or is repeated, and a question aligned embedding.
+    Arguments:
+        - input_embedding:   The word embeddings of the document.
+        - same_as_question:  The boolean features indicating if a word appears
+                             in the question or is repeated.
+        - question_lstm_outs: The final question encoding.
+        - passage_indep_embedding_dim:  The dimension of passage independent
+                                        embedding.
+        - prefix:    The prefix which will be appended to name of each layer in
+                     This function.
+    """
    question_expanded = paddle.layer.expand(
        input=question_vector,
        expand_as=input_embedding,
        expand_level=paddle.layer.ExpandLevel.FROM_NO_SEQUENCE)
    question_aligned_embedding = question_aligned_passage_embedding(
-        question_lstm_outs, input_embedding, config)
+        question_lstm_outs, input_embedding, passage_indep_embedding_dim)
    return paddle.layer.concat(input=[
        input_embedding, question_expanded, same_as_question,
        question_aligned_embedding
@@ -113,15 +192,48 @@ def encode_documents(input_embedding, same_as_question, question_vector,
 def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
                  is_infer):
+    """Search the answer from the document.
+    The search process for this layer begins with searching a target sequence
+    from a nested sequence by using paddle.lauer.kmax_seq_score and
+    paddle.layer.sub_nested_seq_layer. In the first search step, top beam size
+    sequences with highest scores, indices of these top k sequences in the
+    original nested sequence, and the ground truth (also called gold)
+    altogether (a triple) make up of the first beam.
+    Then, start and end positions are searched. In these searches, top k
+    positions with highest scores are selected, and then sequence, starting
+    from the selected starts till ends of the sequences are taken to search
+    next by using paddle.layer.seq_slice.
+    Finally, the layer paddle.layer.cross_entropy_over_beam takes all the beam
+    expansions which contain several candidate targets found along the
+    three-step search. cross_entropy_over_beam calculates cross entropy over
+    the expanded beams which all the candidates in the beam as the normalized
+    factor.
+    Note that, if gold falls off the beam at search step t, then the cost is
+    calculated over the beam at step t.
+    Arguments:
+        - doc_lstm_outs:    The output of LSTM that process each document words.
+        - sentence_idx:    Ground-truth indicating sentence index of the answer
+                           in the document.
+        - start_idx:    Ground-truth indicating start span index of the answer
+                        in the sentence.
+        - end_idx:    Ground-truth indicating end span index of the answer
+                      in the sentence.
+        - is_infer:    The boolean parameter indicating inferring or training.
+    """
    last_state_of_sentence = paddle.layer.last_seq(
        input=doc_lstm_outs, agg_level=paddle.layer.AggregateLevel.TO_SEQUENCE)
    sentence_scores = paddle.layer.fc(
        input=last_state_of_sentence,
        size=1,
        bias_attr=False,
-        param_attr=paddle.attr.Param(initial_std=1e-3),
        act=paddle.activation.Linear())
-    topk_sentence_ids = paddle.layer.kmax_sequence_score(
+    topk_sentence_ids = paddle.layer.kmax_seq_score(
        input=sentence_scores, beam_size=config.beam_size)
    topk_sen = paddle.layer.sub_nested_seq(
        input=doc_lstm_outs, selected_indices=topk_sentence_ids)
@@ -131,29 +243,24 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
        input=topk_sen,
        size=1,
        layer_attr=paddle.attr.ExtraLayerAttribute(
-            error_clipping_threshold=10.0),
+            error_clipping_threshold=5.0),
        bias_attr=False,
-        param_attr=paddle.attr.Param(initial_std=1e-3),
        act=paddle.activation.Linear())
-    topk_start_pos_ids = paddle.layer.kmax_sequence_score(
+    topk_start_pos_ids = paddle.layer.kmax_seq_score(
        input=start_pos_scores, beam_size=config.beam_size)
    topk_start_spans = paddle.layer.seq_slice(
        input=topk_sen, starts=topk_start_pos_ids, ends=None)
    # expand beam to search end positions on selected start spans
    _, end_span_embedding = basic_modules.stacked_bidirectional_lstm(
-        inputs=topk_start_spans,
+        topk_start_spans, config.lstm_hidden_dim, config.lstm_depth,
-        size=config.lstm_hidden_dim,
+        config.lstm_hidden_droprate, "__end_span_embeddings__")
-        depth=config.lstm_depth,
-        drop_rate=config.lstm_hidden_droprate,
-        prefix="__end_span_embeddings__")
    end_pos_scores = paddle.layer.fc(
        input=end_span_embedding,
        size=1,
        bias_attr=False,
-        param_attr=paddle.attr.Param(initial_std=1e-3),
        act=paddle.activation.Linear())
-    topk_end_pos_ids = paddle.layer.kmax_sequence_score(
+    topk_end_pos_ids = paddle.layer.kmax_seq_score(
        input=end_pos_scores, beam_size=config.beam_size)
    if is_infer:
@@ -172,15 +279,23 @@ def search_answer(doc_lstm_outs, sentence_idx, start_idx, end_idx, config,
 def GNR(config, is_infer=False):
-    # encoding question words
+    """Build the globally normalized reader model.
+    Arguments:
+        - config:    The model configuration.
+        - is_infer:    The boolean parameter indicating inferring or training.
+    """
+    # encode question words
    question_embeddings = build_pretrained_embedding(
        "question",
        paddle.data_type.integer_value_sequence(config.vocab_size),
        config.embedding_dim, config.embedding_droprate)
    question_vector, question_lstm_outs = encode_question(
-        input_embedding=question_embeddings, config=config, prefix="__ques")
+        question_embeddings, config.lstm_hidden_dim, config.lstm_depth,
+        config.passage_indep_embedding_dim, "__ques")
-    # encoding document words
+    # encode document words
    document_embeddings = build_pretrained_embedding(
        "documents",
        paddle.data_type.integer_value_sub_sequence(config.vocab_size),
@@ -190,20 +305,14 @@ def GNR(config, is_infer=False):
        type=paddle.data_type.dense_vector_sub_sequence(1))
    document_words_ecoding = encode_documents(
-        input_embedding=document_embeddings,
+        document_embeddings, same_as_question, question_vector,
-        question_vector=question_vector,
+        question_lstm_outs, config.passage_indep_embedding_dim, "__doc")
-        question_lstm_outs=question_lstm_outs,
-        same_as_question=same_as_question,
+    doc_lstm_outs = basic_modules.stacked_bidirectional_lstm_by_nested_seq(
-        config=config,
+        document_words_ecoding, config.lstm_depth, config.lstm_hidden_dim,
-        prefix="__doc")
+        "__doc_lstm")
-    doc_lstm_outs = basic_modules.stacked_bi_lstm_by_nested_seq(
+    # search the answer.
-        input_layer=document_words_ecoding,
-        hidden_dim=config.lstm_hidden_dim,
-        depth=config.lstm_depth,
-        prefix="__doc_lstm")
-    # define labels
    sentence_idx = paddle.layer.data(
        name="sen_idx", type=paddle.data_type.integer_value(1))
    start_idx = paddle.layer.data(

--- a/globally_normalized_reader/reader.py
+++ b/globally_normalized_reader/reader.py
 #!/usr/bin/env python
 #coding=utf-8
 import os
 import random
 import json
@@ -10,8 +11,16 @@ logger.setLevel(logging.INFO)
 def data_reader(data_list, is_train=True):
+    """ Data reader.
+    Arguments:
+        - data_list:  A python list which contains path of training samples.
+        - is_train:   A boolean parameter indicating this function is called
+                      in training or in inferring.
+    """
    def reader():
-        # every pass shuffle the data list again
+        """shuffle the data list again at the begining of every pass"""
        if is_train:
            random.shuffle(data_list)
@@ -33,12 +42,3 @@ def data_reader(data_list, is_train=True):
                   data['ans_end'] - data['ans_start'])
    return reader
-if __name__ == "__main__":
-    from train import choose_samples
-    train_list, dev_list = choose_samples("data/featurized")
-    for i, item in enumerate(data_reader(train_list)()):
-        print(item)
-        if i > 5: break
--- a/globally_normalized_reader/train.py
+++ b/globally_normalized_reader/train.py
 #!/usr/bin/env python
 #coding=utf-8
 from __future__ import print_function
 import os
@@ -21,44 +22,59 @@ logger.setLevel(logging.INFO)
 def load_initial_model(model_path, parameters):
-    """
+    """ Initalize parameters in the network from a trained model.
-    initalize parameters in the network from a trained model.
+    This is useful in resuming the training from previously saved models.
+    Arguments:
+        - model_path:    The path of a trained model.
+        - parameters:    The parameters in a network which will be initialized
+                         from the specified model.
    """
    with gzip.open(model_path, "rb") as f:
        parameters.init_from_tar(f)
-def load_pretrained_parameters(path, height, width):
+def load_pretrained_parameters(path):
+    """ Load one pre-trained parameter.
+    Arguments:
+        - path:    The path of the pre-trained parameter.
+    """
    return np.load(path)
 def save_model(save_path, parameters):
+    """ Save the trained parameters.
+    Arguments:
+        - save_path:    The path to save the trained parameters.
+        - parameters:   The trained model parameters.
+    """
    with gzip.open(save_path, "w") as f:
        parameters.to_tar(f)
-def load_initial_model(model_path, parameters):
-    with gzip.open(model_path, "rb") as f:
-        parameters.init_from_tar(f)
 def show_parameter_init_info(parameters):
+    """ Print the information of initialization mean and std of parameters.
+    Arguments:
+        - parameters:   The parameters created in a model.
+    """
    for p in parameters:
        logger.info("%s : initial_mean %.4f initial_std %.4f" %
                    (p, parameters.__param_conf__[p].initial_mean,
                     parameters.__param_conf__[p].initial_std))
-def dump_value_matrix(param_name, dims, value):
+def show_parameter_status(parameters):
-    np.savetxt(
+    """ Print some statistical information of parameters in a network.
-        param_name + ".txt",
-        value.reshape(dims[0], dims[1]),
-        fmt="%.4f",
-        delimiter=",")
+    This is used for debugging the model.
-def show_parameter_status(parameters):
+    Arguments:
-    # for debug print
+        - parameters:   The parameters created in a model.
+    """
    for p in parameters:
        value = parameters.get(p)
@@ -75,8 +91,10 @@ def show_parameter_status(parameters):
 def choose_samples(path):
-    """
+    """Load filenames for train, dev, and augmented samples.
-    Load filenames for train, dev, and augmented samples.
+    Arguments:
+        - path:   The path of training data.
    """
    if not os.path.exists(os.path.join(path, "train")):
        print(
@@ -97,8 +115,11 @@ def choose_samples(path):
 def build_reader(data_dir, batch_size):
-    """
+    """Build the data reader for this model.
-    Build the data reader for this model.
+    Arguments:
+        - data_dir:   The path of training data.
+        - batch_size:   batch size for the training task.
    """
    train_samples, valid_samples = choose_samples(data_dir)
@@ -109,15 +130,18 @@ def build_reader(data_dir, batch_size):
    # testing data is not shuffled
    test_reader = paddle.batch(
-        reader.data_reader(
+        reader.data_reader(valid_samples, is_train=False),
-            valid_samples, is_train=False),
        batch_size=batch_size)
-    return train_reader, test_reader
+    return train_reader, test_reader, len(train_samples)
-def build_event_handler(config, parameters, trainer, test_reader):
+def build_event_handler(config, parameters, trainer):
-    """
+    """Build the event handler for this model.
-    Build the event handler for this model.
+    Arguments:
+        - config:        The training task configuration for this model.
+        - parameters:    The parameters in the network.
+        - trainer:       The trainer object.
    """
    # End batch and end pass event handler
@@ -127,12 +151,8 @@ def build_event_handler(config, parameters, trainer, test_reader):
        if isinstance(event, paddle.event.EndIteration):
            if  event.batch_id and \
                    (not event.batch_id % config.checkpoint_period):
-                # save_path = os.path.join(config.save_dir,
-                #                          "checkpoint_param.latest.tar.gz")
                save_path = os.path.join(config.save_dir,
-                                         "pass_%05d_%03d.tar.gz" %
+                                         "checkpoint_param.latest.tar.gz")
-                                         (event.pass_id, event.batch_id))
                save_model(save_path, parameters)
            if event.batch_id and not event.batch_id % config.log_period:
@@ -148,14 +168,17 @@ def build_event_handler(config, parameters, trainer, test_reader):
                                     "pass_%05d.tar.gz" % event.pass_id)
            save_model(save_path, parameters)
-            # result = trainer.test(reader=test_reader)
-            # logger.info("Test with Pass %d, %s" %
-            #             (event.pass_id, result.metrics))
    return event_handler
 def train(model_config, trainer_config):
+    """Training the GNR model.
+    Arguments:
+        - modle_config:     The model configuration for this model.
+        - trainer_config:   The training task configuration for this model.
+    """
    if not os.path.exists(trainer_config.save_dir):
        os.mkdir(trainer_config.save_dir)
@@ -163,13 +186,24 @@ def train(model_config, trainer_config):
        use_gpu=trainer_config.use_gpu,
        trainer_count=trainer_config.trainer_count)
-    # define the optimizer
+    train_reader, test_reader, train_sample_count = build_reader(
+        trainer_config.data_dir, trainer_config.train_batch_size)
+    """
+    Define the optimizer. The learning rate will decrease according to
+    the following formula:
+    lr = learning_rate * pow(learning_rate_decay_a,
+                             floor(num_samples_processed /
+                                   learning_rate_decay_b))
+    """
    optimizer = paddle.optimizer.Adam(
        learning_rate=trainer_config.learning_rate,
-        gradient_clipping_threshold=50,
+        gradient_clipping_threshold=trainer_config.gradient_clipping_threshold,
-        regularization=paddle.optimizer.L2Regularization(rate=5e-4),
+        regularization=paddle.optimizer.L2Regularization(
-        model_average=paddle.optimizer.ModelAverage(
+            trainer_config.l2_decay_rate),
-            average_window=0.5, max_average_window=1000))
+        learning_rate_decay_a=0.5,
+        learning_rate_decay_b=train_sample_count,
+        learning_rate_schedule="discexp")
    # define network topology
    loss = GNR(model_config)
@@ -180,22 +214,14 @@ def train(model_config, trainer_config):
        load_initial_model(trainer_config.init_model_path, parameters)
    else:
        show_parameter_init_info(parameters)
-        # load the pre-trained embeddings
+        parameters.set(
-        parameters.set("GloveVectors",
+            "GloveVectors",
-                       load_pretrained_parameters(
+            load_pretrained_parameters(ModelConfig.pretrained_emb_path))
-                           ModelConfig.pretrained_emb_path,
-                           height=ModelConfig.vocab_size,
+    trainer = paddle.trainer.SGD(
-                           width=ModelConfig.embedding_dim))
+        cost=loss, parameters=parameters, update_equation=optimizer)
-    trainer = paddle.trainer.SGD(cost=loss,
+    event_handler = build_event_handler(trainer_config, parameters, trainer)
-                                 parameters=parameters,
-                                 update_equation=optimizer)
-    train_reader, test_reader = build_reader(trainer_config.data_dir,
-                                             trainer_config.train_batch_size)
-    event_handler = build_event_handler(trainer_config, parameters, trainer,
-                                        test_reader)
    trainer.train(
        reader=train_reader,
        num_passes=trainer_config.epochs,