Commit a5cbdd54 authored by Peng Li

fix style problems

Parent a8c3de0c
*.DS_Store
build/
build_doc/
*.user
*.swp
.vscode
.idea
.project
.cproject
.pydevproject
.settings/
Makefile
.test_env/
third_party/
*~
bazel-*
third_party/
# clion workspace.
cmake-build-*
data/data
data/embedding
data/evaluation
data/LICENSE
data/Readme.md
tmp
eval.*.txt
models*
*.log
run.sh
# Neural Recurrent Sequence Labeling Model for Open-Domain Factoid Question Answering
This model implements the work in the following paper:
Peng Li, Wei Li, Zhengyan He, Xuguang Wang, Ying Cao, Jie Zhou, and Wei Xu. Dataset and Neural Recurrent Sequence Labeling Model for Open-Domain Factoid Question Answering. [arXiv:1607.06275](https://arxiv.org/abs/1607.06275).
If you use the dataset/code in your research, please cite the above paper:
```text
@article{li:2016:arxiv,
author = {Li, Peng and Li, Wei and He, Zhengyan and Wang, Xuguang and Cao, Ying and Zhou, Jie and Xu, Wei},
title = {Dataset and Neural Recurrent Sequence Labeling Model for Open-Domain Factoid Question Answering},
journal = {arXiv:1607.06275v2},
year = {2016},
url = {https://arxiv.org/abs/1607.06275v2},
}
```
# Installation
1. Install PaddlePaddle v0.10.5 with the following command. Note that v0.10.0 is not supported.
```bash
# either one is OK
# CPU
pip install paddlepaddle
# GPU
pip install paddlepaddle-gpu
```
2. Download the [WebQA](http://idl.baidu.com/WebQA.html) dataset by running
```bash
cd data && ./download.sh && cd ..
```
# Hyperparameters
All the hyperparameters are defined in `config.py`. The default values are aligned with the paper.
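For quick experiments, the commonly tuned values can also be inspected programmatically. The sketch below is illustrative only; `train.py` constructs its own `TrainingConfig`, so permanent changes should be made by editing `config.py`:
```python
import config

conf = config.TrainingConfig()
conf.batch_size = 60   # default: 120
conf.num_passes = 10   # default: 25
print conf.default_l2_rate  # derived values update accordingly
```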
# Training
Training can be launched using the following command:
```bash
PYTHONPATH=data/evaluation:$PYTHONPATH python train.py 2>&1 | tee train.log
```
# Validation and Test
WebQA provides two versions of validation and test sets. Automatic validation and test can be launched by
```bash
PYTHONPATH=data/evaluation:$PYTHONPATH python val_and_test.py models [ann|ir]
```
where
* `models`: the directory where model files are stored. You can use `models` if `config.py` is not changed.
* `ann`: using the validation and test sets with annotated evidence.
* `ir`: using the validation and test sets with retrieved evidence.
Note that validation and test can run simultaneously with training; `val_and_test.py` handles the related synchronization issues.
Intermediate results are stored in the directory `tmp`. You can delete them safely after validation and test.
The results should be comparable with those shown in Table 3 in the paper.
# Inferring using a Trained Model
Infer using a trained model by running:
```bash
PYTHONPATH=data/evaluation:$PYTHONPATH python infer.py \
MODEL_FILE \
INPUT_DATA \
OUTPUT_FILE \
2>&1 | tee infer.log
```
where
* `MODEL_FILE`: a trained model produced by `train.py`.
* `INPUT_DATA`: input data in the same format as the validation/test sets of the WebQA dataset.
* `OUTPUT_FILE`: results in the format specified in the WebQA dataset for the evaluation scripts.
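`infer.py` can also be driven from Python directly. The following is a minimal sketch mirroring its `main()` function (run with `PYTHONPATH=data/evaluation:$PYTHONPATH` as above; the model file name is a hypothetical example of a checkpoint written by `train.py`):
```python
import sys

import config
import infer
import utils

conf = config.InferConfig()
conf.vocab = utils.load_dict(conf.word_dict_path)

infer_obj = infer.Infer(conf)
# hypothetical checkpoint path; any params_pass_*.tar.gz from train.py works
infer_obj.infer("models/params_pass_00024.tar.gz",
                "data/data/validation.ann.json.gz", sys.stdout)
```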
import math
__all__ = ["TrainingConfig", "InferConfig"]
class CommonConfig(object):
def __init__(self):
# network size:
# dimension of the question LSTM
self.q_lstm_dim = 64
# dimension of the attention layer
self.latent_chain_dim = 64
# dimension of the evidence LSTMs
self.e_lstm_dim = 64
# dimension of the qe.comm and ee.comm feature embeddings
self.com_vec_dim = 2
self.drop_rate = 0.05
# CRF:
# valid values are BIO and BIO2
self.label_schema = "BIO2"
# word embedding:
# vocabulary file path
self.word_dict_path = "data/embedding/wordvecs.vcb"
# word embedding file path
self.wordvecs_path = "data/embedding/wordvecs.txt"
self.word_vec_dim = 64
# saving model & logs:
# dir for saving models
self.model_save_dir = "models"
# print training info every log_period batches
self.log_period = 100
# show parameter status every show_parameter_status_period batches
self.show_parameter_status_period = 100
@property
def label_num(self):
if self.label_schema == "BIO":
return 3
elif self.label_schema == "BIO2":
return 4
else:
raise ValueError("wrong value for label_schema")
@property
def default_init_std(self):
return 1 / math.sqrt(self.e_lstm_dim * 4)
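# Note: default_init_std appears to follow the common 1/sqrt(fan_in)
# heuristic, with fan_in = e_lstm_dim * 4 because an LSTM layer packs the
# four gate weight matrices into one parameter.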
@property
def default_l2_rate(self):
return 8e-4 * self.batch_size / 6
@property
def dict_dim(self):
return len(self.vocab)
class TrainingConfig(CommonConfig):
def __init__(self):
super(TrainingConfig, self).__init__()
# data:
# training data path
self.train_data_path = "data/data/training.json.gz"
# number of batches used in each pass
self.batches_per_pass = 1000
# number of passes to train
self.num_passes = 25
# batch size
self.batch_size = 120
# the ratio of negative samples used in training
self.negative_sample_ratio = 0.2
# the ratio of negative samples that contain golden answer string
self.hit_ans_negative_sample_ratio = 0.25
# keep only first B in golden labels
self.keep_first_b = False
# use GPU to train the model
self.use_gpu = False
# number of threads
self.trainer_count = 1
# random seeds:
# data reader random seed, 0 for random seed
self.seed = 0
# paddle random seed, 0 for random seed
self.paddle_seed = 0
# optimizer:
self.learning_rate = 1e-3
# rmsprop
self.rho = 0.95
self.epsilon = 1e-4
# model average
self.average_window = 0.5
self.max_average_window = 10000
class InferConfig(CommonConfig):
def __init__(self):
super(InferConfig, self).__init__()
self.use_gpu = False
self.trainer_count = 1
self.batch_size = 120
self.wordvecs = None
#!/bin/bash
if [[ -d data ]] && [[ -d embedding ]] && [[ -d evaluation ]]; then
echo "data exist"
exit 0
else
wget -c http://paddlepaddle.bj.bcebos.com/dataset/webqa/WebQA.v1.0.zip
fi
if [[ `md5sum -c md5sum.txt` =~ 'OK' ]] ; then
unzip WebQA.v1.0.zip
mv WebQA.v1.0/* .
rmdir WebQA.v1.0
rm WebQA.v1.0.zip
else
echo "download data error!" >> /dev/stderr
exit 1
fi
b129df2a4eb547d8b398721dd7ed6cc6 WebQA.v1.0.zip
import os
import sys
import argparse
import paddle.v2 as paddle
import reader
import utils
import network
import config
from utils import logger
class Infer(object):
def __init__(self, conf):
self.conf = conf
self.settings = reader.Settings(
vocab=conf.vocab, is_training=False, label_schema=conf.label_schema)
# init paddle
# TODO(lipeng17) v2 API does not support parallel_nn yet. Therefore, we
# can only use CPU currently
paddle.init(use_gpu=conf.use_gpu, trainer_count=conf.trainer_count)
# define network
self.tags_layer = network.inference_net(conf)
def infer(self, model_path, data_path, output):
test_reader = paddle.batch(
paddle.reader.buffered(
reader.create_reader(data_path, self.settings),
size=self.conf.batch_size * 1000),
batch_size=self.conf.batch_size)
# load the trained models
parameters = paddle.parameters.Parameters.from_tar(
utils.open_file(model_path, "r"))
inferer = paddle.inference.Inference(
output_layer=self.tags_layer, parameters=parameters)
def count_evi_ids(test_batch):
num = 0
for sample in test_batch:
num += len(sample[reader.E_IDS])
return num
for test_batch in test_reader():
tags = inferer.infer(
input=test_batch, field=["id"], feeding=network.feeding)
evi_ids_num = count_evi_ids(test_batch)
assert len(tags) == evi_ids_num
print >> output, ";\n".join(str(tag) for tag in tags) + ";"
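# Note: inferer.infer(field=["id"]) returns one decoded label id per evidence
# token, flattened over the whole batch; the assertion above checks that this
# flattened length matches the total number of evidence tokens in the batch.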
def parse_cmd():
parser = argparse.ArgumentParser()
parser.add_argument("model_path")
parser.add_argument("data_path")
parser.add_argument("output", help="'-' for stdout")
return parser.parse_args()
def main(args):
conf = config.InferConfig()
conf.vocab = utils.load_dict(conf.word_dict_path)
logger.info("length of word dictionary is : %d." % len(conf.vocab))
if args.output == "-":
output = sys.stdout
else:
output = utils.open_file(args.output, "w")
infer = Infer(conf)
infer.infer(args.model_path, args.data_path, output)
output.close()
if __name__ == "__main__":
main(parse_cmd())
import math
import paddle.v2 as paddle
import reader
__all__ = ["training_net", "inference_net", "feeding"]
feeding = {
reader.Q_IDS_STR: reader.Q_IDS,
reader.E_IDS_STR: reader.E_IDS,
reader.QE_COMM_STR: reader.QE_COMM,
reader.EE_COMM_STR: reader.EE_COMM,
reader.LABELS_STR: reader.LABELS
}
def get_embedding(input, word_vec_dim, wordvecs):
"""
Define the word embedding layer
:param input: layer input
:type input: LayerOutput
:param word_vec_dim: dimension of the word embeddings
:type word_vec_dim: int
:param wordvecs: word embedding matrix
:type wordvecs: numpy array
:return: embedding
:rtype: LayerOutput
"""
return paddle.layer.embedding(
input=input,
size=word_vec_dim,
param_attr=paddle.attr.ParamAttr(
name="wordvecs", is_static=True, initializer=lambda _: wordvecs))
def encoding_question(question, q_lstm_dim, latent_chain_dim, word_vec_dim,
drop_rate, wordvecs, default_init_std, default_l2_rate):
"""
Define network for encoding question
:param question: question token ids
:type question: LayerOutput
:param q_lstm_dim: dimension of the question LSTM
:type q_lstm_dim: int
:param latent_chain_dim: dimension of the attention layer
:type latent_chain_dim: int
:param word_vec_dim: dimension of the word embeddings
:type word_vec_dim: int
:param drop_rate: dropout rate
:type drop_rate: float
:param wordvecs: word embedding matrix
:type wordvecs: numpy array
:param default_init_std: default initial std
:type default_init_std: float
:param default_l2_rate: default l2 rate
:type default_l2_rate: float
:return: question encoding
:rtype: LayerOutput
"""
# word embedding
emb = get_embedding(question, word_vec_dim, wordvecs)
# question LSTM
wx = paddle.layer.fc(
act=paddle.activation.Linear(),
size=q_lstm_dim * 4,
input=emb,
param_attr=paddle.attr.ParamAttr(
name="_q_hidden1.w0",
initial_std=default_init_std,
l2_rate=default_l2_rate),
bias_attr=paddle.attr.ParamAttr(
name="_q_hidden1.wbias", initial_std=0, l2_rate=default_l2_rate))
q_rnn = paddle.layer.lstmemory(
input=wx,
bias_attr=paddle.attr.ParamAttr(
name="_q_rnn1.wbias", initial_std=0, l2_rate=default_l2_rate),
param_attr=paddle.attr.ParamAttr(
name="_q_rnn1.w0",
initial_std=default_init_std,
l2_rate=default_l2_rate))
q_rnn = paddle.layer.dropout(q_rnn, drop_rate)
# self attention
fc = paddle.layer.fc(
act=paddle.activation.Tanh(),
size=latent_chain_dim,
input=q_rnn,
param_attr=paddle.attr.ParamAttr(
name="_attention_layer1.w0",
initial_std=default_init_std,
l2_rate=default_l2_rate),
bias_attr=False)
weight = paddle.layer.fc(
size=1,
act=paddle.activation.SequenceSoftmax(),
input=fc,
param_attr=paddle.attr.ParamAttr(
name="_attention_weight.w0",
initial_std=default_init_std,
l2_rate=default_l2_rate),
bias_attr=False)
scaled_q_rnn = paddle.layer.scaling(input=q_rnn, weight=weight)
q_encoding = paddle.layer.pooling(
input=scaled_q_rnn, pooling_type=paddle.pooling.Sum())
return q_encoding
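# Note: encoding_question() implements additive self-attention with sum
# pooling: for LSTM states h_1..h_T, per-token weights a_t come from a
# SequenceSoftmax over tanh projections, and the question encoding is
# sum_t(a_t * h_t), a fixed-length vector regardless of question length.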
def encoding_evidence(evidence, qe_comm, ee_comm, q_encoding, e_lstm_dim,
word_vec_dim, com_vec_dim, drop_rate, wordvecs,
default_init_std, default_l2_rate):
"""
Define network for encoding evidence
:param evidence: evidence token ids
:type evidence: LayerOutput
:param qe_comm: qe.comm features
:type qe_comm: LayerOutput
:param ee_comm: ee.comm features
:type ee_comm: LayerOutput
:param q_encoding: question encoding, a fixed-length vector
:type q_encoding: LayerOutput
:param e_lstm_dim: dimension of the evidence LSTMs
:type e_lstm_dim: int
:param word_vec_dim: dimension of the word embeddings
:type word_vec_dim: int
:param com_vec_dim: dimension of the qe.comm and ee.comm feature embeddings
:type com_vec_dim: int
:param drop_rate: dropout rate
:type drop_rate: float
:param wordvecs: word embedding matrix
:type wordvecs: numpy array
:param default_init_std: default initial std
:type default_init_std: float
:param default_l2_rate: default l2 rate
:type default_l2_rate: float
:return: evidence encoding
:rtype: LayerOutput
"""
def lstm(idx, reverse, inputs):
"""LSTM wrapper"""
bias_attr = paddle.attr.ParamAttr(
name="_e_hidden%d.wbias" % idx,
initial_std=0,
l2_rate=default_l2_rate)
with paddle.layer.mixed(size=e_lstm_dim * 4, bias_attr=bias_attr) as wx:
for i, input in enumerate(inputs):
param_attr = paddle.attr.ParamAttr(
name="_e_hidden%d.w%d" % (idx, i),
initial_std=default_init_std,
l2_rate=default_l2_rate)
wx += paddle.layer.full_matrix_projection(
input=input, param_attr=param_attr)
e_rnn = paddle.layer.lstmemory(
input=wx,
reverse=reverse,
bias_attr=paddle.attr.ParamAttr(
name="_e_rnn%d.wbias" % idx,
initial_std=0,
l2_rate=default_l2_rate),
param_attr=paddle.attr.ParamAttr(
name="_e_rnn%d.w0" % idx,
initial_std=default_init_std,
l2_rate=default_l2_rate))
e_rnn = paddle.layer.dropout(e_rnn, drop_rate)
return e_rnn
# share word embeddings with question
emb = get_embedding(evidence, word_vec_dim, wordvecs)
# copy q_encoding len(evidence) times
q_encoding_expand = paddle.layer.expand(
input=q_encoding, expand_as=evidence)
# feature embeddings
comm_initial_std = 1 / math.sqrt(64.0)
qe_comm_emb = paddle.layer.embedding(
input=qe_comm,
size=com_vec_dim,
param_attr=paddle.attr.ParamAttr(
name="_cw_embedding.w0",
initial_std=comm_initial_std,
l2_rate=default_l2_rate))
ee_comm_emb = paddle.layer.embedding(
input=ee_comm,
size=com_vec_dim,
param_attr=paddle.attr.ParamAttr(
name="_eecom_embedding.w0",
initial_std=comm_initial_std,
l2_rate=default_l2_rate))
# evidence LSTMs
first_layer_extra_inputs = [q_encoding_expand, qe_comm_emb, ee_comm_emb]
e_rnn1 = lstm(1, False, [emb] + first_layer_extra_inputs)
e_rnn2 = lstm(2, True, [e_rnn1])
e_rnn3 = lstm(3, False, [e_rnn2, e_rnn1]) # with cross layer links
return e_rnn3
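# Note: the evidence encoder stacks three LSTMs with alternating directions
# (forward, backward, forward); the third layer takes both e_rnn2 and e_rnn1
# as input (a cross-layer link), so the top-layer states combine forward and
# backward context.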
def define_data(dict_dim, label_num):
"""
Define data layers
:param dict_dim: number of words in the vocabulary
:type dict_dim: int
:param label_num: number of labels (BIO: 3, BIO2: 4)
:type label_num: int
:return: data layers
:rtype: tuple of LayerOutput
"""
question = paddle.layer.data(
name=reader.Q_IDS_STR,
type=paddle.data_type.integer_value_sequence(dict_dim))
evidence = paddle.layer.data(
name=reader.E_IDS_STR,
type=paddle.data_type.integer_value_sequence(dict_dim))
qe_comm = paddle.layer.data(
name=reader.QE_COMM_STR,
type=paddle.data_type.integer_value_sequence(2))
ee_comm = paddle.layer.data(
name=reader.EE_COMM_STR,
type=paddle.data_type.integer_value_sequence(2))
label = paddle.layer.data(
name=reader.LABELS_STR,
type=paddle.data_type.integer_value_sequence(label_num),
layer_attr=paddle.attr.ExtraAttr(device=-1))
return question, evidence, qe_comm, ee_comm, label
def define_common_network(conf):
"""
Define common network
:param conf: network conf
:return: CRF features, golden labels
:rtype: tuple
"""
# define data layers
question, evidence, qe_comm, ee_comm, label = \
define_data(conf.dict_dim, conf.label_num)
# encode question
q_encoding = encoding_question(question, conf.q_lstm_dim,
conf.latent_chain_dim, conf.word_vec_dim,
conf.drop_rate, conf.wordvecs,
conf.default_init_std, conf.default_l2_rate)
# encode evidence
e_encoding = encoding_evidence(
evidence, qe_comm, ee_comm, q_encoding, conf.e_lstm_dim,
conf.word_vec_dim, conf.com_vec_dim, conf.drop_rate, conf.wordvecs,
conf.default_init_std, conf.default_l2_rate)
# pre-compute CRF features
crf_feats = paddle.layer.fc(
act=paddle.activation.Linear(),
input=e_encoding,
size=conf.label_num,
param_attr=paddle.attr.ParamAttr(
name="_output.w0",
initial_std=conf.default_init_std,
l2_rate=conf.default_l2_rate),
bias_attr=False)
return crf_feats, label
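# Note: crf_feats are per-token label scores computed once and shared by the
# training and inference networks; the CRF transition parameters are likewise
# shared between paddle.layer.crf and paddle.layer.crf_decoding through the
# common parameter name "_crf.w0".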
def training_net(conf):
"""
Define training network
:param conf: network conf
:return: CRF cost
:rtype: LayerOutput
"""
e_encoding, label = define_common_network(conf)
crf = paddle.layer.crf(
input=e_encoding,
label=label,
size=conf.label_num,
param_attr=paddle.attr.ParamAttr(
name="_crf.w0",
initial_std=conf.default_init_std,
l2_rate=conf.default_l2_rate),
layer_attr=paddle.attr.ExtraAttr(device=-1))
return crf
def inference_net(conf):
"""
Define inference network
:param conf: network conf
:return: CRF Viterbi decoding result
:rtype: LayerOutput
"""
e_encoding, label = define_common_network(conf)
ret = paddle.layer.crf_decoding(
input=e_encoding,
size=conf.label_num,
param_attr=paddle.attr.ParamAttr(name="_crf.w0"),
layer_attr=paddle.attr.ExtraAttr(device=-1))
return ret
import sys
import random
from itertools import izip
import json
import traceback
from datapoint import DataPoint, Evidence, EecommFeatures
import utils
from utils import logger
__all__ = ["Q_IDS", "E_IDS", "LABELS", "QE_COMM", "EE_COMM",
"Q_IDS_STR", "E_IDS_STR", "LABELS_STR", "QE_COMM_STR", "EE_COMM_STR",
"Settings", "create_reader"]
# slot names
Q_IDS_STR = "q_ids"
E_IDS_STR = "e_ids"
LABELS_STR = "labels"
QE_COMM_STR = "qe.comm"
EE_COMM_STR = "ee.comm"
Q_IDS = 0
E_IDS = 1
LABELS = 2
QE_COMM = 3
EE_COMM = 4
NO_ANSWER = "no_answer"
class Settings(object):
"""
class for storing settings
"""
def __init__(self,
vocab,
is_training,
label_schema="BIO2",
negative_sample_ratio=0.2,
hit_ans_negative_sample_ratio=0.25,
keep_first_b=False,
seed=31425926):
"""
Init function
:param vocab: word dict
:type vocab: dict
:param is_training: True for training
:type is_training: bool
:param label_schema: label schema, valid values are BIO and BIO2,
the default value is BIO2
:type label_schema: str
:param negative_sample_ratio: the ratio of negative samples used in
training, the default value is 0.2
:type negative_sample_ratio: float
:param hit_ans_negative_sample_ratio: the ratio of negative samples
that contain golden answer string, the default value is 0.25
:type hit_ans_negative_sample_ratio: float
:param keep_first_b: only keep the first B in golden tag sequence,
the default value is False
:type keep_first_b: bool
:param seed: random seed, the default value is 31425926
:type seed: int
"""
self.negative_sample_ratio = negative_sample_ratio
self.hit_ans_negative_sample_ratio = hit_ans_negative_sample_ratio
self.keep_first_b = keep_first_b
self.is_training = is_training
self.vocab = vocab
# set up label schema
if label_schema == "BIO":
B, I, O1, O2 = 0, 1, 2, 2
elif label_schema == "BIO2":
B, I, O1, O2 = 0, 1, 2, 3
else:
raise ValueError("label_schema should be BIO/BIO2")
self.B, self.I, self.O1, self.O2 = B, I, O1, O2
self.label_map = {"B":B, "I":I, "O1":O1, "O2":O2,
"b":B, "i":I, "o1":O1, "o2":O2}
self.label_num = len(set((B, I, O1, O2)))
# id for OOV
self.oov_id = 0
# set up random seed
random.seed(seed)
# log the settings for bookkeeping
logger.info("negative_sample_ratio: %f", negative_sample_ratio)
logger.info("hit_ans_negative_sample_ratio: %f",
hit_ans_negative_sample_ratio)
logger.info("keep_first_b: %s", keep_first_b)
logger.info("data reader random seed: %d", seed)
class SampleStream(object):
def __init__(self, filename, settings):
self.filename = filename
self.settings = settings
def __iter__(self):
return self.load_and_filter_samples(self.filename)
def load_and_filter_samples(self, filename):
def remove_extra_b(labels):
if labels.count(self.settings.B) <= 1: return
i = 0
# find the first B
while i < len(labels) and labels[i] == self.settings.O1:
i += 1
i += 1 # skip B
# skip the following Is
while i < len(labels) and labels[i] == self.settings.I:
i += 1
# change all the other tags to O2
while i < len(labels):
labels[i] = self.settings.O2
i += 1
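# Worked example: with keep_first_b, [B, I, O1, B, I] becomes
# [B, I, O2, O2, O2]: every tag after the first golden answer span is
# rewritten to O2.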
def filter_and_preprocess_evidences(evidences):
for i, evi in enumerate(evidences):
# convert golden labels to labels ids
if Evidence.GOLDEN_LABELS in evi:
labels = [self.settings.label_map[l] \
for l in evi[Evidence.GOLDEN_LABELS]]
else:
labels = [self.settings.O1] * len(evi[Evidence.E_TOKENS])
# determine the current evidence is negative or not
answer_list = evi[Evidence.GOLDEN_ANSWERS]
is_negative = len(answer_list) == 1 \
and "".join(answer_list[0]).lower() == NO_ANSWER
# drop positive evidences that do not contain golden answer
# matches in training
is_all_o1 = labels.count(self.settings.O1) == len(labels)
if self.settings.is_training and is_all_o1 and not is_negative:
evidences[i] = None # dropped
continue
if self.settings.keep_first_b:
remove_extra_b(labels)
evi[Evidence.GOLDEN_LABELS] = labels
def get_eecom_feats_list(cur_sample_is_negative,
eecom_feats_list,
evidences):
if not self.settings.is_training:
return [item[EecommFeatures.EECOMM_FEATURES] \
for item in eecom_feats_list]
positive_eecom_feats_list = []
negative_eecom_feats_list = []
for eecom_feats_, other_evi in izip(eecom_feats_list, evidences):
if not other_evi: continue
eecom_feats = eecom_feats_[EecommFeatures.EECOMM_FEATURES]
if not eecom_feats: continue
other_evi_type = eecom_feats_[EecommFeatures.OTHER_E_TYPE]
if cur_sample_is_negative and \
other_evi_type != Evidence.POSITIVE:
continue
if other_evi_type == Evidence.POSITIVE:
positive_eecom_feats_list.append(eecom_feats)
else:
negative_eecom_feats_list.append(eecom_feats)
eecom_feats_list = positive_eecom_feats_list
if negative_eecom_feats_list:
eecom_feats_list += [negative_eecom_feats_list]
return eecom_feats_list
def process_tokens(data, tok_key):
ids = [self.settings.vocab.get(token, self.settings.oov_id) \
for token in data[tok_key]]
return ids
def process_evi(q_ids, evi, evidences):
e_ids = process_tokens(evi, Evidence.E_TOKENS)
labels = evi[Evidence.GOLDEN_LABELS]
qe_comm = evi[Evidence.QECOMM_FEATURES]
sample_type = evi[Evidence.TYPE]
ret = [None] * 5
ret[Q_IDS] = q_ids
ret[E_IDS] = e_ids
ret[LABELS] = labels
ret[QE_COMM] = qe_comm
eecom_feats_list = get_eecom_feats_list(
sample_type != Evidence.POSITIVE,
evi[Evidence.EECOMM_FEATURES_LIST],
evidences)
if not eecom_feats_list:
return None
else:
ret[EE_COMM] = eecom_feats_list
return ret
with utils.DotBar(utils.open_file(filename)) as f_:
for q_idx, line in enumerate(f_):
# parse json line
try:
data = json.loads(line)
except Exception:
logger.fatal("ERROR LINE: %s", line.strip())
traceback.print_exc()
continue
# convert question tokens to ids
q_ids = process_tokens(data, DataPoint.Q_TOKENS)
# process evidences
evidences = data[DataPoint.EVIDENCES]
filter_and_preprocess_evidences(evidences)
for evi in evidences:
if not evi: continue
sample = process_evi(q_ids, evi, evidences)
if sample: yield q_idx, sample, evi[Evidence.TYPE]
class DataReader(object):
def __iter__(self):
return self
def _next(self):
raise NotImplementedError()
def next(self):
data_point = self._next()
return self.post_process_sample(data_point)
def post_process_sample(self, sample):
ret = list(sample)
# choose eecom features randomly
eecom_feats = random.choice(sample[EE_COMM])
if not isinstance(eecom_feats[0], int):
# the other evidence is a negative evidence
eecom_feats = random.choice(eecom_feats)
ret[EE_COMM] = eecom_feats
return ret
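# Note: sample[EE_COMM] holds one ee.comm feature sequence per companion
# evidence, with all negative companions grouped into a single nested list
# (see get_eecom_feats_list in SampleStream). The two-level random.choice
# above therefore first picks a companion (positives individually, negatives
# as a group), then a member within the group if needed.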
class TrainingDataReader(DataReader):
def __init__(self,
sample_stream,
negative_ratio,
hit_ans_negative_ratio):
super(TrainingDataReader, self).__init__()
self.positive_data = []
self.hit_ans_negative_data = []
self.other_negative_data = []
self.negative_ratio = negative_ratio
self.hit_ans_negative_ratio = hit_ans_negative_ratio
self.p_idx = 0
self.hit_idx = 0
self.other_idx = 0
self.load_samples(sample_stream)
def add_data(self, positive, hit_negative, other_negative):
if not positive: return
self.positive_data.extend(positive)
for samples, target_list in \
zip((hit_negative, other_negative),
(self.hit_ans_negative_data, self.other_negative_data)):
if not samples: continue
# "0" is an index; see _next_negative_data() for how it is used
target_list.append([samples, 0])
def load_samples(self, sample_stream):
logger.info("loading data...")
last_q_id, positive, hit_negative, other_negative = None, [], [], []
for q_id, sample, type_ in sample_stream:
if last_q_id is not None and q_id != last_q_id:
self.add_data(positive, hit_negative, other_negative)
positive, hit_negative, other_negative = [], [], []
last_q_id = q_id
if type_ == Evidence.POSITIVE:
positive.append(sample)
elif type_ == Evidence.HIT_ANS_NEGATIVE:
hit_negative.append(sample)
elif type_ == Evidence.OTHER_NEGATIVE:
other_negative.append(sample)
else:
raise ValueError("wrong type: %s" % str(type_))
self.add_data(positive, hit_negative, other_negative)
# we are not sure whether the input data is shuffled or not
# so we shuffle them
random.shuffle(self.positive_data)
random.shuffle(self.hit_ans_negative_data)
random.shuffle(self.other_negative_data)
# set thresholds
if len(self.positive_data) == 0:
logger.fatal("zero positive sample")
raise ValueError("zero positive sample")
zero_hit = len(self.hit_ans_negative_data) == 0
zero_other = len(self.other_negative_data) == 0
if zero_hit and zero_other:
logger.fatal("zero negative sample")
raise ValueError("zero negative sample")
if zero_hit:
logger.warning("zero hit_ans_negative sample")
self.hit_ans_neg_threshold = 0
else:
self.hit_ans_neg_threshold = \
self.negative_ratio * self.hit_ans_negative_ratio
self.other_neg_threshold = self.negative_ratio
if zero_other:
logger.warning("zero other_negative sample")
self.hit_ans_neg_threshold = self.negative_ratio
logger.info("loaded")
def next_positive_data(self):
if self.p_idx >= len(self.positive_data):
random.shuffle(self.positive_data)
self.p_idx = 0
self.p_idx += 1
return self.positive_data[self.p_idx-1]
def _next_negative_data(self, idx, negative_data):
if idx >= len(negative_data):
random.shuffle(negative_data)
idx = 0
# a negative evidence is sampled in two steps:
# step 1: sample a question uniformly
# step 2: sample a negative evidence corresponding to the question
# uniformly
# bundle -> (sample, idx)
bundle = negative_data[idx]
if bundle[1] >= len(bundle[0]):
random.shuffle(bundle[0])
bundle[1] = 0
bundle[1] += 1
return idx+1, bundle[0][bundle[1]-1]
def next_hit_ans_negative_data(self):
self.hit_idx, data = self._next_negative_data(
self.hit_idx, self.hit_ans_negative_data)
return data
def next_other_negative_data(self):
self.other_idx, data = self._next_negative_data(
self.other_idx, self.other_negative_data)
return data
def _next(self):
rand = random.random()
if rand <= self.hit_ans_neg_threshold:
return self.next_hit_ans_negative_data()
elif rand < self.other_neg_threshold:
return self.next_other_negative_data()
else:
return self.next_positive_data()
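# Note: with the defaults in config.py (negative_sample_ratio=0.2,
# hit_ans_negative_sample_ratio=0.25), _next() draws a uniform rand and picks
#   rand <= 0.05      -> a hit-answer negative sample,
#   0.05 < rand < 0.2 -> an other-negative sample,
#   rand >= 0.2       -> a positive sample,
# so roughly 20% of the training stream is negative.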
class TestDataReader(DataReader):
def __init__(self, sample_stream):
super(TestDataReader, self).__init__()
self.data_generator = iter(sample_stream)
def _next(self):
q_idx, sample, type_ = self.data_generator.next()
return sample
def create_reader(filename, settings, samples_per_pass=sys.maxint):
if settings.is_training:
training_reader = TrainingDataReader(
SampleStream(filename, settings),
settings.negative_sample_ratio,
settings.hit_ans_negative_sample_ratio)
def wrapper():
for i, data in izip(xrange(samples_per_pass), training_reader):
yield data
return wrapper
else:
def wrapper():
sample_stream = SampleStream(filename, settings)
return TestDataReader(sample_stream)
return wrapper
import unittest
import os
import itertools
import math
import logging
# set up python path
topdir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..")
import sys
sys.path += [topdir, os.path.join(topdir, "data", "evaluation")]
import reader
import utils
formatter = logging.Formatter(
"[%(levelname)s %(asctime)s.%(msecs)d %(filename)s:%(lineno)d] %(message)s",
datefmt='%Y-%m-%d %I:%M:%S')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
utils.logger.addHandler(ch)
class Vocab(object):
@property
def data(self):
word_dict_path = os.path.join(
topdir, "data", "embedding", "wordvecs.vcb")
return utils.load_dict(word_dict_path)
class NegativeSampleRatioTest(unittest.TestCase):
def check_ratio(self, negative_sample_ratio):
for keep_first_b in [True, False]:
settings = reader.Settings(
vocab=Vocab().data,
is_training=True,
label_schema="BIO2",
negative_sample_ratio=negative_sample_ratio,
hit_ans_negative_sample_ratio=0.25,
keep_first_b=keep_first_b)
filename = os.path.join(topdir, "test", "trn_data.gz")
data_stream = reader.create_reader(filename, settings)
total, negative_num = 5000, 0
for _, d in itertools.izip(xrange(total), data_stream()):
labels = d[reader.LABELS]
if labels.count(0) == 0:
negative_num += 1
ratio = negative_num / float(total)
self.assertLessEqual(math.fabs(ratio - negative_sample_ratio), 0.01)
def runTest(self):
for ratio in [1., 0.25, 0.]:
self.check_ratio(ratio)
class KeepFirstBTest(unittest.TestCase):
def runTest(self):
for keep_first_b in [True, False]:
for label_schema in ["BIO", "BIO2"]:
settings = reader.Settings(
vocab=Vocab().data,
is_training=True,
label_schema=label_schema,
negative_sample_ratio=0.2,
hit_ans_negative_sample_ratio=0.25,
keep_first_b=keep_first_b)
filename = os.path.join(topdir, "test", "trn_data.gz")
data_stream = reader.create_reader(filename, settings)
total, at_least_one, one = 1000, 0, 0
for _, d in itertools.izip(xrange(total), data_stream()):
labels = d[reader.LABELS]
b_num = labels.count(0)
if b_num >= 1:
at_least_one += 1
if b_num == 1:
one += 1
self.assertLess(at_least_one, total)
if keep_first_b:
self.assertEqual(one, at_least_one)
else:
self.assertLess(one, at_least_one)
class DictTest(unittest.TestCase):
def runTest(self):
settings = reader.Settings(
vocab=Vocab().data,
is_training=True,
label_schema="BIO2",
negative_sample_ratio=0.2,
hit_ans_negative_sample_ratio=0.25,
keep_first_b=True)
filename = os.path.join(topdir, "test", "trn_data.gz")
data_stream = reader.create_reader(filename, settings)
q_uniq_ids, e_uniq_ids = set(), set()
for _, d in itertools.izip(xrange(1000), data_stream()):
q_uniq_ids.update(d[reader.Q_IDS])
e_uniq_ids.update(d[reader.E_IDS])
self.assertGreater(len(q_uniq_ids), 50)
self.assertGreater(len(e_uniq_ids), 50)
if __name__ == '__main__':
unittest.main()
import sys
import os
import argparse
import numpy as np
import paddle.v2 as paddle
import reader
import utils
import network
import config
from utils import logger
def save_model(trainer, model_save_dir, parameters, pass_id):
path = os.path.join(model_save_dir, "params_pass_%05d.tar.gz" % pass_id)
logger.info("saving model to %s" % path)
with utils.open_file(path, "w") as f:
trainer.save_parameter_to_tar(f)
def show_parameter_init_info(parameters):
"""
Print the information of initialization mean and std of parameters
:param parameters: the parameters created in a model
"""
logger.info("Parameter init info:")
for p in parameters:
p_val = parameters.get(p)
logger.info(("%-25s : initial_mean=%-7.4f initial_std=%-7.4f "
"actual_mean=%-7.4f actual_std=%-7.4f dims=%s") %
(p, parameters.__param_conf__[p].initial_mean,
parameters.__param_conf__[p].initial_std, p_val.mean(),
p_val.std(), parameters.__param_conf__[p].dims))
logger.info("\n")
def show_parameter_status(parameters):
"""
Print some statistical information of parameters in a network
:param parameters: the parameters created in a model
"""
for p in parameters:
abs_val = np.abs(parameters.get(p))
abs_grad = np.abs(parameters.get_grad(p))
logger.info(
("%-25s avg_abs_val=%-10.6f max_val=%-10.6f avg_abs_grad=%-10.6f "
"max_grad=%-10.6f min_val=%-10.6f min_grad=%-10.6f") %
(p, abs_val.mean(), abs_val.max(), abs_grad.mean(), abs_grad.max(),
abs_val.min(), abs_grad.min()))
def train(conf):
if not os.path.exists(conf.model_save_dir):
os.makedirs(conf.model_save_dir, mode=0755)
settings = reader.Settings(
vocab=conf.vocab,
is_training=True,
label_schema=conf.label_schema,
negative_sample_ratio=conf.negative_sample_ratio,
hit_ans_negative_sample_ratio=conf.hit_ans_negative_sample_ratio,
keep_first_b=conf.keep_first_b,
seed=conf.seed)
samples_per_pass = conf.batch_size * conf.batches_per_pass
train_reader = paddle.batch(
paddle.reader.buffered(
reader.create_reader(conf.train_data_path, settings,
samples_per_pass),
size=samples_per_pass),
batch_size=conf.batch_size)
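# Note: reader.create_reader caps each pass at samples_per_pass samples, so a
# pass consists of exactly batches_per_pass batches of batch_size samples.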
# TODO(lipeng17) v2 API does not support parallel_nn yet. Therefore, we can
# only use CPU currently
paddle.init(
use_gpu=conf.use_gpu,
trainer_count=conf.trainer_count,
seed=conf.paddle_seed)
# network config
cost = network.training_net(conf)
# create parameters
# NOTE: parameter values are not initialized here, therefore we print the
# parameter initialization info at the beginning of the first batch
parameters = paddle.parameters.create(cost)
# create optimizer
rmsprop_optimizer = paddle.optimizer.RMSProp(
learning_rate=conf.learning_rate,
rho=conf.rho,
epsilon=conf.epsilon,
model_average=paddle.optimizer.ModelAverage(
average_window=conf.average_window,
max_average_window=conf.max_average_window))
# create trainer
trainer = paddle.trainer.SGD(
cost=cost, parameters=parameters, update_equation=rmsprop_optimizer)
# begin training network
def _event_handler(event):
"""
Define end batch and end pass event handler
"""
if isinstance(event, paddle.event.EndIteration):
sys.stderr.write(".")
batch_num = event.batch_id + 1
total_batch = conf.batches_per_pass * event.pass_id + batch_num
if batch_num % conf.log_period == 0:
sys.stderr.write("\n")
logger.info("Total batch=%d Batch=%d CurrentCost=%f Eval: %s" \
% (total_batch, batch_num, event.cost, event.metrics))
if batch_num % conf.show_parameter_status_period == 0:
show_parameter_status(parameters)
elif isinstance(event, paddle.event.EndPass):
save_model(trainer, conf.model_save_dir, parameters, event.pass_id)
elif isinstance(event, paddle.event.BeginIteration):
if event.batch_id == 0 and event.pass_id == 0:
show_parameter_init_info(parameters)
## for debugging purpose
#with utils.open_file("config", "w") as config:
# print >> config, paddle.layer.parse_network(cost)
trainer.train(
reader=train_reader,
event_handler=_event_handler,
feeding=network.feeding,
num_passes=conf.num_passes)
logger.info("Training has finished.")
def main():
conf = config.TrainingConfig()
logger.info("loading word embeddings...")
conf.vocab, conf.wordvecs = utils.load_wordvecs(conf.word_dict_path,
conf.wordvecs_path)
logger.info("loaded")
logger.info("length of word dictionary is : %d." % len(conf.vocab))
train(conf)
if __name__ == "__main__":
main()
import argparse
import gzip
import logging
import sys
import numpy
__all__ = [
"open_file", "cumsum", "logger", "DotBar", "load_dict", "load_wordvecs"
]
logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)
def open_file(filename, *args1, **args2):
"""
Open a file
:param filename: name of the file
:type filename: str
:return: a file handler
"""
if filename.endswith(".gz"):
return gzip.open(filename, *args1, **args2)
else:
return open(filename, *args1, **args2)
def cumsum(array):
"""
Calculate the cumulative sum of array. For example, for array=[1, 2, 3],
the result is [1, 1+2, 1+2+3]
:param array: input array
:type array: python list or numpy array
:return: the accumulated sum of array
"""
if len(array) <= 1:
return list(array)
ret = list(array)
for i in xrange(1, len(ret)):
ret[i] += ret[i - 1]
return ret
class DotBar(object):
"""
A simple dot bar
"""
def __init__(self, obj, step=200, dots_per_line=50, f=sys.stderr):
"""
:param obj: an iterable object
:type obj: a Python iterator
:param step: print a dot every step iterations
:type step: int
:param dots_per_line: dots each line
:type dots_per_line: int
:param f: print dot to f, default value is sys.stderr
:type f: a file handler
"""
self.obj = obj
self.step = step
self.dots_per_line = dots_per_line
self.f = f
def __enter__(self):
self.obj.__enter__()
self.idx = 0
return self
def __exit__(self, exc_type, exc_value, traceback):
self.f.write("\n")
if self.obj is sys.stdin or self.obj is sys.stdout:
return
self.obj.__exit__(exc_type, exc_value, traceback)
def __iter__(self):
return self
def next(self):
self.idx += 1
if self.idx % self.step == 0:
self.f.write(".")
if self.idx % (self.step * self.dots_per_line) == 0:
self.f.write("\n")
return self.obj.next()
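# Example usage (as in reader.py):
#   with DotBar(open_file("data/data/training.json.gz")) as f:
#       for line in f:
#           ...  # one json line per iteration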
def load_dict(word_dict_path):
with open_file(word_dict_path) as f:
# the first word must be OOV
vocab = {k.rstrip("\n").split()[0].decode("utf-8"):i \
for i, k in enumerate(f)}
return vocab
def load_wordvecs(word_dict_path, wordvecs_path):
vocab = load_dict(word_dict_path)
wordvecs = numpy.loadtxt(wordvecs_path, delimiter=",", dtype="float32")
assert len(vocab) == wordvecs.shape[0]
return vocab, wordvecs
import os
import sys
import argparse
import time
import traceback
import subprocess
import re
import utils
import infer
import config
from utils import logger
def load_existing_results(eval_result_file):
evals = {}
with utils.open_file(eval_result_file) as f:
for line in f:
line = line.strip()
if not line: continue
pos = line.find(" ")
pass_id, ret = int(line[len("Pass="):pos]), line[pos + 1:]
evals[pass_id] = ret
return evals
__PATTERN_CHUNK_F1 = re.compile(r"chunk_f1=(\d+(\.\d+)?)")
def find_best_pass(evals):
results = []
for pass_id, eval_ret in evals.iteritems():
chunk_f1 = float(__PATTERN_CHUNK_F1.search(eval_ret).group(1))
results.append((pass_id, chunk_f1))
results.sort(key=lambda item: (-item[1], item[0]))
return results[0][0]
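# Note: results are sorted by descending chunk_f1 with pass id as the
# tie-breaker, so find_best_pass() returns the earliest pass that achieves
# the best validation F1.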
def eval_one_pass(infer_obj, conf, model_path, data_path, eval_script):
if not os.path.exists("tmp"): os.makedirs("tmp")
# model file is not ready
if not os.path.exists(model_path): return False
output_path = os.path.join("tmp", "%s_%s.txt.gz" % (
os.path.basename(model_path), os.path.basename(data_path)))
with utils.open_file(output_path, "w") as output:
try:
infer_obj.infer(model_path, data_path, output)
except Exception:
traceback.print_exc()
return None
cmd = [
"python", eval_script, output_path, data_path, "--fuzzy", "--schema",
conf.label_schema
]
logger.info("cmd: %s" % " ".join(cmd))
eval_ret = subprocess.check_output(cmd)
if "chunk_f1" not in eval_ret:
raise ValueError("Unknown error in cmd \"%s\"" % " ".join(cmd))
return eval_ret
def run_eval(infer_obj,
conf,
model_dir,
input_path,
eval_script,
log_file,
start_pass_id,
end_pass_id,
force_rerun=False):
if not force_rerun and os.path.exists(log_file):
evals = load_existing_results(log_file)
else:
evals = {}
with utils.open_file(log_file, "w") as log:
for i in xrange(start_pass_id, end_pass_id + 1):
if i in evals:
eval_ret = evals[i]
else:
pass_id = "%05d" % i
model_path = os.path.join(model_dir,
"params_pass_%s.tar.gz" % pass_id)
logger.info("Waiting for model %s ..." % model_path)
while True:
eval_ret = eval_one_pass(infer_obj, conf, model_path,
input_path, eval_script)
if eval_ret:
evals[i] = eval_ret
break
# wait for one minute and rerun
time.sleep(60)
print >> log, "Pass=%d %s" % (i, eval_ret.rstrip())
log.flush()
return evals
def parse_cmd():
parser = argparse.ArgumentParser()
parser.add_argument("model_dir")
parser.add_argument("data_type", choices=["ann", "ir"], default="ann")
parser.add_argument(
"--val_eval_output", help="validation set evaluation result file")
parser.add_argument(
"--tst_eval_output", help="test set evaluation result file")
parser.add_argument("--start_pass_id", type=int, default=0)
parser.add_argument(
"--end_pass_id", type=int, default=24, help="this pass is included")
parser.add_argument("--force_rerun", action="store_true")
return parser.parse_args()
__eval_scripts = {
"ann": "data/evaluation/evaluate-tagging-result.py",
"ir": "data/evaluation/evaluate-voting-result.py",
}
__val_data = {
"ann": "./data/data/validation.ann.json.gz",
"ir": "./data/data/validation.ir.json.gz",
}
__tst_data = {
"ann": "./data/data/test.ann.json.gz",
"ir": "./data/data/test.ir.json.gz",
}
def main(args):
conf = config.InferConfig()
conf.vocab = utils.load_dict(conf.word_dict_path)
logger.info("length of word dictionary is : %d." % len(conf.vocab))
if args.val_eval_output:
val_eval_output = args.val_eval_output
else:
val_eval_output = "eval.val.%s.txt" % args.data_type
if args.tst_eval_output:
tst_eval_output = args.tst_eval_output
else:
tst_eval_output = "eval.tst.%s.txt" % args.data_type
eval_script = __eval_scripts[args.data_type]
val_data_file = __val_data[args.data_type]
tst_data_file = __tst_data[args.data_type]
infer_obj = infer.Infer(conf)
val_evals = run_eval(
infer_obj,
conf,
args.model_dir,
val_data_file,
eval_script,
val_eval_output,
args.start_pass_id,
args.end_pass_id,
force_rerun=args.force_rerun)
best_pass_id = find_best_pass(val_evals)
tst_evals = run_eval(
infer_obj,
conf,
args.model_dir,
tst_data_file,
eval_script,
tst_eval_output,
start_pass_id=best_pass_id,
end_pass_id=best_pass_id,
force_rerun=args.force_rerun)
logger.info("Best Pass=%d" % best_pass_id)
logger.info("Validation: %s" % val_evals[best_pass_id])
logger.info("Test : %s" % tst_evals[best_pass_id])
if __name__ == "__main__":
main(parse_cmd())