Merge pull request #431 from ranqiu92/convseq2seq

Add the conv_seq_to_seq model (first version).

Merge pull request #431 from ranqiu92/convseq2seq
Add the conv_seq_to_seq model (first version).
1edc8317 · Cao Ying · GitHub · a743c87e · 5ee232f0 · 1edc8317
12 changed file
--- a/conv_seq2seq/README.md
+++ b/conv_seq2seq/README.md
+# Convolutional Sequence to Sequence Learning
+This model implements the work in the following paper:
+
+Jonas Gehring, Micheal Auli, David Grangier, et al. Convolutional Sequence to Sequence Learning. Association for Computational Linguistics (ACL), 2017
+
+# Training a Model
+- Modify the following script if needed and then run:
+
+	```bash
+	python train.py \
+	  --train_data_path ./data/train_data \
+	  --test_data_path ./data/test_data \
+	  --src_dict_path ./data/src_dict \
+	  --trg_dict_path ./data/trg_dict \
+	  --enc_blocks "[(256, 3)] * 5" \
+	  --dec_blocks "[(256, 3)] * 3" \
+	  --emb_size 256 \
+	  --pos_size 200 \
+	  --drop_rate 0.1 \
+	  --use_gpu False \
+	  --trainer_count 1 \
+	  --batch_size 32 \
+	  --num_passes 20 \
+	  >train.log 2>&1
+	```
+
+# Inferring by a Trained Model
+- Infer by a trained model by running:
+
+	```bash
+	python infer.py \
+	  --infer_data_path ./data/infer_data \
+	  --src_dict_path ./data/src_dict \
+	  --trg_dict_path ./data/trg_dict \
+	  --enc_blocks "[(256, 3)] * 5" \
+	  --dec_blocks "[(256, 3)] * 3" \
+	  --emb_size 256 \
+	  --pos_size 200 \
+	  --drop_rate 0.1 \
+	  --use_gpu False \
+	  --trainer_count 1 \
+	  --max_len 100 \
+	  --beam_size 1 \
+	  --model_path ./params.pass-0.tar.gz \
+	  1>infer_result 2>infer.log
+	```
+
+# Notes
+
+Currently, beam search will forward the encoder multiple times when predicting each target word, which requires extra computations. And we will fix it later.
--- a/conv_seq2seq/beamsearch.py
+++ b/conv_seq2seq/beamsearch.py
+#coding=utf-8
+
+import sys
+import time
+import numpy as np
+
+
+class BeamSearch(object):
+    """
+    Generate sequence by beam search
+    NOTE: this class only implements generating one sentence at a time.
+    """
+
+    def __init__(self,
+                 inferer,
+                 trg_dict,
+                 pos_size,
+                 padding_num,
+                 beam_size=1,
+                 max_len=100):
+        self.inferer = inferer
+        self.trg_dict = trg_dict
+        self.word_padding = trg_dict.__len__()
+        self.pos_size = pos_size
+        self.pos_padding = pos_size
+        self.padding_num = padding_num
+        self.win_len = padding_num + 1
+        self.max_len = max_len
+        self.beam_size = beam_size
+
+    def get_beam_input(self, pre_beam_list, infer_data):
+        """
+        Get input for generation at the current iteration.
+        """
+        beam_input = []
+
+        if len(pre_beam_list) == 0:
+            cur_trg = [self.word_padding
+                       ] * self.padding_num + [self.trg_dict['<s>']]
+            cur_trg_pos = [self.pos_padding] * self.padding_num + [0]
+            beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
+        else:
+            for seq in pre_beam_list:
+                if len(seq) < self.win_len:
+                    cur_trg = [self.word_padding] * (
+                        self.win_len - len(seq) - 1
+                    ) + [self.trg_dict['<s>']] + seq
+                    cur_trg_pos = [self.pos_padding] * (
+                        self.win_len - len(seq) - 1) + [0] + range(1,
+                                                                   len(seq) + 1)
+                else:
+                    cur_trg = seq[-self.win_len:]
+                    cur_trg_pos = range(
+                        len(seq) + 1 - self.win_len, len(seq) + 1)
+
+                beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
+        return beam_input
+
+    def get_prob(self, beam_input):
+        """
+        Get the probabilities of all possible tokens.
+        """
+        row_list = [j * self.win_len for j in range(len(beam_input))]
+        prob = self.inferer.infer(beam_input, field='value')[row_list, :]
+        return prob
+
+    def get_candidate(self, pre_beam_list, pre_beam_score, prob):
+        """
+        Get top beam_size tokens and their scores for each beam.
+        """
+        if prob.ndim == 1:
+            candidate_id = prob.argsort()[-self.beam_size:][::-1]
+            candidate_log_prob = np.log(prob[candidate_id])
+        else:
+            candidate_id = prob.argsort()[:, -self.beam_size:][:, ::-1]
+            candidate_log_prob = np.zeros_like(candidate_id).astype('float32')
+            for j in range(len(pre_beam_list)):
+                candidate_log_prob[j, :] = np.log(prob[j, candidate_id[j, :]])
+
+        if pre_beam_score.size > 0:
+            candidate_score = candidate_log_prob + pre_beam_score.reshape(
+                (pre_beam_score.size, 1))
+        else:
+            candidate_score = candidate_log_prob
+
+        return candidate_id, candidate_score
+
+    def prune(self, candidate_id, candidate_score, pre_beam_list,
+              completed_seq_list, completed_seq_score, completed_seq_min_score):
+        """
+        Pruning process of the beam search. During the process, beam_size most possible sequences
+        are selected for the beam in the next iteration. Besides, their scores and the minimum score
+        of the completed sequences are updated.
+        """
+        candidate_id = candidate_id.flatten()
+        candidate_score = candidate_score.flatten()
+
+        topk_idx = candidate_score.argsort()[-self.beam_size:][::-1].tolist()
+        topk_seq_idx = [idx / self.beam_size for idx in topk_idx]
+
+        next_beam = []
+        beam_score = []
+        for j in range(len(topk_idx)):
+            if candidate_id[topk_idx[j]] == self.trg_dict['<e>']:
+                if len(
+                        completed_seq_list
+                ) < self.beam_size or completed_seq_min_score <= candidate_score[
+                        topk_idx[j]]:
+                    completed_seq_list.append(pre_beam_list[topk_seq_idx[j]])
+                    completed_seq_score.append(candidate_score[topk_idx[j]])
+
+                    if completed_seq_min_score is None or (
+                            completed_seq_min_score >=
+                            candidate_score[topk_idx[j]] and
+                            len(completed_seq_list) < self.beam_size):
+                        completed_seq_min_score = candidate_score[topk_idx[j]]
+            else:
+                seq = pre_beam_list[topk_seq_idx[
+                    j]] + [candidate_id[topk_idx[j]]]
+                score = candidate_score[topk_idx[j]]
+                next_beam.append(seq)
+                beam_score.append(score)
+
+        beam_score = np.array(beam_score)
+        return next_beam, beam_score, completed_seq_min_score
+
+    def search_one_sample(self, infer_data):
+        """
+        Beam search process for one sample.
+        """
+        completed_seq_list = []
+        completed_seq_score = []
+        completed_seq_min_score = None
+        uncompleted_seq_list = [[]]
+        uncompleted_seq_score = np.zeros(0)
+
+        for i in xrange(self.max_len):
+            beam_input = self.get_beam_input(uncompleted_seq_list, infer_data)
+
+            prob = self.get_prob(beam_input)
+
+            candidate_id, candidate_score = self.get_candidate(
+                uncompleted_seq_list, uncompleted_seq_score, prob)
+
+            uncompleted_seq_list, uncompleted_seq_score, completed_seq_min_score = self.prune(
+                candidate_id, candidate_score, uncompleted_seq_list,
+                completed_seq_list, completed_seq_score,
+                completed_seq_min_score)
+
+            if len(uncompleted_seq_list) == 0:
+                break
+            if len(completed_seq_list) >= self.beam_size:
+                seq_max_score = uncompleted_seq_score.max()
+                if seq_max_score < completed_seq_min_score:
+                    uncompleted_seq_list = []
+                    break
+
+        final_seq_list = completed_seq_list + uncompleted_seq_list
+        final_score = np.concatenate(
+            (np.array(completed_seq_score), uncompleted_seq_score))
+        max_id = final_score.argmax()
+        top_seq = final_seq_list[max_id]
+        return top_seq
--- a/conv_seq2seq/infer.py
+++ b/conv_seq2seq/infer.py
+#coding=utf-8
+
+import sys
+import argparse
+import distutils.util
+import gzip
+
+import paddle.v2 as paddle
+from model import conv_seq2seq
+from beamsearch import BeamSearch
+import reader
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Convolutional Seq2Seq")
+    parser.add_argument(
+        '--infer_data_path',
+        type=str,
+        required=True,
+        help="Path of the dataset for inference")
+    parser.add_argument(
+        '--src_dict_path',
+        type=str,
+        required=True,
+        help='Path of the source dictionary')
+    parser.add_argument(
+        '--trg_dict_path',
+        type=str,
+        required=True,
+        help='path of the target dictionary')
+    parser.add_argument(
+        '--enc_blocks', type=str, help='Convolution blocks of the encoder')
+    parser.add_argument(
+        '--dec_blocks', type=str, help='Convolution blocks of the decoder')
+    parser.add_argument(
+        '--emb_size',
+        type=int,
+        default=512,
+        help='Dimension of word embedding. (default: %(default)s)')
+    parser.add_argument(
+        '--pos_size',
+        type=int,
+        default=200,
+        help='Total number of the position indexes. (default: %(default)s)')
+    parser.add_argument(
+        '--drop_rate',
+        type=float,
+        default=0.,
+        help='Dropout rate. (default: %(default)s)')
+    parser.add_argument(
+        "--use_gpu",
+        default=False,
+        type=distutils.util.strtobool,
+        help="Use gpu or not. (default: %(default)s)")
+    parser.add_argument(
+        "--trainer_count",
+        default=1,
+        type=int,
+        help="Trainer number. (default: %(default)s)")
+    parser.add_argument(
+        '--max_len',
+        type=int,
+        default=100,
+        help="The maximum length of the sentence to be generated. (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--beam_size",
+        default=1,
+        type=int,
+        help="The width of beam expasion. (default: %(default)s)")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="The path of trained model. (default: %(default)s)")
+    return parser.parse_args()
+
+
+def to_sentence(seq, dictionary):
+    raw_sentence = [dictionary[id] for id in seq]
+    sentence = " ".join(raw_sentence)
+    return sentence
+
+
+def infer(infer_data_path,
+          src_dict_path,
+          trg_dict_path,
+          model_path,
+          enc_conv_blocks,
+          dec_conv_blocks,
+          emb_dim=512,
+          pos_size=200,
+          drop_rate=0.,
+          max_len=100,
+          beam_size=1):
+    """
+    Inference.
+
+    :param infer_data_path: The path of the data for inference.
+    :type infer_data_path: str
+    :param src_dict_path: The path of the source dictionary.
+    :type src_dict_path: str
+    :param trg_dict_path: The path of the target dictionary.
+    :type trg_dict_path: str
+    :param model_path: The path of a trained model.
+    :type model_path: str
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type enc_conv_blocks: list of tuple
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type dec_conv_blocks: list of tuple
+    :param emb_dim: The dimension of the embedding vector.
+    :type emb_dim: int
+    :param pos_size: The total number of the position indexes, which means
+                     the maximum value of the index is pos_size - 1.
+    :type pos_size: int
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :param max_len: The maximum length of the sentence to be generated.
+    :type max_len: int
+    :param beam_size: The width of beam expansion.
+    :type beam_size: int
+    """
+    # load dict
+    src_dict = reader.load_dict(src_dict_path)
+    trg_dict = reader.load_dict(trg_dict_path)
+    src_dict_size = src_dict.__len__()
+    trg_dict_size = trg_dict.__len__()
+
+    prob = conv_seq2seq(
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        pos_size=pos_size,
+        emb_dim=emb_dim,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        drop_rate=drop_rate,
+        is_infer=True)
+
+    # load parameters
+    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
+
+    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
+    padding_num = reduce(lambda x, y: x + y, padding_list)
+    infer_reader = reader.data_reader(
+        data_file=infer_data_path,
+        src_dict=src_dict,
+        trg_dict=trg_dict,
+        pos_size=pos_size,
+        padding_num=padding_num)
+
+    inferer = paddle.inference.Inference(
+        output_layer=prob, parameters=parameters)
+
+    searcher = BeamSearch(
+        inferer=inferer,
+        trg_dict=trg_dict,
+        pos_size=pos_size,
+        padding_num=padding_num,
+        max_len=max_len,
+        beam_size=beam_size)
+
+    reverse_trg_dict = reader.get_reverse_dict(trg_dict)
+    for i, raw_data in enumerate(infer_reader()):
+        infer_data = [raw_data[0], raw_data[1]]
+        result = searcher.search_one_sample(infer_data)
+        sentence = to_sentence(result, reverse_trg_dict)
+        print sentence
+        sys.stdout.flush()
+    return
+
+
+def main():
+    args = parse_args()
+    enc_conv_blocks = eval(args.enc_blocks)
+    dec_conv_blocks = eval(args.dec_blocks)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    infer(
+        infer_data_path=args.infer_data_path,
+        src_dict_path=args.src_dict_path,
+        trg_dict_path=args.trg_dict_path,
+        model_path=args.model_path,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        emb_dim=args.emb_size,
+        pos_size=args.pos_size,
+        drop_rate=args.drop_rate,
+        max_len=args.max_len,
+        beam_size=args.beam_size)
+
+
+if __name__ == '__main__':
+    main()
--- a/conv_seq2seq/model.py
+++ b/conv_seq2seq/model.py
+#coding=utf-8
+
+import math
+
+import paddle.v2 as paddle
+
+__all__ = ["conv_seq2seq"]
+
+
+def gated_conv_with_batchnorm(input,
+                              size,
+                              context_len,
+                              context_start=None,
+                              learning_rate=1.0,
+                              drop_rate=0.):
+    """
+    Definition of the convolution block.
+
+    :param input: The input of this block.
+    :type input: LayerOutput
+    :param size: The dimension of the block's output.
+    :type size: int
+    :param context_len: The context length of the convolution.
+    :type context_len: int
+    :param context_start: The start position of the context.
+    :type context_start: int
+    :param learning_rate: The learning rate factor of the parameters in the block.
+                          The actual learning rate is the product of the global
+                          learning rate and this factor.
+    :type learning_rate: float
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :return: The output of the convolution block.
+    :rtype: LayerOutput
+    """
+    input = paddle.layer.dropout(input=input, dropout_rate=drop_rate)
+
+    context = paddle.layer.mixed(
+        size=input.size * context_len,
+        input=paddle.layer.context_projection(
+            input=input, context_len=context_len, context_start=context_start))
+
+    raw_conv = paddle.layer.fc(
+        input=context,
+        size=size * 2,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size),
+            learning_rate=learning_rate),
+        bias_attr=False)
+
+    batch_norm_conv = paddle.layer.batch_norm(
+        input=raw_conv,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(learning_rate=learning_rate))
+
+    with paddle.layer.mixed(size=size) as conv:
+        conv += paddle.layer.identity_projection(
+            batch_norm_conv, size=size, offset=0)
+
+    with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate:
+        gate += paddle.layer.identity_projection(
+            batch_norm_conv, size=size, offset=size)
+
+    with paddle.layer.mixed(size=size) as gated_conv:
+        gated_conv += paddle.layer.dotmul_operator(conv, gate)
+
+    return gated_conv
+
+
+def encoder(token_emb,
+            pos_emb,
+            conv_blocks=[(256, 3)] * 5,
+            num_attention=3,
+            drop_rate=0.1):
+    """
+    Definition of the encoder.
+
+    :param token_emb: The embedding vector of the input token.
+    :type token_emb: LayerOutput
+    :param pos_emb: The embedding vector of the input token's position.
+    :type pos_emb: LayerOutput
+    :param conv_blocks: The scale list of the convolution blocks. Each element of
+                        the list contains output dimension and context length of
+                        the corresponding convolution block.
+    :type conv_blocks: list of tuple
+    :param num_attention: The total number of the attention modules used in the decoder.
+    :type num_attention: int
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :return: The input token encoding.
+    :rtype: LayerOutput
+    """
+    embedding = paddle.layer.addto(
+        input=[token_emb, pos_emb],
+        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
+
+    proj_size = conv_blocks[0][0]
+    block_input = paddle.layer.fc(
+        input=embedding,
+        size=proj_size,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size),
+            learning_rate=1.0 / (2.0 * num_attention)),
+        bias_attr=True, )
+
+    for (size, context_len) in conv_blocks:
+        if block_input.size == size:
+            residual = block_input
+        else:
+            residual = paddle.layer.fc(
+                input=block_input,
+                size=size,
+                act=paddle.activation.Linear(),
+                param_attr=paddle.attr.Param(learning_rate=1.0 /
+                                             (2.0 * num_attention)),
+                bias_attr=True)
+
+        gated_conv = gated_conv_with_batchnorm(
+            input=block_input,
+            size=size,
+            context_len=context_len,
+            learning_rate=1.0 / (2.0 * num_attention),
+            drop_rate=drop_rate)
+
+        with paddle.layer.mixed(size=size) as block_output:
+            block_output += paddle.layer.identity_projection(residual)
+            block_output += paddle.layer.identity_projection(gated_conv)
+
+        # halve the variance of the sum
+        block_output = paddle.layer.slope_intercept(
+            input=block_output, slope=math.sqrt(0.5))
+
+        block_input = block_output
+
+    emb_dim = embedding.size
+    encoded_vec = paddle.layer.fc(
+        input=block_output,
+        size=emb_dim,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)),
+        bias_attr=True)
+
+    encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding])
+
+    # halve the variance of the sum
+    encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5))
+
+    return encoded_vec, encoded_sum
+
+
+def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum):
+    """
+    Definition of the attention.
+
+    :param decoder_state: The hidden state of the decoder.
+    :type decoder_state: LayerOutput
+    :param cur_embedding: The embedding vector of the current token.
+    :type cur_embedding: LayerOutput
+    :param encoded_vec: The source token encoding.
+    :type encoded_vec: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
+    :return: A context vector.
+    :rtype: LayerOutput
+    """
+    residual = decoder_state
+
+    state_size = decoder_state.size
+    emb_dim = cur_embedding.size
+    with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary:
+        state_summary += paddle.layer.full_matrix_projection(decoder_state)
+        state_summary += paddle.layer.identity_projection(cur_embedding)
+
+    # halve the variance of the sum
+    state_summary = paddle.layer.slope_intercept(
+        input=state_summary, slope=math.sqrt(0.5))
+
+    expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec)
+
+    m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec)
+
+    attention_weight = paddle.layer.fc(
+        input=m,
+        size=1,
+        act=paddle.activation.SequenceSoftmax(),
+        bias_attr=False)
+
+    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum)
+
+    attended = paddle.layer.pooling(
+        input=scaled, pooling_type=paddle.pooling.Sum())
+
+    attended_proj = paddle.layer.fc(
+        input=attended,
+        size=state_size,
+        act=paddle.activation.Linear(),
+        bias_attr=True)
+
+    attention_result = paddle.layer.addto(input=[attended_proj, residual])
+
+    # halve the variance of the sum
+    attention_result = paddle.layer.slope_intercept(
+        input=attention_result, slope=math.sqrt(0.5))
+    return attention_result
+
+
+def decoder(token_emb,
+            pos_emb,
+            encoded_vec,
+            encoded_sum,
+            dict_size,
+            conv_blocks=[(256, 3)] * 3,
+            drop_rate=0.1):
+    """
+    Definition of the decoder.
+
+    :param token_emb: The embedding vector of the input token.
+    :type token_emb: LayerOutput
+    :param pos_emb: The embedding vector of the input token's position.
+    :type pos_emb: LayerOutput
+    :param encoded_vec: The source token encoding.
+    :type encoded_vec: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
+    :param dict_size: The size of the target dictionary.
+    :type dict_size: int
+    :param conv_blocks: The scale list of the convolution blocks. Each element
+                        of the list contains output dimension and context length
+                        of the corresponding convolution block.
+    :type conv_blocks: list of tuple
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :return: The probability of the predicted token.
+    :rtype: LayerOutput
+    """
+
+    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum):
+        conditional = attention(
+            decoder_state=decoder_state,
+            cur_embedding=cur_embedding,
+            encoded_vec=encoded_vec,
+            encoded_sum=encoded_sum)
+        return conditional
+
+    embedding = paddle.layer.addto(
+        input=[token_emb, pos_emb],
+        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
+
+    proj_size = conv_blocks[0][0]
+    block_input = paddle.layer.fc(
+        input=embedding,
+        size=proj_size,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)),
+        bias_attr=True, )
+
+    for (size, context_len) in conv_blocks:
+        if block_input.size == size:
+            residual = block_input
+        else:
+            residual = paddle.layer.fc(
+                input=block_input,
+                size=size,
+                act=paddle.activation.Linear(),
+                bias_attr=True)
+
+        decoder_state = gated_conv_with_batchnorm(
+            input=block_input,
+            size=size,
+            context_len=context_len,
+            context_start=0,
+            drop_rate=drop_rate)
+
+        group_inputs = [
+            decoder_state,
+            embedding,
+            paddle.layer.StaticInput(input=encoded_vec),
+            paddle.layer.StaticInput(input=encoded_sum),
+        ]
+
+        conditional = paddle.layer.recurrent_group(
+            step=attention_step, input=group_inputs)
+
+        block_output = paddle.layer.addto(input=[conditional, residual])
+
+        # halve the variance of the sum
+        block_output = paddle.layer.slope_intercept(
+            input=block_output, slope=math.sqrt(0.5))
+
+        block_input = block_output
+
+    out_emb_dim = embedding.size
+    block_output = paddle.layer.fc(
+        input=block_output,
+        size=out_emb_dim,
+        act=paddle.activation.Linear(),
+        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
+
+    decoder_out = paddle.layer.fc(
+        input=block_output,
+        size=dict_size,
+        act=paddle.activation.Softmax(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)),
+        bias_attr=True)
+
+    return decoder_out
+
+
+def conv_seq2seq(src_dict_size,
+                 trg_dict_size,
+                 pos_size,
+                 emb_dim,
+                 enc_conv_blocks=[(256, 3)] * 5,
+                 dec_conv_blocks=[(256, 3)] * 3,
+                 drop_rate=0.1,
+                 is_infer=False):
+    """
+    Definition of convolutional sequence-to-sequence network.
+
+    :param src_dict_size: The size of the source dictionary.
+    :type src_dict_size: int
+    :param trg_dict_size: The size of the target dictionary.
+    :type trg_dict_size: int
+    :param pos_size: The total number of the position indexes, which means
+                     the maximum value of the index is pos_size - 1.
+    :type pos_size: int
+    :param emb_dim: The dimension of the embedding vector.
+    :type emb_dim: int
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
+    :type enc_conv_blocks: list of tuple
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
+    :type dec_conv_blocks: list of tuple
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :param is_infer: Whether infer or not.
+    :type is_infer: bool
+    :return: Cost or output layer.
+    :rtype: LayerOutput
+    """
+    src = paddle.layer.data(
+        name='src_word',
+        type=paddle.data_type.integer_value_sequence(src_dict_size))
+    src_pos = paddle.layer.data(
+        name='src_word_pos',
+        type=paddle.data_type.integer_value_sequence(pos_size +
+                                                     1))  # one for padding
+
+    src_emb = paddle.layer.embedding(
+        input=src,
+        size=emb_dim,
+        name='src_word_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+    src_pos_emb = paddle.layer.embedding(
+        input=src_pos,
+        size=emb_dim,
+        name='src_pos_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+
+    num_attention = len(dec_conv_blocks)
+    encoded_vec, encoded_sum = encoder(
+        token_emb=src_emb,
+        pos_emb=src_pos_emb,
+        conv_blocks=enc_conv_blocks,
+        num_attention=num_attention,
+        drop_rate=drop_rate)
+
+    trg = paddle.layer.data(
+        name='trg_word',
+        type=paddle.data_type.integer_value_sequence(trg_dict_size +
+                                                     1))  # one for padding
+    trg_pos = paddle.layer.data(
+        name='trg_word_pos',
+        type=paddle.data_type.integer_value_sequence(pos_size +
+                                                     1))  # one for padding
+
+    trg_emb = paddle.layer.embedding(
+        input=trg,
+        size=emb_dim,
+        name='trg_word_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+    trg_pos_emb = paddle.layer.embedding(
+        input=trg_pos,
+        size=emb_dim,
+        name='trg_pos_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+
+    decoder_out = decoder(
+        token_emb=trg_emb,
+        pos_emb=trg_pos_emb,
+        encoded_vec=encoded_vec,
+        encoded_sum=encoded_sum,
+        dict_size=trg_dict_size,
+        conv_blocks=dec_conv_blocks,
+        drop_rate=drop_rate)
+
+    if is_infer:
+        return decoder_out
+
+    trg_next_word = paddle.layer.data(
+        name='trg_next_word',
+        type=paddle.data_type.integer_value_sequence(trg_dict_size))
+    cost = paddle.layer.classification_cost(
+        input=decoder_out, label=trg_next_word)
+
+    return cost
--- a/conv_seq2seq/reader.py
+++ b/conv_seq2seq/reader.py
+#coding=utf-8
+
+import random
+
+
+def load_dict(dict_file):
+    word_dict = dict()
+    with open(dict_file, 'r') as f:
+        for i, line in enumerate(f):
+            w = line.strip().split()[0]
+            word_dict[w] = i
+    return word_dict
+
+
+def get_reverse_dict(dictionary):
+    reverse_dict = {dictionary[k]: k for k in dictionary.keys()}
+    return reverse_dict
+
+
+def load_data(data_file, src_dict, trg_dict):
+    UNK_IDX = src_dict['<unk>']
+    with open(data_file, 'r') as f:
+        for line in f:
+            line_split = line.strip().split('\t')
+            if len(line_split) < 2:
+                continue
+            src, trg = line_split
+            src_words = src.strip().split()
+            trg_words = trg.strip().split()
+            src_seq = [src_dict.get(w, UNK_IDX) for w in src_words]
+            trg_seq = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+            yield src_seq, trg_seq
+
+
+def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
+    def reader():
+        UNK_IDX = src_dict['<unk>']
+        word_padding = trg_dict.__len__()
+        pos_padding = pos_size
+
+        def _get_pos(pos_list, pos_size, pos_padding):
+            return [pos if pos < pos_size else pos_padding for pos in pos_list]
+
+        with open(data_file, 'r') as f:
+            for line in f:
+                line_split = line.strip().split('\t')
+                if len(line_split) != 2:
+                    continue
+                src, trg = line_split
+                src = src.strip().split()
+                src_word = [src_dict.get(w, UNK_IDX) for w in src]
+                src_word_pos = range(len(src_word))
+                src_word_pos = _get_pos(src_word_pos, pos_size, pos_padding)
+
+                trg = trg.strip().split()
+                trg_word = [trg_dict['<s>']
+                            ] + [trg_dict.get(w, UNK_IDX) for w in trg]
+                trg_word_pos = range(len(trg_word))
+                trg_word_pos = _get_pos(trg_word_pos, pos_size, pos_padding)
+
+                trg_next_word = trg_word[1:] + [trg_dict['<e>']]
+                trg_word = [word_padding] * padding_num + trg_word
+                trg_word_pos = [pos_padding] * padding_num + trg_word_pos
+                trg_next_word = trg_next_word + [trg_dict['<e>']] * padding_num
+                yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word
+
+    return reader
--- a/conv_seq2seq/train.py
+++ b/conv_seq2seq/train.py
+#coding=utf-8
+
+import os
+import sys
+import time
+import argparse
+import distutils.util
+import gzip
+import numpy as np
+
+import paddle.v2 as paddle
+from model import conv_seq2seq
+import reader
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Convolutional Seq2Seq")
+    parser.add_argument(
+        '--train_data_path',
+        type=str,
+        required=True,
+        help="Path of the training set")
+    parser.add_argument(
+        '--test_data_path', type=str, help='Path of the test set')
+    parser.add_argument(
+        '--src_dict_path',
+        type=str,
+        required=True,
+        help='Path of source dictionary')
+    parser.add_argument(
+        '--trg_dict_path',
+        type=str,
+        required=True,
+        help='Path of target dictionary')
+    parser.add_argument(
+        '--enc_blocks', type=str, help='Convolution blocks of the encoder')
+    parser.add_argument(
+        '--dec_blocks', type=str, help='Convolution blocks of the decoder')
+    parser.add_argument(
+        '--emb_size',
+        type=int,
+        default=512,
+        help='Dimension of word embedding. (default: %(default)s)')
+    parser.add_argument(
+        '--pos_size',
+        type=int,
+        default=200,
+        help='Total number of the position indexes. (default: %(default)s)')
+    parser.add_argument(
+        '--drop_rate',
+        type=float,
+        default=0.,
+        help='Dropout rate. (default: %(default)s)')
+    parser.add_argument(
+        "--use_gpu",
+        default=False,
+        type=distutils.util.strtobool,
+        help="Use gpu or not. (default: %(default)s)")
+    parser.add_argument(
+        "--trainer_count",
+        default=1,
+        type=int,
+        help="Trainer number. (default: %(default)s)")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help="Size of a mini-batch. (default: %(default)s)")
+    parser.add_argument(
+        '--num_passes',
+        type=int,
+        default=15,
+        help="Number of passes to train. (default: %(default)s)")
+    return parser.parse_args()
+
+
+def create_reader(padding_num,
+                  train_data_path,
+                  test_data_path=None,
+                  src_dict=None,
+                  trg_dict=None,
+                  pos_size=200,
+                  batch_size=32):
+
+    train_reader = paddle.batch(
+        reader=paddle.reader.shuffle(
+            reader=reader.data_reader(
+                data_file=train_data_path,
+                src_dict=src_dict,
+                trg_dict=trg_dict,
+                pos_size=pos_size,
+                padding_num=padding_num),
+            buf_size=10240),
+        batch_size=batch_size)
+
+    test_reader = None
+    if test_data_path:
+        test_reader = paddle.batch(
+            reader=paddle.reader.shuffle(
+                reader=reader.data_reader(
+                    data_file=test_data_path,
+                    src_dict=src_dict,
+                    trg_dict=trg_dict,
+                    pos_size=pos_size,
+                    padding_num=padding_num),
+                buf_size=10240),
+            batch_size=batch_size)
+
+    return train_reader, test_reader
+
+
+def train(train_data_path,
+          test_data_path,
+          src_dict_path,
+          trg_dict_path,
+          enc_conv_blocks,
+          dec_conv_blocks,
+          emb_dim=512,
+          pos_size=200,
+          drop_rate=0.,
+          batch_size=32,
+          num_passes=15):
+    """
+    Train the convolution sequence-to-sequence model.    
+
+    :param train_data_path: The path of the training set.
+    :type train_data_path: str
+    :param test_data_path: The path of the test set.
+    :type test_data_path: str
+    :param src_dict_path: The path of the source dictionary.
+    :type src_dict_path: str
+    :param trg_dict_path: The path of the target dictionary.
+    :type trg_dict_path: str
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type enc_conv_blocks: list of tuple
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type dec_conv_blocks: list of tuple
+    :param emb_dim: The dimension of the embedding vector.
+    :type emb_dim: int
+    :param pos_size: The total number of the position indexes, which means
+                     the maximum value of the index is pos_size - 1.
+    :type pos_size: int
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :param batch_size: The size of a mini-batch.
+    :type batch_size: int
+    :param num_passes: The total number of the passes to train.
+    :type num_passes: int
+    """
+    # load dict
+    src_dict = reader.load_dict(src_dict_path)
+    trg_dict = reader.load_dict(trg_dict_path)
+    src_dict_size = src_dict.__len__()
+    trg_dict_size = trg_dict.__len__()
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3, )
+
+    cost = conv_seq2seq(
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        pos_size=pos_size,
+        emb_dim=emb_dim,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        drop_rate=drop_rate,
+        is_infer=False)
+
+    # create parameters and trainer
+    parameters = paddle.parameters.create(cost)
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
+    padding_num = reduce(lambda x, y: x + y, padding_list)
+    train_reader, test_reader = create_reader(
+        padding_num=padding_num,
+        train_data_path=train_data_path,
+        test_data_path=test_data_path,
+        src_dict=src_dict,
+        trg_dict=trg_dict,
+        pos_size=pos_size,
+        batch_size=batch_size)
+
+    feeding = {
+        'src_word': 0,
+        'src_word_pos': 1,
+        'trg_word': 2,
+        'trg_word_pos': 3,
+        'trg_next_word': 4
+    }
+
+    # create event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 20 == 0:
+                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
+                print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
+                    cur_time, event.pass_id, event.batch_id, event.cost,
+                    event.metrics)
+            else:
+                sys.stdout.flush()
+
+        if isinstance(event, paddle.event.EndPass):
+            if test_reader is not None:
+                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
+                result = trainer.test(reader=test_reader, feeding=feeding)
+                print "[%s]: Pass: %d, TestCost: %f, %s" % (
+                    cur_time, event.pass_id, result.cost, result.metrics)
+                sys.stdout.flush()
+            with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
+                           'w') as f:
+                trainer.save_parameter_to_tar(f)
+
+    if not os.path.exists('output'):
+        os.mkdir('output')
+
+    trainer.train(
+        reader=train_reader,
+        event_handler=event_handler,
+        num_passes=num_passes,
+        feeding=feeding)
+
+
+def main():
+    args = parse_args()
+    enc_conv_blocks = eval(args.enc_blocks)
+    dec_conv_blocks = eval(args.dec_blocks)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    train(
+        train_data_path=args.train_data_path,
+        test_data_path=args.test_data_path,
+        src_dict_path=args.src_dict_path,
+        trg_dict_path=args.trg_dict_path,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        emb_dim=args.emb_size,
+        pos_size=args.pos_size,
+        drop_rate=args.drop_rate,
+        batch_size=args.batch_size,
+        num_passes=args.num_passes)
+
+
+if __name__ == '__main__':
+    main()
--- a/conv_seq_to_seq/README.md
+++ b/conv_seq_to_seq/README.md
+# Convolutional Sequence to Sequence Learning
+This model implements the work in the following paper:
+
+Jonas Gehring, Micheal Auli, David Grangier, et al. Convolutional Sequence to Sequence Learning. Association for Computational Linguistics (ACL), 2017
+
+# Training a Model
+- Modify the following script if needed and then run:
+
+	```bash
+	python train.py \
+	  --train_data_path ./data/train_data \
+	  --test_data_path ./data/test_data \
+	  --src_dict_path ./data/src_dict \
+	  --trg_dict_path ./data/trg_dict \
+	  --enc_blocks "[(256, 3)] * 5" \
+	  --dec_blocks "[(256, 3)] * 3" \
+	  --emb_size 256 \
+	  --pos_size 200 \
+	  --drop_rate 0.1 \
+	  --use_gpu False \
+	  --trainer_count 1 \
+	  --batch_size 32 \
+	  --num_passes 20 \
+	  >train.log 2>&1
+	```
+
+# Inferring by a Trained Model
+- Infer by a trained model by running:
+
+	```bash
+	python infer.py \
+	  --infer_data_path ./data/infer_data \
+	  --src_dict_path ./data/src_dict \
+	  --trg_dict_path ./data/trg_dict \
+	  --enc_blocks "[(256, 3)] * 5" \
+	  --dec_blocks "[(256, 3)] * 3" \
+	  --emb_size 256 \
+	  --pos_size 200 \
+	  --drop_rate 0.1 \
+	  --use_gpu False \
+	  --trainer_count 1 \
+	  --max_len 100 \
+	  --beam_size 1 \
+	  --model_path ./params.pass-0.tar.gz \
+	  1>infer_result 2>infer.log
+	```
+
+# Notes
+
+Currently, beam search will forward the encoder multiple times when predicting each target word, which requires extra computations. And we will fix it later.
--- a/conv_seq_to_seq/beamsearch.py
+++ b/conv_seq_to_seq/beamsearch.py
+#coding=utf-8
+
+import sys
+import time
+import numpy as np
+
+
+class BeamSearch(object):
+    """
+    Generate sequence by beam search
+    NOTE: this class only implements generating one sentence at a time.
+    """
+
+    def __init__(self,
+                 inferer,
+                 trg_dict,
+                 pos_size,
+                 padding_num,
+                 beam_size=1,
+                 max_len=100):
+        self.inferer = inferer
+        self.trg_dict = trg_dict
+        self.word_padding = trg_dict.__len__()
+        self.pos_size = pos_size
+        self.pos_padding = pos_size
+        self.padding_num = padding_num
+        self.win_len = padding_num + 1
+        self.max_len = max_len
+        self.beam_size = beam_size
+
+    def get_beam_input(self, pre_beam_list, infer_data):
+        """
+        Get input for generation at the current iteration.
+        """
+        beam_input = []
+
+        if len(pre_beam_list) == 0:
+            cur_trg = [self.word_padding
+                       ] * self.padding_num + [self.trg_dict['<s>']]
+            cur_trg_pos = [self.pos_padding] * self.padding_num + [0]
+            beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
+        else:
+            for seq in pre_beam_list:
+                if len(seq) < self.win_len:
+                    cur_trg = [self.word_padding] * (
+                        self.win_len - len(seq) - 1
+                    ) + [self.trg_dict['<s>']] + seq
+                    cur_trg_pos = [self.pos_padding] * (
+                        self.win_len - len(seq) - 1) + [0] + range(1,
+                                                                   len(seq) + 1)
+                else:
+                    cur_trg = seq[-self.win_len:]
+                    cur_trg_pos = range(
+                        len(seq) + 1 - self.win_len, len(seq) + 1)
+
+                beam_input.append(infer_data + [cur_trg] + [cur_trg_pos])
+        return beam_input
+
+    def get_prob(self, beam_input):
+        """
+        Get the probabilities of all possible tokens.
+        """
+        row_list = [j * self.win_len for j in range(len(beam_input))]
+        prob = self.inferer.infer(beam_input, field='value')[row_list, :]
+        return prob
+
+    def get_candidate(self, pre_beam_list, pre_beam_score, prob):
+        """
+        Get top beam_size tokens and their scores for each beam.
+        """
+        if prob.ndim == 1:
+            candidate_id = prob.argsort()[-self.beam_size:][::-1]
+            candidate_log_prob = np.log(prob[candidate_id])
+        else:
+            candidate_id = prob.argsort()[:, -self.beam_size:][:, ::-1]
+            candidate_log_prob = np.zeros_like(candidate_id).astype('float32')
+            for j in range(len(pre_beam_list)):
+                candidate_log_prob[j, :] = np.log(prob[j, candidate_id[j, :]])
+
+        if pre_beam_score.size > 0:
+            candidate_score = candidate_log_prob + pre_beam_score.reshape(
+                (pre_beam_score.size, 1))
+        else:
+            candidate_score = candidate_log_prob
+
+        return candidate_id, candidate_score
+
+    def prune(self, candidate_id, candidate_score, pre_beam_list,
+              completed_seq_list, completed_seq_score, completed_seq_min_score):
+        """
+        Pruning process of the beam search. During the process, beam_size most possible sequences
+        are selected for the beam in the next iteration. Besides, their scores and the minimum score
+        of the completed sequences are updated.
+        """
+        candidate_id = candidate_id.flatten()
+        candidate_score = candidate_score.flatten()
+
+        topk_idx = candidate_score.argsort()[-self.beam_size:][::-1].tolist()
+        topk_seq_idx = [idx / self.beam_size for idx in topk_idx]
+
+        next_beam = []
+        beam_score = []
+        for j in range(len(topk_idx)):
+            if candidate_id[topk_idx[j]] == self.trg_dict['<e>']:
+                if len(
+                        completed_seq_list
+                ) < self.beam_size or completed_seq_min_score <= candidate_score[
+                        topk_idx[j]]:
+                    completed_seq_list.append(pre_beam_list[topk_seq_idx[j]])
+                    completed_seq_score.append(candidate_score[topk_idx[j]])
+
+                    if completed_seq_min_score is None or (
+                            completed_seq_min_score >=
+                            candidate_score[topk_idx[j]] and
+                            len(completed_seq_list) < self.beam_size):
+                        completed_seq_min_score = candidate_score[topk_idx[j]]
+            else:
+                seq = pre_beam_list[topk_seq_idx[
+                    j]] + [candidate_id[topk_idx[j]]]
+                score = candidate_score[topk_idx[j]]
+                next_beam.append(seq)
+                beam_score.append(score)
+
+        beam_score = np.array(beam_score)
+        return next_beam, beam_score, completed_seq_min_score
+
+    def search_one_sample(self, infer_data):
+        """
+        Beam search process for one sample.
+        """
+        completed_seq_list = []
+        completed_seq_score = []
+        completed_seq_min_score = None
+        uncompleted_seq_list = [[]]
+        uncompleted_seq_score = np.zeros(0)
+
+        for i in xrange(self.max_len):
+            beam_input = self.get_beam_input(uncompleted_seq_list, infer_data)
+
+            prob = self.get_prob(beam_input)
+
+            candidate_id, candidate_score = self.get_candidate(
+                uncompleted_seq_list, uncompleted_seq_score, prob)
+
+            uncompleted_seq_list, uncompleted_seq_score, completed_seq_min_score = self.prune(
+                candidate_id, candidate_score, uncompleted_seq_list,
+                completed_seq_list, completed_seq_score,
+                completed_seq_min_score)
+
+            if len(uncompleted_seq_list) == 0:
+                break
+            if len(completed_seq_list) >= self.beam_size:
+                seq_max_score = uncompleted_seq_score.max()
+                if seq_max_score < completed_seq_min_score:
+                    uncompleted_seq_list = []
+                    break
+
+        final_seq_list = completed_seq_list + uncompleted_seq_list
+        final_score = np.concatenate(
+            (np.array(completed_seq_score), uncompleted_seq_score))
+        max_id = final_score.argmax()
+        top_seq = final_seq_list[max_id]
+        return top_seq
--- a/conv_seq_to_seq/infer.py
+++ b/conv_seq_to_seq/infer.py
+#coding=utf-8
+
+import sys
+import argparse
+import distutils.util
+import gzip
+
+import paddle.v2 as paddle
+from model import conv_seq2seq
+from beamsearch import BeamSearch
+import reader
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Convolutional Seq2Seq")
+    parser.add_argument(
+        '--infer_data_path',
+        type=str,
+        required=True,
+        help="Path of the dataset for inference")
+    parser.add_argument(
+        '--src_dict_path',
+        type=str,
+        required=True,
+        help='Path of the source dictionary')
+    parser.add_argument(
+        '--trg_dict_path',
+        type=str,
+        required=True,
+        help='path of the target dictionary')
+    parser.add_argument(
+        '--enc_blocks', type=str, help='Convolution blocks of the encoder')
+    parser.add_argument(
+        '--dec_blocks', type=str, help='Convolution blocks of the decoder')
+    parser.add_argument(
+        '--emb_size',
+        type=int,
+        default=512,
+        help='Dimension of word embedding. (default: %(default)s)')
+    parser.add_argument(
+        '--pos_size',
+        type=int,
+        default=200,
+        help='Total number of the position indexes. (default: %(default)s)')
+    parser.add_argument(
+        '--drop_rate',
+        type=float,
+        default=0.,
+        help='Dropout rate. (default: %(default)s)')
+    parser.add_argument(
+        "--use_gpu",
+        default=False,
+        type=distutils.util.strtobool,
+        help="Use gpu or not. (default: %(default)s)")
+    parser.add_argument(
+        "--trainer_count",
+        default=1,
+        type=int,
+        help="Trainer number. (default: %(default)s)")
+    parser.add_argument(
+        '--max_len',
+        type=int,
+        default=100,
+        help="The maximum length of the sentence to be generated. (default: %(default)s)"
+    )
+    parser.add_argument(
+        "--beam_size",
+        default=1,
+        type=int,
+        help="The width of beam expasion. (default: %(default)s)")
+    parser.add_argument(
+        "--model_path",
+        type=str,
+        required=True,
+        help="The path of trained model. (default: %(default)s)")
+    return parser.parse_args()
+
+
+def to_sentence(seq, dictionary):
+    raw_sentence = [dictionary[id] for id in seq]
+    sentence = " ".join(raw_sentence)
+    return sentence
+
+
+def infer(infer_data_path,
+          src_dict_path,
+          trg_dict_path,
+          model_path,
+          enc_conv_blocks,
+          dec_conv_blocks,
+          emb_dim=512,
+          pos_size=200,
+          drop_rate=0.,
+          max_len=100,
+          beam_size=1):
+    """
+    Inference.
+
+    :param infer_data_path: The path of the data for inference.
+    :type infer_data_path: str
+    :param src_dict_path: The path of the source dictionary.
+    :type src_dict_path: str
+    :param trg_dict_path: The path of the target dictionary.
+    :type trg_dict_path: str
+    :param model_path: The path of a trained model.
+    :type model_path: str
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type enc_conv_blocks: list of tuple
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type dec_conv_blocks: list of tuple
+    :param emb_dim: The dimension of the embedding vector.
+    :type emb_dim: int
+    :param pos_size: The total number of the position indexes, which means
+                     the maximum value of the index is pos_size - 1.
+    :type pos_size: int
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :param max_len: The maximum length of the sentence to be generated.
+    :type max_len: int
+    :param beam_size: The width of beam expansion.
+    :type beam_size: int
+    """
+    # load dict
+    src_dict = reader.load_dict(src_dict_path)
+    trg_dict = reader.load_dict(trg_dict_path)
+    src_dict_size = src_dict.__len__()
+    trg_dict_size = trg_dict.__len__()
+
+    prob = conv_seq2seq(
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        pos_size=pos_size,
+        emb_dim=emb_dim,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        drop_rate=drop_rate,
+        is_infer=True)
+
+    # load parameters
+    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))
+
+    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
+    padding_num = reduce(lambda x, y: x + y, padding_list)
+    infer_reader = reader.data_reader(
+        data_file=infer_data_path,
+        src_dict=src_dict,
+        trg_dict=trg_dict,
+        pos_size=pos_size,
+        padding_num=padding_num)
+
+    inferer = paddle.inference.Inference(
+        output_layer=prob, parameters=parameters)
+
+    searcher = BeamSearch(
+        inferer=inferer,
+        trg_dict=trg_dict,
+        pos_size=pos_size,
+        padding_num=padding_num,
+        max_len=max_len,
+        beam_size=beam_size)
+
+    reverse_trg_dict = reader.get_reverse_dict(trg_dict)
+    for i, raw_data in enumerate(infer_reader()):
+        infer_data = [raw_data[0], raw_data[1]]
+        result = searcher.search_one_sample(infer_data)
+        sentence = to_sentence(result, reverse_trg_dict)
+        print sentence
+        sys.stdout.flush()
+    return
+
+
+def main():
+    args = parse_args()
+    enc_conv_blocks = eval(args.enc_blocks)
+    dec_conv_blocks = eval(args.dec_blocks)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    infer(
+        infer_data_path=args.infer_data_path,
+        src_dict_path=args.src_dict_path,
+        trg_dict_path=args.trg_dict_path,
+        model_path=args.model_path,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        emb_dim=args.emb_size,
+        pos_size=args.pos_size,
+        drop_rate=args.drop_rate,
+        max_len=args.max_len,
+        beam_size=args.beam_size)
+
+
+if __name__ == '__main__':
+    main()
--- a/conv_seq_to_seq/model.py
+++ b/conv_seq_to_seq/model.py
+#coding=utf-8
+
+import math
+
+import paddle.v2 as paddle
+
+__all__ = ["conv_seq2seq"]
+
+
+def gated_conv_with_batchnorm(input,
+                              size,
+                              context_len,
+                              context_start=None,
+                              learning_rate=1.0,
+                              drop_rate=0.):
+    """
+    Definition of the convolution block.
+
+    :param input: The input of this block.
+    :type input: LayerOutput
+    :param size: The dimension of the block's output.
+    :type size: int
+    :param context_len: The context length of the convolution.
+    :type context_len: int
+    :param context_start: The start position of the context.
+    :type context_start: int
+    :param learning_rate: The learning rate factor of the parameters in the block.
+                          The actual learning rate is the product of the global
+                          learning rate and this factor.
+    :type learning_rate: float
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :return: The output of the convolution block.
+    :rtype: LayerOutput
+    """
+    input = paddle.layer.dropout(input=input, dropout_rate=drop_rate)
+
+    context = paddle.layer.mixed(
+        size=input.size * context_len,
+        input=paddle.layer.context_projection(
+            input=input, context_len=context_len, context_start=context_start))
+
+    raw_conv = paddle.layer.fc(
+        input=context,
+        size=size * 2,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt(4.0 * (1.0 - drop_rate) / context.size),
+            learning_rate=learning_rate),
+        bias_attr=False)
+
+    batch_norm_conv = paddle.layer.batch_norm(
+        input=raw_conv,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(learning_rate=learning_rate))
+
+    with paddle.layer.mixed(size=size) as conv:
+        conv += paddle.layer.identity_projection(
+            batch_norm_conv, size=size, offset=0)
+
+    with paddle.layer.mixed(size=size, act=paddle.activation.Sigmoid()) as gate:
+        gate += paddle.layer.identity_projection(
+            batch_norm_conv, size=size, offset=size)
+
+    with paddle.layer.mixed(size=size) as gated_conv:
+        gated_conv += paddle.layer.dotmul_operator(conv, gate)
+
+    return gated_conv
+
+
+def encoder(token_emb,
+            pos_emb,
+            conv_blocks=[(256, 3)] * 5,
+            num_attention=3,
+            drop_rate=0.1):
+    """
+    Definition of the encoder.
+
+    :param token_emb: The embedding vector of the input token.
+    :type token_emb: LayerOutput
+    :param pos_emb: The embedding vector of the input token's position.
+    :type pos_emb: LayerOutput
+    :param conv_blocks: The scale list of the convolution blocks. Each element of
+                        the list contains output dimension and context length of
+                        the corresponding convolution block.
+    :type conv_blocks: list of tuple
+    :param num_attention: The total number of the attention modules used in the decoder.
+    :type num_attention: int
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :return: The input token encoding.
+    :rtype: LayerOutput
+    """
+    embedding = paddle.layer.addto(
+        input=[token_emb, pos_emb],
+        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
+
+    proj_size = conv_blocks[0][0]
+    block_input = paddle.layer.fc(
+        input=embedding,
+        size=proj_size,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size),
+            learning_rate=1.0 / (2.0 * num_attention)),
+        bias_attr=True, )
+
+    for (size, context_len) in conv_blocks:
+        if block_input.size == size:
+            residual = block_input
+        else:
+            residual = paddle.layer.fc(
+                input=block_input,
+                size=size,
+                act=paddle.activation.Linear(),
+                param_attr=paddle.attr.Param(learning_rate=1.0 /
+                                             (2.0 * num_attention)),
+                bias_attr=True)
+
+        gated_conv = gated_conv_with_batchnorm(
+            input=block_input,
+            size=size,
+            context_len=context_len,
+            learning_rate=1.0 / (2.0 * num_attention),
+            drop_rate=drop_rate)
+
+        with paddle.layer.mixed(size=size) as block_output:
+            block_output += paddle.layer.identity_projection(residual)
+            block_output += paddle.layer.identity_projection(gated_conv)
+
+        # halve the variance of the sum
+        block_output = paddle.layer.slope_intercept(
+            input=block_output, slope=math.sqrt(0.5))
+
+        block_input = block_output
+
+    emb_dim = embedding.size
+    encoded_vec = paddle.layer.fc(
+        input=block_output,
+        size=emb_dim,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(learning_rate=1.0 / (2.0 * num_attention)),
+        bias_attr=True)
+
+    encoded_sum = paddle.layer.addto(input=[encoded_vec, embedding])
+
+    # halve the variance of the sum
+    encoded_sum = paddle.layer.slope_intercept(input=encoded_sum, slope=math.sqrt(0.5))
+
+    return encoded_vec, encoded_sum
+
+
+def attention(decoder_state, cur_embedding, encoded_vec, encoded_sum):
+    """
+    Definition of the attention.
+
+    :param decoder_state: The hidden state of the decoder.
+    :type decoder_state: LayerOutput
+    :param cur_embedding: The embedding vector of the current token.
+    :type cur_embedding: LayerOutput
+    :param encoded_vec: The source token encoding.
+    :type encoded_vec: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
+    :return: A context vector.
+    :rtype: LayerOutput
+    """
+    residual = decoder_state
+
+    state_size = decoder_state.size
+    emb_dim = cur_embedding.size
+    with paddle.layer.mixed(size=emb_dim, bias_attr=True) as state_summary:
+        state_summary += paddle.layer.full_matrix_projection(decoder_state)
+        state_summary += paddle.layer.identity_projection(cur_embedding)
+
+    # halve the variance of the sum
+    state_summary = paddle.layer.slope_intercept(
+        input=state_summary, slope=math.sqrt(0.5))
+
+    expanded = paddle.layer.expand(input=state_summary, expand_as=encoded_vec)
+
+    m = paddle.layer.linear_comb(weights=expanded, vectors=encoded_vec)
+
+    attention_weight = paddle.layer.fc(
+        input=m,
+        size=1,
+        act=paddle.activation.SequenceSoftmax(),
+        bias_attr=False)
+
+    scaled = paddle.layer.scaling(weight=attention_weight, input=encoded_sum)
+
+    attended = paddle.layer.pooling(
+        input=scaled, pooling_type=paddle.pooling.Sum())
+
+    attended_proj = paddle.layer.fc(
+        input=attended,
+        size=state_size,
+        act=paddle.activation.Linear(),
+        bias_attr=True)
+
+    attention_result = paddle.layer.addto(input=[attended_proj, residual])
+
+    # halve the variance of the sum
+    attention_result = paddle.layer.slope_intercept(
+        input=attention_result, slope=math.sqrt(0.5))
+    return attention_result
+
+
+def decoder(token_emb,
+            pos_emb,
+            encoded_vec,
+            encoded_sum,
+            dict_size,
+            conv_blocks=[(256, 3)] * 3,
+            drop_rate=0.1):
+    """
+    Definition of the decoder.
+
+    :param token_emb: The embedding vector of the input token.
+    :type token_emb: LayerOutput
+    :param pos_emb: The embedding vector of the input token's position.
+    :type pos_emb: LayerOutput
+    :param encoded_vec: The source token encoding.
+    :type encoded_vec: LayerOutput
+    :param encoded_sum: The sum of the source token's encoding and embedding.
+    :type encoded_sum: LayerOutput
+    :param dict_size: The size of the target dictionary.
+    :type dict_size: int
+    :param conv_blocks: The scale list of the convolution blocks. Each element
+                        of the list contains output dimension and context length
+                        of the corresponding convolution block.
+    :type conv_blocks: list of tuple
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :return: The probability of the predicted token.
+    :rtype: LayerOutput
+    """
+
+    def attention_step(decoder_state, cur_embedding, encoded_vec, encoded_sum):
+        conditional = attention(
+            decoder_state=decoder_state,
+            cur_embedding=cur_embedding,
+            encoded_vec=encoded_vec,
+            encoded_sum=encoded_sum)
+        return conditional
+
+    embedding = paddle.layer.addto(
+        input=[token_emb, pos_emb],
+        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
+
+    proj_size = conv_blocks[0][0]
+    block_input = paddle.layer.fc(
+        input=embedding,
+        size=proj_size,
+        act=paddle.activation.Linear(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / embedding.size)),
+        bias_attr=True, )
+
+    for (size, context_len) in conv_blocks:
+        if block_input.size == size:
+            residual = block_input
+        else:
+            residual = paddle.layer.fc(
+                input=block_input,
+                size=size,
+                act=paddle.activation.Linear(),
+                bias_attr=True)
+
+        decoder_state = gated_conv_with_batchnorm(
+            input=block_input,
+            size=size,
+            context_len=context_len,
+            context_start=0,
+            drop_rate=drop_rate)
+
+        group_inputs = [
+            decoder_state,
+            embedding,
+            paddle.layer.StaticInput(input=encoded_vec),
+            paddle.layer.StaticInput(input=encoded_sum),
+        ]
+
+        conditional = paddle.layer.recurrent_group(
+            step=attention_step, input=group_inputs)
+
+        block_output = paddle.layer.addto(input=[conditional, residual])
+
+        # halve the variance of the sum
+        block_output = paddle.layer.slope_intercept(
+            input=block_output, slope=math.sqrt(0.5))
+
+        block_input = block_output
+
+    out_emb_dim = embedding.size
+    block_output = paddle.layer.fc(
+        input=block_output,
+        size=out_emb_dim,
+        act=paddle.activation.Linear(),
+        layer_attr=paddle.attr.Extra(drop_rate=drop_rate))
+
+    decoder_out = paddle.layer.fc(
+        input=block_output,
+        size=dict_size,
+        act=paddle.activation.Softmax(),
+        param_attr=paddle.attr.Param(
+            initial_mean=0.,
+            initial_std=math.sqrt((1.0 - drop_rate) / block_output.size)),
+        bias_attr=True)
+
+    return decoder_out
+
+
+def conv_seq2seq(src_dict_size,
+                 trg_dict_size,
+                 pos_size,
+                 emb_dim,
+                 enc_conv_blocks=[(256, 3)] * 5,
+                 dec_conv_blocks=[(256, 3)] * 3,
+                 drop_rate=0.1,
+                 is_infer=False):
+    """
+    Definition of convolutional sequence-to-sequence network.
+
+    :param src_dict_size: The size of the source dictionary.
+    :type src_dict_size: int
+    :param trg_dict_size: The size of the target dictionary.
+    :type trg_dict_size: int
+    :param pos_size: The total number of the position indexes, which means
+                     the maximum value of the index is pos_size - 1.
+    :type pos_size: int
+    :param emb_dim: The dimension of the embedding vector.
+    :type emb_dim: int
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
+    :type enc_conv_blocks: list of tuple
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. Each element
+                            of the list contains output dimension and context length of the
+                            corresponding convolution block.
+    :type dec_conv_blocks: list of tuple
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :param is_infer: Whether infer or not.
+    :type is_infer: bool
+    :return: Cost or output layer.
+    :rtype: LayerOutput
+    """
+    src = paddle.layer.data(
+        name='src_word',
+        type=paddle.data_type.integer_value_sequence(src_dict_size))
+    src_pos = paddle.layer.data(
+        name='src_word_pos',
+        type=paddle.data_type.integer_value_sequence(pos_size +
+                                                     1))  # one for padding
+
+    src_emb = paddle.layer.embedding(
+        input=src,
+        size=emb_dim,
+        name='src_word_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+    src_pos_emb = paddle.layer.embedding(
+        input=src_pos,
+        size=emb_dim,
+        name='src_pos_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+
+    num_attention = len(dec_conv_blocks)
+    encoded_vec, encoded_sum = encoder(
+        token_emb=src_emb,
+        pos_emb=src_pos_emb,
+        conv_blocks=enc_conv_blocks,
+        num_attention=num_attention,
+        drop_rate=drop_rate)
+
+    trg = paddle.layer.data(
+        name='trg_word',
+        type=paddle.data_type.integer_value_sequence(trg_dict_size +
+                                                     1))  # one for padding
+    trg_pos = paddle.layer.data(
+        name='trg_word_pos',
+        type=paddle.data_type.integer_value_sequence(pos_size +
+                                                     1))  # one for padding
+
+    trg_emb = paddle.layer.embedding(
+        input=trg,
+        size=emb_dim,
+        name='trg_word_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+    trg_pos_emb = paddle.layer.embedding(
+        input=trg_pos,
+        size=emb_dim,
+        name='trg_pos_emb',
+        param_attr=paddle.attr.Param(initial_mean=0., initial_std=0.1))
+
+    decoder_out = decoder(
+        token_emb=trg_emb,
+        pos_emb=trg_pos_emb,
+        encoded_vec=encoded_vec,
+        encoded_sum=encoded_sum,
+        dict_size=trg_dict_size,
+        conv_blocks=dec_conv_blocks,
+        drop_rate=drop_rate)
+
+    if is_infer:
+        return decoder_out
+
+    trg_next_word = paddle.layer.data(
+        name='trg_next_word',
+        type=paddle.data_type.integer_value_sequence(trg_dict_size))
+    cost = paddle.layer.classification_cost(
+        input=decoder_out, label=trg_next_word)
+
+    return cost
--- a/conv_seq_to_seq/reader.py
+++ b/conv_seq_to_seq/reader.py
+#coding=utf-8
+
+import random
+
+
+def load_dict(dict_file):
+    word_dict = dict()
+    with open(dict_file, 'r') as f:
+        for i, line in enumerate(f):
+            w = line.strip().split()[0]
+            word_dict[w] = i
+    return word_dict
+
+
+def get_reverse_dict(dictionary):
+    reverse_dict = {dictionary[k]: k for k in dictionary.keys()}
+    return reverse_dict
+
+
+def load_data(data_file, src_dict, trg_dict):
+    UNK_IDX = src_dict['<unk>']
+    with open(data_file, 'r') as f:
+        for line in f:
+            line_split = line.strip().split('\t')
+            if len(line_split) < 2:
+                continue
+            src, trg = line_split
+            src_words = src.strip().split()
+            trg_words = trg.strip().split()
+            src_seq = [src_dict.get(w, UNK_IDX) for w in src_words]
+            trg_seq = [trg_dict.get(w, UNK_IDX) for w in trg_words]
+            yield src_seq, trg_seq
+
+
+def data_reader(data_file, src_dict, trg_dict, pos_size, padding_num):
+    def reader():
+        UNK_IDX = src_dict['<unk>']
+        word_padding = trg_dict.__len__()
+        pos_padding = pos_size
+
+        def _get_pos(pos_list, pos_size, pos_padding):
+            return [pos if pos < pos_size else pos_padding for pos in pos_list]
+
+        with open(data_file, 'r') as f:
+            for line in f:
+                line_split = line.strip().split('\t')
+                if len(line_split) != 2:
+                    continue
+                src, trg = line_split
+                src = src.strip().split()
+                src_word = [src_dict.get(w, UNK_IDX) for w in src]
+                src_word_pos = range(len(src_word))
+                src_word_pos = _get_pos(src_word_pos, pos_size, pos_padding)
+
+                trg = trg.strip().split()
+                trg_word = [trg_dict['<s>']
+                            ] + [trg_dict.get(w, UNK_IDX) for w in trg]
+                trg_word_pos = range(len(trg_word))
+                trg_word_pos = _get_pos(trg_word_pos, pos_size, pos_padding)
+
+                trg_next_word = trg_word[1:] + [trg_dict['<e>']]
+                trg_word = [word_padding] * padding_num + trg_word
+                trg_word_pos = [pos_padding] * padding_num + trg_word_pos
+                trg_next_word = trg_next_word + [trg_dict['<e>']] * padding_num
+                yield src_word, src_word_pos, trg_word, trg_word_pos, trg_next_word
+
+    return reader
--- a/conv_seq_to_seq/train.py
+++ b/conv_seq_to_seq/train.py
+#coding=utf-8
+
+import os
+import sys
+import time
+import argparse
+import distutils.util
+import gzip
+import numpy as np
+
+import paddle.v2 as paddle
+from model import conv_seq2seq
+import reader
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle Convolutional Seq2Seq")
+    parser.add_argument(
+        '--train_data_path',
+        type=str,
+        required=True,
+        help="Path of the training set")
+    parser.add_argument(
+        '--test_data_path', type=str, help='Path of the test set')
+    parser.add_argument(
+        '--src_dict_path',
+        type=str,
+        required=True,
+        help='Path of source dictionary')
+    parser.add_argument(
+        '--trg_dict_path',
+        type=str,
+        required=True,
+        help='Path of target dictionary')
+    parser.add_argument(
+        '--enc_blocks', type=str, help='Convolution blocks of the encoder')
+    parser.add_argument(
+        '--dec_blocks', type=str, help='Convolution blocks of the decoder')
+    parser.add_argument(
+        '--emb_size',
+        type=int,
+        default=512,
+        help='Dimension of word embedding. (default: %(default)s)')
+    parser.add_argument(
+        '--pos_size',
+        type=int,
+        default=200,
+        help='Total number of the position indexes. (default: %(default)s)')
+    parser.add_argument(
+        '--drop_rate',
+        type=float,
+        default=0.,
+        help='Dropout rate. (default: %(default)s)')
+    parser.add_argument(
+        "--use_gpu",
+        default=False,
+        type=distutils.util.strtobool,
+        help="Use gpu or not. (default: %(default)s)")
+    parser.add_argument(
+        "--trainer_count",
+        default=1,
+        type=int,
+        help="Trainer number. (default: %(default)s)")
+    parser.add_argument(
+        '--batch_size',
+        type=int,
+        default=32,
+        help="Size of a mini-batch. (default: %(default)s)")
+    parser.add_argument(
+        '--num_passes',
+        type=int,
+        default=15,
+        help="Number of passes to train. (default: %(default)s)")
+    return parser.parse_args()
+
+
+def create_reader(padding_num,
+                  train_data_path,
+                  test_data_path=None,
+                  src_dict=None,
+                  trg_dict=None,
+                  pos_size=200,
+                  batch_size=32):
+
+    train_reader = paddle.batch(
+        reader=paddle.reader.shuffle(
+            reader=reader.data_reader(
+                data_file=train_data_path,
+                src_dict=src_dict,
+                trg_dict=trg_dict,
+                pos_size=pos_size,
+                padding_num=padding_num),
+            buf_size=10240),
+        batch_size=batch_size)
+
+    test_reader = None
+    if test_data_path:
+        test_reader = paddle.batch(
+            reader=paddle.reader.shuffle(
+                reader=reader.data_reader(
+                    data_file=test_data_path,
+                    src_dict=src_dict,
+                    trg_dict=trg_dict,
+                    pos_size=pos_size,
+                    padding_num=padding_num),
+                buf_size=10240),
+            batch_size=batch_size)
+
+    return train_reader, test_reader
+
+
+def train(train_data_path,
+          test_data_path,
+          src_dict_path,
+          trg_dict_path,
+          enc_conv_blocks,
+          dec_conv_blocks,
+          emb_dim=512,
+          pos_size=200,
+          drop_rate=0.,
+          batch_size=32,
+          num_passes=15):
+    """
+    Train the convolution sequence-to-sequence model.    
+
+    :param train_data_path: The path of the training set.
+    :type train_data_path: str
+    :param test_data_path: The path of the test set.
+    :type test_data_path: str
+    :param src_dict_path: The path of the source dictionary.
+    :type src_dict_path: str
+    :param trg_dict_path: The path of the target dictionary.
+    :type trg_dict_path: str
+    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type enc_conv_blocks: list of tuple
+    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
+                            the list contains output dimension and context length of the corresponding
+                            convolution block.
+    :type dec_conv_blocks: list of tuple
+    :param emb_dim: The dimension of the embedding vector.
+    :type emb_dim: int
+    :param pos_size: The total number of the position indexes, which means
+                     the maximum value of the index is pos_size - 1.
+    :type pos_size: int
+    :param drop_rate: Dropout rate.
+    :type drop_rate: float
+    :param batch_size: The size of a mini-batch.
+    :type batch_size: int
+    :param num_passes: The total number of the passes to train.
+    :type num_passes: int
+    """
+    # load dict
+    src_dict = reader.load_dict(src_dict_path)
+    trg_dict = reader.load_dict(trg_dict_path)
+    src_dict_size = src_dict.__len__()
+    trg_dict_size = trg_dict.__len__()
+
+    optimizer = paddle.optimizer.Adam(
+        learning_rate=1e-3, )
+
+    cost = conv_seq2seq(
+        src_dict_size=src_dict_size,
+        trg_dict_size=trg_dict_size,
+        pos_size=pos_size,
+        emb_dim=emb_dim,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        drop_rate=drop_rate,
+        is_infer=False)
+
+    # create parameters and trainer
+    parameters = paddle.parameters.create(cost)
+    trainer = paddle.trainer.SGD(
+        cost=cost, parameters=parameters, update_equation=optimizer)
+
+    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
+    padding_num = reduce(lambda x, y: x + y, padding_list)
+    train_reader, test_reader = create_reader(
+        padding_num=padding_num,
+        train_data_path=train_data_path,
+        test_data_path=test_data_path,
+        src_dict=src_dict,
+        trg_dict=trg_dict,
+        pos_size=pos_size,
+        batch_size=batch_size)
+
+    feeding = {
+        'src_word': 0,
+        'src_word_pos': 1,
+        'trg_word': 2,
+        'trg_word_pos': 3,
+        'trg_next_word': 4
+    }
+
+    # create event handler
+    def event_handler(event):
+        if isinstance(event, paddle.event.EndIteration):
+            if event.batch_id % 20 == 0:
+                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
+                print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
+                    cur_time, event.pass_id, event.batch_id, event.cost,
+                    event.metrics)
+            else:
+                sys.stdout.flush()
+
+        if isinstance(event, paddle.event.EndPass):
+            if test_reader is not None:
+                cur_time = time.strftime('%Y.%m.%d %H:%M:%S', time.localtime())
+                result = trainer.test(reader=test_reader, feeding=feeding)
+                print "[%s]: Pass: %d, TestCost: %f, %s" % (
+                    cur_time, event.pass_id, result.cost, result.metrics)
+                sys.stdout.flush()
+            with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
+                           'w') as f:
+                trainer.save_parameter_to_tar(f)
+
+    if not os.path.exists('output'):
+        os.mkdir('output')
+
+    trainer.train(
+        reader=train_reader,
+        event_handler=event_handler,
+        num_passes=num_passes,
+        feeding=feeding)
+
+
+def main():
+    args = parse_args()
+    enc_conv_blocks = eval(args.enc_blocks)
+    dec_conv_blocks = eval(args.dec_blocks)
+
+    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
+
+    train(
+        train_data_path=args.train_data_path,
+        test_data_path=args.test_data_path,
+        src_dict_path=args.src_dict_path,
+        trg_dict_path=args.trg_dict_path,
+        enc_conv_blocks=enc_conv_blocks,
+        dec_conv_blocks=dec_conv_blocks,
+        emb_dim=args.emb_size,
+        pos_size=args.pos_size,
+        drop_rate=args.drop_rate,
+        batch_size=args.batch_size,
+        num_passes=args.num_passes)
+
+
+if __name__ == '__main__':
+    main()