Commit dbff6d68 authored by: W wangmeng28

Merge remote-tracking branch 'upstream/develop' into chinese_poetry

...@@ -11,6 +11,7 @@ import multiprocessing
import numpy as np
import paddle.v2 as paddle
from threading import local
import atexit
from data_utils.utility import read_manifest
from data_utils.utility import xmap_readers_mp
from data_utils.augmentor.augmentation import AugmentationPipeline
...@@ -59,6 +60,9 @@ class DataGenerator(object):
be passed forward directly without
converting to index sequence.
:type keep_transcription_text: bool
:param num_conv_layers: The number of convolution layers, used to compute
the sequence length.
:type num_conv_layers: int
"""
def __init__(self,
...@@ -74,7 +78,8 @@ class DataGenerator(object):
use_dB_normalization=True,
num_threads=multiprocessing.cpu_count() // 2,
random_seed=0,
keep_transcription_text=False,
num_conv_layers=2):
self._max_duration = max_duration
self._min_duration = min_duration
self._normalizer = FeatureNormalizer(mean_std_filepath)
...@@ -95,6 +100,7 @@ class DataGenerator(object):
self._local_data = local()
self._local_data.tar2info = {}
self._local_data.tar2object = {}
self._num_conv_layers = num_conv_layers
def process_utterance(self, filename, transcript):
"""Load, augment, featurize and normalize for speech data.
...@@ -213,7 +219,15 @@ class DataGenerator(object):
:return: Data feeding dict.
:rtype: dict
"""
feeding_dict = {
"audio_spectrogram": 0,
"transcript_text": 1,
"sequence_offset": 2,
"sequence_length": 3
}
for i in xrange(self._num_conv_layers):
feeding_dict["conv%d_index_range" % i] = len(feeding_dict)
return feeding_dict
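# Note: for the default num_conv_layers=2, the dict built above would be
# {"audio_spectrogram": 0, "transcript_text": 1, "sequence_offset": 2,
#  "sequence_length": 3, "conv0_index_range": 4, "conv1_index_range": 5};
# each additional convolution layer appends one more "conv%d_index_range" field.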
@property
def vocab_size(self):
...@@ -274,13 +288,18 @@ class DataGenerator(object):
for instance in manifest:
yield instance
reader, cleanup_callback = xmap_readers_mp(
lambda instance: self.process_utterance(instance["audio_filepath"], instance["text"]),
reader,
self._num_threads,
4096,
order=True)
# register the cleanup callback with atexit in the main process
atexit.register(cleanup_callback)
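# cleanup_callback (the second value returned by xmap_readers_mp) calls os._exit(0);
# registering it with atexit ensures the reader's worker processes are torn down
# when the main process exits instead of being left behind.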
return reader
def _padding_batch(self, batch, padding_to=-1, flatten=False):
"""
Padding audio features with zeros to make them have the same shape (or
...@@ -306,7 +325,30 @@ class DataGenerator(object):
padded_audio[:, :audio.shape[1]] = audio
if flatten:
padded_audio = padded_audio.flatten()
# Stride size for conv0 is (3, 2)
# Stride size for conv1 to convN is (1, 2)
# Same as the network, hard-coded here
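# Editorial note: for a convolution with stride s along one axis, the output
# length is out = (in - 1) // s + 1, which is what the computations below apply
# per axis. For example, assuming a padded spectrogram of shape (161, 300):
#   padded_conv0_h = (161 - 1) // 2 + 1 = 81
#   padded_conv0_w = (300 - 1) // 3 + 1 = 100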
padded_instance = [padded_audio, text]
padded_conv0_h = (padded_audio.shape[0] - 1) // 2 + 1
padded_conv0_w = (padded_audio.shape[1] - 1) // 3 + 1
valid_w = (audio.shape[1] - 1) // 3 + 1
padded_instance += [
[0], # sequence offset, always 0
[valid_w], # valid sequence length
# Index ranges for channel, height and width
# Please refer to the scale_sub_region layer for details
[1, 32, 1, padded_conv0_h, valid_w + 1, padded_conv0_w]
]
pre_padded_h = padded_conv0_h
for i in xrange(self._num_conv_layers - 1):
padded_h = (pre_padded_h - 1) // 2 + 1
pre_padded_h = padded_h
padded_instance += [
[1, 32, 1, padded_h, valid_w + 1, padded_conv0_w]
]
new_batch.append(padded_instance)
return new_batch
def _batch_shuffle(self, manifest, batch_size, clipped=False):
......
...@@ -138,6 +138,10 @@ def xmap_readers_mp(mapper, reader, process_num, buffer_size, order=False):
out_queue.put(sample)
out_queue.put(end_flag)
def cleanup():
# kill all subprocesses and threads
os._exit(0)
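# os._exit terminates the interpreter immediately, without running Python
# cleanup handlers, so hung worker processes or threads cannot block shutdown.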
def xreader():
# prepare shared memory
manager = Manager()
...@@ -174,4 +178,4 @@ def xmap_readers_mp(mapper, reader, process_num, buffer_size, order=False):
yield sample
sample = flush_queue.get()
return xreader, cleanup
...@@ -70,7 +70,6 @@ FILES = glob.glob('kenlm/util/*.cc') \
FILES += glob.glob('openfst-1.6.3/src/lib/*.cc')
# FILES + glob.glob('glog/src/*.cc')
FILES = [
fn for fn in FILES
if not (fn.endswith('main.cc') or fn.endswith('test.cc') or fn.endswith(
...@@ -107,7 +106,6 @@ decoders_module = [
'kenlm',
'openfst-1.6.3/src/include',
'ThreadPool',
#'glog/src'
],
libraries=LIBS,
extra_compile_args=ARGS)
...@@ -115,7 +113,7 @@ decoders_module = [
setup(
name='swig_decoders',
version='1.0',
description="""CTC decoders""",
ext_modules=decoders_module,
py_modules=['swig_decoders'], )
...@@ -69,7 +69,8 @@ def infer():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=1,
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.infer_manifest,
batch_size=args.num_samples,
...@@ -100,10 +101,11 @@ def infer():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)
error_rate_func = cer if args.error_rate_type == 'cer' else wer
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
print("\nTarget Transcription: %s\nOutput Transcription: %s" %
(target, result))
......
...@@ -165,7 +165,7 @@ class DeepSpeech2Model(object):
def infer_batch(self, infer_data, decoding_method, beam_alpha, beam_beta,
beam_size, cutoff_prob, cutoff_top_n, vocab_list,
language_model_path, num_processes, feeding_dict):
"""Model inference. Infer the transcription for a batch of speech
utterances.
...@@ -195,6 +195,9 @@ class DeepSpeech2Model(object):
:type language_model_path: basestring|None
:param num_processes: Number of processes (CPU) for decoder.
:type num_processes: int
:param feeding_dict: A map from data field names to the tuple indices
of the data returned by the reader.
:type feeding_dict: dict|list
:return: List of transcription texts.
:rtype: List of basestring
"""
...@@ -203,10 +206,13 @@ class DeepSpeech2Model(object):
self._inferer = paddle.inference.Inference(
output_layer=self._log_probs, parameters=self._parameters)
# run inference
infer_results = self._inferer.infer(
input=infer_data, feeding=feeding_dict)
start_pos = [0] * (len(infer_data) + 1)
for i in xrange(len(infer_data)):
start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
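# infer_data[i][3] holds the valid sequence length of utterance i (index 3,
# "sequence_length", in the feeding dict), so start_pos accumulates the offsets
# used below to slice the flattened infer_results into per-utterance chunks.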
probs_split = [
infer_results[start_pos[i]:start_pos[i + 1]]
for i in xrange(0, len(infer_data))
]
# run decoder
...@@ -274,9 +280,25 @@ class DeepSpeech2Model(object):
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(vocab_size))
seq_offset_data = paddle.layer.data(
name='sequence_offset',
type=paddle.data_type.integer_value_sequence(1))
seq_len_data = paddle.layer.data(
name='sequence_length',
type=paddle.data_type.integer_value_sequence(1))
index_range_datas = []
for i in xrange(num_rnn_layers):
index_range_datas.append(
paddle.layer.data(
name='conv%d_index_range' % i,
type=paddle.data_type.dense_vector(6)))
self._log_probs, self._loss = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
seq_offset_data=seq_offset_data,
seq_len_data=seq_len_data,
index_range_datas=index_range_datas,
dict_size=vocab_size,
num_conv_layers=num_conv_layers,
num_rnn_layers=num_rnn_layers,
......
...@@ -7,7 +7,7 @@ import paddle.v2 as paddle
def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding, act, index_range_data):
"""Convolution layer with batch normalization.
:param input: Input layer.
...@@ -24,6 +24,8 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
:type padding: int|tuple|list
:param act: Activation type.
:type act: BaseActivation
:param index_range_data: Index range indicating the padded sub-region to reset.
:type index_range_data: LayerOutput
:return: Batch norm layer after convolution layer.
:rtype: LayerOutput
"""
...@@ -36,7 +38,11 @@ def conv_bn_layer(input, filter_size, num_channels_in, num_channels_out, stride,
padding=padding,
act=paddle.activation.Linear(),
bias_attr=False)
batch_norm = paddle.layer.batch_norm(input=conv_layer, act=act)
# reset padding part to 0
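# index_range_data is a 6-value dense vector produced by the data generator:
# [channel_start, channel_end, height_start, height_end, width_start, width_end].
# As built in DataGenerator._padding_batch it covers the zero-padded time steps
# (width beyond the valid sequence length) of all 32 channels, so scaling that
# sub-region by value=0.0 clears activations that come from padding.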
scale_sub_region = paddle.layer.scale_sub_region(
batch_norm, index_range_data, value=0.0)
return scale_sub_region
def bidirectional_simple_rnn_bn_layer(name, input, size, act, share_weights):
...@@ -136,13 +142,15 @@ def bidirectional_gru_bn_layer(name, input, size, act):
return paddle.layer.concat(input=[forward_gru, backward_gru])
def conv_group(input, num_stacks, index_range_datas):
"""Convolution group with stacked convolution layers.
:param input: Input layer.
:type input: LayerOutput
:param num_stacks: Number of stacked convolution layers.
:type num_stacks: int
:param index_range_datas: Index ranges for each convolution layer.
:type index_range_datas: tuple|list
:return: Output layer of the convolution group.
:rtype: LayerOutput
"""
...@@ -153,7 +161,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(3, 2),
padding=(5, 20),
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[0])
for i in xrange(num_stacks - 1):
conv = conv_bn_layer(
input=conv,
...@@ -162,7 +171,8 @@ def conv_group(input, num_stacks):
num_channels_out=32,
stride=(1, 2),
padding=(5, 10),
act=paddle.activation.BRelu(),
index_range_data=index_range_datas[i + 1])
output_num_channels = 32
output_height = 160 // pow(2, num_stacks) + 1
return conv, output_num_channels, output_height
...@@ -207,6 +217,9 @@ def rnn_group(input, size, num_stacks, use_gru, share_rnn_weights):
def deep_speech_v2_network(audio_data,
text_data,
seq_offset_data,
seq_len_data,
index_range_datas,
dict_size,
num_conv_layers=2,
num_rnn_layers=3,
...@@ -219,6 +232,12 @@ def deep_speech_v2_network(audio_data,
:type audio_data: LayerOutput
:param text_data: Transcription text data layer.
:type text_data: LayerOutput
:param seq_offset_data: Sequence offset data layer.
:type seq_offset_data: LayerOutput
:param seq_len_data: Valid sequence length data layer.
:type seq_len_data: LayerOutput
:param index_range_datas: Index ranges data layers.
:type index_range_datas: tuple|list
:param dict_size: Dictionary size for tokenized transcription.
:type dict_size: int
:param num_conv_layers: Number of stacking convolution layers.
...@@ -239,7 +258,9 @@ def deep_speech_v2_network(audio_data,
"""
# convolution group
conv_group_output, conv_group_num_channels, conv_group_height = conv_group(
input=audio_data,
num_stacks=num_conv_layers,
index_range_datas=index_range_datas)
# convert data from convolution feature map to sequence of vectors
conv2seq = paddle.layer.block_expand(
input=conv_group_output,
...@@ -248,9 +269,16 @@ def deep_speech_v2_network(audio_data,
stride_y=1,
block_x=1,
block_y=conv_group_height)
# remove padding part
remove_padding_data = paddle.layer.sub_seq(
input=conv2seq,
offsets=seq_offset_data,
sizes=seq_len_data,
act=paddle.activation.Linear(),
bias_attr=False)
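# For every utterance, sub_seq keeps sizes[i] time steps starting at offsets[i]
# (the offset is always 0 here), so the zero-padded tail of the convolution
# output never reaches the recurrent layers below.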
# rnn group
rnn_group_output = rnn_group(
input=remove_padding_data,
size=rnn_size,
num_stacks=num_rnn_layers,
use_gru=use_gru,
......
...@@ -70,7 +70,8 @@ def evaluate():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
batch_reader = data_generator.batch_reader_creator(
manifest_path=args.test_manifest,
batch_size=args.batch_size,
...@@ -103,8 +104,9 @@ def evaluate():
cutoff_top_n=args.cutoff_top_n,
vocab_list=vocab_list,
language_model_path=args.lang_model_path,
num_processes=args.num_proc_bsearch,
feeding_dict=data_generator.feeding)
target_transcripts = [data[1] for data in infer_data]
for target, result in zip(target_transcripts, result_transcripts):
error_sum += error_rate_func(target, result)
num_ins += 1
......
...@@ -88,7 +88,8 @@ def tune():
augmentation_config='{}',
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
keep_transcription_text=True,
num_conv_layers=args.num_conv_layers)
audio_data = paddle.layer.data(
name="audio_spectrogram",
...@@ -96,10 +97,25 @@ def tune():
text_data = paddle.layer.data(
name="transcript_text",
type=paddle.data_type.integer_value_sequence(data_generator.vocab_size))
seq_offset_data = paddle.layer.data(
name='sequence_offset',
type=paddle.data_type.integer_value_sequence(1))
seq_len_data = paddle.layer.data(
name='sequence_length',
type=paddle.data_type.integer_value_sequence(1))
index_range_datas = []
for i in xrange(args.num_rnn_layers):
index_range_datas.append(
paddle.layer.data(
name='conv%d_index_range' % i,
type=paddle.data_type.dense_vector(6)))
output_probs, _ = deep_speech_v2_network(
audio_data=audio_data,
text_data=text_data,
seq_offset_data=seq_offset_data,
seq_len_data=seq_len_data,
index_range_datas=index_range_datas,
dict_size=data_generator.vocab_size,
num_conv_layers=args.num_conv_layers,
num_rnn_layers=args.num_rnn_layers,
...@@ -156,15 +172,17 @@ def tune():
for infer_data in batch_reader():
if (args.num_batches >= 0) and (cur_batch >= args.num_batches):
break
infer_results = inferer.infer(input=infer_data,
feeding=data_generator.feeding)
start_pos = [0] * (len(infer_data) + 1)
for i in xrange(len(infer_data)):
start_pos[i + 1] = start_pos[i] + infer_data[i][3][0]
probs_split = [
infer_results[start_pos[i]:start_pos[i + 1]]
for i in xrange(0, len(infer_data))
]
target_transcripts = [data[1] for data in infer_data]
num_ins += len(target_transcripts)
# grid search
......
...@@ -75,13 +75,15 @@ def train():
max_duration=args.max_duration,
min_duration=args.min_duration,
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
dev_generator = DataGenerator(
vocab_filepath=args.vocab_path,
mean_std_filepath=args.mean_std_path,
augmentation_config="{}",
specgram_type=args.specgram_type,
num_threads=args.num_proc_data,
num_conv_layers=args.num_conv_layers)
train_batch_reader = train_generator.batch_reader_creator(
manifest_path=args.train_manifest,
batch_size=args.batch_size,
......
# Deep Structured Semantic Models (DSSM)

DSSM uses a DNN to learn low-dimensional representation vectors of text in a continuous semantic space and to model the semantic similarity between two sentences. This example shows how to use PaddlePaddle to implement a generic DSSM model for modeling the semantic similarity between two strings. The implementation supports a generic data format, so the model can be applied to real scenarios simply by swapping in your own data.
## Background

DSSM \[[1](#参考文献)\] is a classic semantic model proposed by Microsoft Research in 2013 for learning the semantic distance between two texts. More broadly, the model also applies to scenarios such as:

1. CTR prediction, measuring the relevance between a user search query (Query) and a set of candidate web pages (Documents).
2. Text relevance, measuring the semantic relatedness between two strings.
3. Recommendation, measuring the relatedness between a User and a recommended Item.

DSSM has evolved into a framework that naturally models the distance between two records: for text relevance, cosine similarity can be used to express the semantic distance, while for ranking search-engine results, a Rank loss can be attached on top of DSSM to train a ranking model.
## Model overview

In the original paper \[[1](#参考文献)\], DSSM is used to measure the latent semantic relationship between a user search Query and a set of Documents. The model structure is shown below:
...@@ -23,12 +18,9 @@ DSSM has evolved into a framework that naturally models the distance between two records
Figure 1. Original DSSM structure
</p>
The core idea is to **use a DNN to map high-dimensional feature vectors into continuous vectors in a low-dimensional space (the red boxes in the figure)** and, **at the top layer, to use cosine similarity to measure the semantic relevance between the search query and each candidate document**.

For the loss at the very top, the original model uses negative sampling similar to Word2Vec: for each Query one positive example $D+$ and four negative examples $D-$ are drawn, a conditional probability over them is computed, and the log-likelihood is used as the loss. This is the $P(D_1|Q)$-like structure in Figure 1; see the original paper for details.
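Written out (a sketch using the notation of the original paper, where $R(Q, D)$ is the cosine similarity between the two semantic vectors and $\gamma$ is a smoothing factor), the conditional probability and the loss take roughly the following form:

$$P(D^+ \mid Q) = \frac{\exp\big(\gamma R(Q, D^+)\big)}{\sum_{D' \in \{D^+, D^-_1, \dots, D^-_4\}} \exp\big(\gamma R(Q, D')\big)}, \qquad L = -\log \prod_{(Q, D^+)} P(D^+ \mid Q)$$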
With subsequent refinements the DSSM structure was simplified \[[3](#参考文献)\] and evolved into:
...@@ -37,37 +29,30 @@ DSSM has evolved into a framework that naturally models the distance between two records
Figure 2. Generic DSSM structure
</p>
The blank boxes in the figure can be replaced with any model, e.g. fully connected layers (FC), convolutional networks (CNN), or RNNs. The structure is dedicated to measuring the semantic distance between two elements (such as strings). In practice, DSSM serves as a basic building block that is combined with different loss functions to implement a concrete task, for example:

- In learning to rank, adding a pairwise rank loss to the structure in Figure 2 turns it into a ranking model
- In CTR prediction, treating click / no-click as 0/1 binary classification and adding a cross-entropy loss turns it into a classification model
- When a string pair needs a single score, cosine similarity can be used, turning it into a regression model

This example provides a fairly general, application-oriented solution. The supported task types are:

- Classification
- Regression within the range [-1, 1]
- Pairwise-Rank

For generating the low-dimensional semantic vectors, the following three structures are supported:

- FC, multi-layer fully connected layers
- CNN, convolutional neural network
- RNN, recurrent neural network
## Model implementation

The DSSM model can be split into three parts: the left and right DNNs, and the loss function on top. In complex tasks the structures of the two DNNs may differ; in the original paper the left and right networks learn semantic vectors for the Query and the Document respectively, and since their input data differ, it is advisable to customize each DNN accordingly.

**For simplicity and generality, this example uses the same structure for both DNNs, so there are only three options: FC, CNN and RNN.**
Three kinds of loss are supported as well: classification, regression and ranking. For the regression and ranking losses, the match between the two sides is computed with cosine similarity; for classification, the distribution over classes is computed with softmax.
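A minimal sketch of the two kinds of output head (assuming `left_vec` and `right_vec` are the semantic vectors produced by the two DNNs, `label` is the corresponding label data layer, and `class_num` is the number of classes):

```python
# Regression / ranking head: cosine similarity of the two semantic vectors.
sim = paddle.layer.cos_sim(left_vec, right_vec)

# Classification head: concatenate both vectors and predict a class distribution.
concated_vector = paddle.layer.concat([left_vec, right_vec])
prediction = paddle.layer.fc(
    input=concated_vector,
    size=class_num,
    act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(input=prediction, label=label)
```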
Many of the concepts above are covered in detail in other tutorials, for example:
...@@ -77,19 +62,17 @@ The DSSM model can be split into three parts: the left and right DNNs, and the
so the underlying principles are not repeated here; the rest of this document focuses on how to implement these structures with PaddlePaddle.

As shown in Figure 3, the regression and classification models share a similar structure:
<p align="center">
<img src="./images/dssm3.jpg"/><br/><br/>
Figure 3. DSSM for REGRESSION or CLASSIFICATION
</p>

The most important components are the word embeddings, the two learners `(1)` and `(2)` in the figure that produce the low-dimensional vectors (each can be implemented with any of RNN/CNN/FC), and the loss function at the top.
The Pairwise Rank structure is a bit more complex: the structure of Figure 4 appears twice, with a corresponding loss function added. The overall idea is:

- Given the same source, score the left and right targets separately — `(a)` and `(b)`; the learning objective is the ordering between (a) and (b)
- `(a)` and `(b)` have the same structure as in Figure 3 and score a (source, target) pair
- `(1)` and `(2)` actually share the same structure and both represent the same source; the figure unrolls them into two only for presentation
...@@ -98,17 +81,18 @@ The DSSM model can be split into three parts: the left and right DNNs, and the
Figure 4. DSSM for Pairwise Rank
</p>
The concrete implementation of each part is described below; all the code is contained in `./network_conf.py`.

### Creating the word embedding table for the text
```python
def create_embedding(self, input, prefix=''):
    """
    Create word embedding. The `prefix` is added in front of the name of
    the embedding's learnable parameter.
    """
    logger.info("Create embedding table [%s] whose dimension is %d" %
                (prefix, self.dnn_dims[0]))
    emb = paddle.layer.embedding(
        input=input,
...@@ -123,14 +107,15 @@ def create_embedding(self, input, prefix=''):
```python
def create_cnn(self, emb, prefix=''):
    """
    A multi-layer CNN.

    :param emb: The word embedding.
    :type emb: paddle.layer
    :param prefix: The prefix added to the layers' names.
    :type prefix: str
    """

    def create_conv(context_len, hidden_size, prefix):
        key = "%s_%d_%d" % (prefix, context_len, hidden_size)
        conv = paddle.networks.sequence_conv_pool(
...@@ -138,21 +123,18 @@ def create_cnn(self, emb, prefix=''):
            context_len=context_len,
            hidden_size=hidden_size,
            # set parameter attr for parameter sharing
            context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
            fc_param_attr=ParamAttr(name=key + "_fc.w"),
            fc_bias_attr=ParamAttr(name=key + "_fc.b"),
            pool_bias_attr=ParamAttr(name=key + "_pool.b"))
        return conv

    conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
    conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
    return conv_3, conv_4
```
The CNN takes the sequence of word embeddings and, through convolution and pooling, captures the key information of the original sentence, finally producing a single semantic vector (which can be regarded as a sentence vector).

In this implementation, the sentence vectors learned by CNNs with context windows of 3 and 4 are summed element-wise to obtain the final sentence vector.
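A minimal sketch of that combination (assuming the element-wise sum is done with `paddle.layer.addto`; the actual code lives in `network_conf.py`):

```python
conv_3, conv_4 = self.create_cnn(emb, prefix='cnn')
# Element-wise sum of the sentence vectors learned with context windows 3 and 4.
sent_vec = paddle.layer.addto(input=[conv_3, conv_4])
```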
...@@ -162,9 +144,9 @@ RNNs are well suited to learning from variable-length sequences; using an RNN to learn sentence information is almost
```python
def create_rnn(self, emb, prefix=''):
    """
    A GRU sentence vector learner.
    """
    gru = paddle.networks.simple_gru(
        input=emb,
        size=self.dnn_dims[1],
...@@ -176,18 +158,19 @@ def create_rnn(self, emb, prefix=''):
    return sent_vec
```
### Multi-layer fully connected network (FC)
```python
def create_fc(self, emb, prefix=''):
    """
    A multi-layer fully connected neural network.

    :param emb: The output of the embedding layer
    :type emb: paddle.layer
    :param prefix: A prefix will be added to the layers' names.
    :type prefix: str
    """
    _input_layer = paddle.layer.pooling(
        input=emb, pooling_type=paddle.pooling.Max())
    fc = paddle.layer.fc(
...@@ -198,21 +181,17 @@ def create_fc(self, emb, prefix=''):
    return fc
```
When building the fully connected network, `paddle.layer.pooling` is first used to apply max pooling over the sequence of word vectors, turning the variable-length sequence into a fixed-dimension vector that represents the whole sentence; max pooling reduces the influence of sentence length on the sentence representation.

### Multi-layer DNN
After the CNN/RNN/FC has produced a semantic vector, additional fully connected layers can be stacked on top to form a deeper DNN.
```python
def create_dnn(self, sent_vec, prefix):
    if len(self.dnn_dims) > 1:
        _input_layer = sent_vec
        for id, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, id, dim)
            fc = paddle.layer.fc(
                input=_input_layer,
                size=dim,
...@@ -224,119 +203,13 @@ def create_dnn(self, sent_vec, prefix):
    return _input_layer
```
### Classification and regression
The structures of the classification and regression models are similar, so a single function builds both. The full implementation is the `_build_classification_or_regression_model` function in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py), shown below:

```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
### Pairwise Rank
Pairwise Rank reuses the DNN structures above: the same source is scored against two targets, and if the left target scores higher the prediction is 1, otherwise 0. The implementation is the `_build_rank_model` function in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py), shown below:
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not be used.
return cost, None, None
```
## Data format
Simple example data is provided in `./data`.
...@@ -371,7 +244,6 @@ def _build_rank_model(self):
6 10 \t 8 3 1 \t 1
```
### Data format for ranking
```
# 4 fields each line:
...@@ -391,68 +263,11 @@ def _build_rank_model(self):
## Training
You can run `python train.py -y 0 --model_arch 0` directly; it uses the sample data in `./data/classification` to check that training a classification FC model runs end to end.

Other model structures can also be configured from the command line; run `python train.py --help` for the full set of options, listed below:
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
The most important parameters are:

- `train_data_path` Path of the training data
- `test_data_path` Path of the test data (optional)
...@@ -462,49 +277,8 @@ optional arguments:
- `model_arch` Model architecture: 0 for FC, 1 for CNN, 2 for RNN
- `dnn_dims` Dimensions of the DNN layers, `256,128,64,32` by default, i.e. a 4-layer DNN whose layers have those sizes
## Predicting with a trained model
The detailed command-line options can be listed with `python infer.py --help`; the usage is shown below and the most important parameters are explained after it.
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
Most parameters mirror those of `train.py`; the important ones are:
- `data_path` Path of the data to predict on
- `prediction_output_path` Output path for the predictions
......
...@@ -65,10 +65,11 @@ In below, we describe how to train DSSM model in PaddlePaddle. All the codes are
### Create a word vector table for the text
```python
def create_embedding(self, input, prefix=''):
    """
    Create word embedding. The `prefix` is added in front of the name of
    the embedding's learnable parameter.
    """
    logger.info("Create embedding table [%s] whose dimension is %d" %
                (prefix, self.dnn_dims[0]))
    emb = paddle.layer.embedding(
        input=input,
...@@ -82,14 +83,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin
### CNN implementation
```python
def create_cnn(self, emb, prefix=''):
    """
    A multi-layer CNN.

    :param emb: The word embedding.
    :type emb: paddle.layer
    :param prefix: The prefix added to the layers' names.
    :type prefix: str
    """

    def create_conv(context_len, hidden_size, prefix):
        key = "%s_%d_%d" % (prefix, context_len, hidden_size)
        conv = paddle.networks.sequence_conv_pool(
...@@ -97,15 +99,13 @@ def create_cnn(self, emb, prefix=''):
            context_len=context_len,
            hidden_size=hidden_size,
            # set parameter attr for parameter sharing
            context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
            fc_param_attr=ParamAttr(name=key + "_fc.w"),
            fc_bias_attr=ParamAttr(name=key + "_fc.b"),
            pool_bias_attr=ParamAttr(name=key + "_pool.b"))
        return conv

    conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
    conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
    return conv_3, conv_4
```
...@@ -118,9 +118,9 @@ RNN is suitable for learning variable length of the information
```python
def create_rnn(self, emb, prefix=''):
    """
    A GRU sentence vector learner.
    """
    gru = paddle.networks.simple_gru(
        input=emb,
        size=self.dnn_dims[1],
...@@ -136,14 +136,15 @@ def create_rnn(self, emb, prefix=''):
```python
def create_fc(self, emb, prefix=''):
    """
    A multi-layer fully connected neural network.

    :param emb: The output of the embedding layer
    :type emb: paddle.layer
    :param prefix: A prefix will be added to the layers' names.
    :type prefix: str
    """
    _input_layer = paddle.layer.pooling(
        input=emb, pooling_type=paddle.pooling.Max())
    fc = paddle.layer.fc(
...@@ -160,13 +161,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling
```python
def create_dnn(self, sent_vec, prefix):
    if len(self.dnn_dims) > 1:
        _input_layer = sent_vec
        for id, dim in enumerate(self.dnn_dims[1:]):
            name = "%s_fc_%d_%d" % (prefix, id, dim)
            fc = paddle.layer.fc(
                input=_input_layer,
                size=dim,
...@@ -180,117 +178,12 @@ def create_dnn(self, sent_vec, prefix):
### Classification / Regression
The structures of the classification and regression models are similar, so the function below can be used for both tasks; see `_build_classification_or_regression_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the full implementation.
```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
### Pairwise Rank
The Pairwise Rank model is implemented by the function `_build_rank_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py), shown below.
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not be used.
return cost, None, None
```
## Data Format
Below is a simple example for the data in `./data`
...@@ -347,67 +240,7 @@ The example of this format is as follows.
## Training
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. The full set of parameters accepted by `train.py` can be listed by running `python train.py --help`:
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
Some important parameters are:

- `train_data_path` Training data path
- `test_data_path` Test data path, optional
...@@ -418,48 +251,8 @@ Parameter description:
- `dnn_dims` The dimensions of the model's layers, `256,128,64,32` by default, i.e. 4 layers with those sizes.
## To predict using the trained model
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
The usage above can also be printed by running `python infer.py --help`. The important parameters are:
- `data_path` Path for the data to predict
- `prediction_output_path` Prediction output path
......
...@@ -107,10 +107,11 @@ In below, we describe how to train DSSM model in PaddlePaddle. All the codes are
### Create a word vector table for the text
```python ```python
def create_embedding(self, input, prefix=''): def create_embedding(self, input, prefix=''):
''' """
Create an embedding table whose name has a `prefix`. Create word embedding. The `prefix` is added in front of the name of
    ''' embedding's learnable parameter.
logger.info("create embedding table [%s] which dimention is %d" % """
    logger.info("Create embedding table [%s] whose dimension is %d" %
(prefix, self.dnn_dims[0])) (prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding( emb = paddle.layer.embedding(
input=input, input=input,
...@@ -124,14 +125,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin ...@@ -124,14 +125,15 @@ Since the input (embedding table) is a list of the IDs of the words correspondin
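To make this concrete, the following is a minimal sketch of turning a raw sentence into the id sequence the embedding layer consumes; the `vocab` dict and `UNK` id are illustrative stand-ins for the real word dictionary (see `sent2ids` in `utils.py`).

```python
# Convert words to integer ids before the embedding lookup.
UNK = 2                                  # illustrative id for out-of-vocabulary words
vocab = {"hello": 0, "world": 1}         # illustrative word dictionary
sent = "hello brave world"
ids = [vocab.get(w, UNK) for w in sent.split()]
print(ids)                               # [0, 2, 1]
```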
### CNN implementation ### CNN implementation
```python ```python
def create_cnn(self, emb, prefix=''): def create_cnn(self, emb, prefix=''):
'''
"""
A multi-layer CNN. A multi-layer CNN.
:param emb: The word embedding.
:type emb: paddle.layer
    :param prefix: The prefix that will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `cnn` parts.
'''
def create_conv(context_len, hidden_size, prefix): def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size) key = "%s_%d_%d" % (prefix, context_len, hidden_size)
conv = paddle.networks.sequence_conv_pool( conv = paddle.networks.sequence_conv_pool(
...@@ -139,15 +141,13 @@ def create_cnn(self, emb, prefix=''): ...@@ -139,15 +141,13 @@ def create_cnn(self, emb, prefix=''):
context_len=context_len, context_len=context_len,
hidden_size=hidden_size, hidden_size=hidden_size,
# set parameter attr for parameter sharing # set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'), context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
fc_param_attr=ParamAttr(name=key + '_fc.w'), fc_param_attr=ParamAttr(name=key + "_fc.w"),
fc_bias_attr=ParamAttr(name=key + '_fc.b'), fc_bias_attr=ParamAttr(name=key + "_fc.b"),
pool_bias_attr=ParamAttr(name=key + '_pool.b')) pool_bias_attr=ParamAttr(name=key + "_pool.b"))
return conv return conv
logger.info('create a sequence_conv_pool which context width is 3')
conv_3 = create_conv(3, self.dnn_dims[1], "cnn") conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4')
conv_4 = create_conv(4, self.dnn_dims[1], "cnn") conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4 return conv_3, conv_4
``` ```
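The two `sequence_conv_pool` outputs act as 3-gram and 4-gram feature extractors over the sentence. The fragment below is a sketch of how they are consumed inside `network_conf.py` (mirroring the internal `_model_arch_creater` helper, so `self`, `emb` and `prefix` refer to the DSSM instance, the embedding layer and the name prefix):

```python
# Chain the CNN branch with the DNN stacked on top of it.
sent_vec = self.create_cnn(emb, prefix)   # returns the pair (conv_3, conv_4)
dnn = self.create_dnn(sent_vec, prefix)   # both pooled outputs feed the first fc layer
```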
...@@ -160,9 +160,9 @@ RNN is suitable for learning variable length of the information ...@@ -160,9 +160,9 @@ RNN is suitable for learning variable length of the information
```python ```python
def create_rnn(self, emb, prefix=''): def create_rnn(self, emb, prefix=''):
''' """
A GRU sentence vector learner. A GRU sentence vector learner.
''' """
gru = paddle.networks.simple_gru( gru = paddle.networks.simple_gru(
input=emb, input=emb,
size=self.dnn_dims[1], size=self.dnn_dims[1],
...@@ -178,14 +178,15 @@ def create_rnn(self, emb, prefix=''): ...@@ -178,14 +178,15 @@ def create_rnn(self, emb, prefix=''):
```python ```python
def create_fc(self, emb, prefix=''): def create_fc(self, emb, prefix=''):
'''
"""
    A multi-layer fully connected neural networks. A multi-layer fully connected neural network.
:param emb: The output of the embedding layer
:type emb: paddle.layer
:param prefix: A prefix will be added to the layers' names.
:type prefix: str
"""
@emb: paddle.layer
output of the embedding layer
@prefix: str
prefix of layers' names, used to share parameters between more than one `fc` parts.
'''
_input_layer = paddle.layer.pooling( _input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max()) input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc( fc = paddle.layer.fc(
...@@ -202,13 +203,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling ...@@ -202,13 +203,10 @@ In the construction of FC, we use `paddle.layer.pooling` for the maximum pooling
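As a quick, framework-independent illustration of what that max pooling computes: every dimension of the sentence vector is the maximum of that dimension across all word vectors.

```python
# Max pooling over a sequence of word vectors, with toy numbers.
word_vectors = [[0.1, 0.5], [0.7, 0.2], [0.3, 0.9]]           # 3 words, dimension 2
sentence_vector = [max(dims) for dims in zip(*word_vectors)]  # -> [0.7, 0.9]
```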
```python ```python
def create_dnn(self, sent_vec, prefix): def create_dnn(self, sent_vec, prefix):
# if more than three layers exists, a fc layer will be added.
if len(self.dnn_dims) > 1: if len(self.dnn_dims) > 1:
_input_layer = sent_vec _input_layer = sent_vec
for id, dim in enumerate(self.dnn_dims[1:]): for id, dim in enumerate(self.dnn_dims[1:]):
name = "%s_fc_%d_%d" % (prefix, id, dim) name = "%s_fc_%d_%d" % (prefix, id, dim)
logger.info("create fc layer [%s] which dimention is %d" %
(name, dim))
fc = paddle.layer.fc( fc = paddle.layer.fc(
input=_input_layer, input=_input_layer,
size=dim, size=dim,
...@@ -222,117 +220,12 @@ def create_dnn(self, sent_vec, prefix): ...@@ -222,117 +220,12 @@ def create_dnn(self, sent_vec, prefix):
### Classification / Regression ### Classification / Regression
The structure of classification and regression is similar. Below function can be used for both tasks. The structure of classification and regression is similar. The function below can be used for both tasks.
Please check the function `_build_classification_or_regression_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the detailed implementation.
```python
def _build_classification_or_regression_model(self, is_classification):
'''
Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs:
- source sentence
- target sentence
- classification label
'''
# prepare inputs.
assert self.class_num
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data(
name='target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input',
type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_input)
prefixs = '_ _'.split(
) if self.share_semantic_generator else 'source target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target'.split()
word_vecs = []
for id, input in enumerate([source, target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
if is_classification:
concated_vector = paddle.layer.concat(semantics)
prediction = paddle.layer.fc(
input=concated_vector,
size=self.class_num,
act=paddle.activation.Softmax())
cost = paddle.layer.classification_cost(
input=prediction, label=label)
else:
prediction = paddle.layer.cos_sim(*semantics)
cost = paddle.layer.square_error_cost(prediction, label)
if not self.is_infer:
return cost, prediction, label
return prediction
```
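Note how the label input changes with the task: an integer class id for classification and a single real value for regression. Below is a minimal sketch of one training instance for each case (all word ids and values are illustrative only):

```python
# Classification: (source word ids, target word ids, integer class label)
classification_instance = [[3, 7, 12], [5, 9], 1]
# Regression: (source word ids, target word ids, real-valued score)
regression_instance = [[3, 7, 12], [5, 9], 0.73]
```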
### Pairwise Rank ### Pairwise Rank
Please check the function `_build_rank_model` in [network_conf.py](https://github.com/PaddlePaddle/models/blob/develop/dssm/network_conf.py) for the implementation.
```python
def _build_rank_model(self):
'''
Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs:
- source sentence
- left_target sentence
- right_target sentence
- label, 1 if left_target should be sorted in front of right_target, otherwise 0.
'''
source = paddle.layer.data(
name='source_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data(
name='left_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data(
name='right_target_input',
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split(
) if self.share_semantic_generator else 'source target target'.split()
embed_prefixs = '_ _'.split(
) if self.share_embed else 'source target target'.split()
word_vecs = []
for id, input in enumerate([source, left_target, right_target]):
x = self.create_embedding(input, prefix=embed_prefixs[id])
word_vecs.append(x)
semantics = []
for id, input in enumerate(word_vecs):
x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x)
# cossim score of source and left_target
left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target
right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
# rank cost
cost = paddle.layer.rank_cost(left_score, right_score, label=label)
# prediction = left_score - right_score
# but this operator is not supported currently.
# so AUC will not used.
return cost, None, None
```
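A pairwise-rank training instance therefore carries four fields, one per data layer above. A minimal sketch (word ids are illustrative only):

```python
# One pairwise-rank instance: source, left_target, right_target, label.
source = [3, 7, 12]         # source sentence word ids
left_target = [5, 9]        # candidate expected to rank higher
right_target = [6, 4, 8]    # candidate expected to rank lower
label = 1                   # 1 if left_target should be ranked before right_target
rank_instance = [source, left_target, right_target, label]
```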
## Data Format ## Data Format
Below is a simple example for the data in `./data` Below is a simple example for the data in `./data`
...@@ -389,67 +282,7 @@ The example of this format is as follows. ...@@ -389,67 +282,7 @@ The example of this format is as follows.
## Training ## Training
We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. We use `python train.py -y 0 --model_arch 0` with the data in `./data/classification` to train a DSSM model for classification. The parameters of the script `train.py` can be listed by running `python train.py --help`. Some important parameters are:
```
usage: train.py [-h] [-i TRAIN_DATA_PATH] [-t TEST_DATA_PATH]
[-s SOURCE_DIC_PATH] [--target_dic_path TARGET_DIC_PATH]
[-b BATCH_SIZE] [-p NUM_PASSES] -y MODEL_TYPE -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[--num_workers NUM_WORKERS] [--use_gpu USE_GPU] [-c CLASS_NUM]
[--model_output_prefix MODEL_OUTPUT_PREFIX]
[-g NUM_BATCHES_TO_LOG] [-e NUM_BATCHES_TO_TEST]
[-z NUM_BATCHES_TO_SAVE_MODEL]
PaddlePaddle DSSM example
optional arguments:
-h, --help show this help message and exit
-i TRAIN_DATA_PATH, --train_data_path TRAIN_DATA_PATH
path of training dataset
-t TEST_DATA_PATH, --test_data_path TEST_DATA_PATH
path of testing dataset
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-b BATCH_SIZE, --batch_size BATCH_SIZE
size of mini-batch (default:32)
-p NUM_PASSES, --num_passes NUM_PASSES
number of passes to run(default:10)
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
--num_workers NUM_WORKERS
num worker threads, default 1
--use_gpu USE_GPU whether to use GPU devices (default: False)
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
--model_output_prefix MODEL_OUTPUT_PREFIX
prefix of the path for model to store, (default: ./)
-g NUM_BATCHES_TO_LOG, --num_batches_to_log NUM_BATCHES_TO_LOG
number of batches to output train log, (default: 100)
-e NUM_BATCHES_TO_TEST, --num_batches_to_test NUM_BATCHES_TO_TEST
number of batches to test, (default: 200)
-z NUM_BATCHES_TO_SAVE_MODEL, --num_batches_to_save_model NUM_BATCHES_TO_SAVE_MODEL
number of batches to output model, (default: 400)
```
Parameter description:
- `train_data_path` Training data path - `train_data_path` Training data path
- `test_data_path` Test data path, optional - `test_data_path` Test data path, optional
...@@ -460,48 +293,8 @@ Parameter description: ...@@ -460,48 +293,8 @@ Parameter description:
- `dnn_dims` The dimension of each layer of the model is set, the default is `256,128,64,32`,with 4 layers. - `dnn_dims` The dimensions of the model's layers; the default is `256,128,64,32`, i.e. a 4-layer model.
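During training, each field of a data instance is bound to a model input by position through a feeding dict. The sketch below mirrors the mapping defined in `train.py` for the classification/regression task; the rank task instead uses the fields source, left target, right target and label (positions 0 to 3).

```python
# Position of each instance field for the classification / regression task.
feeding = {"source_input": 0, "target_input": 1, "label_input": 2}
```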
## To predict using the trained model ## To predict using the trained model
```
usage: infer.py [-h] --model_path MODEL_PATH -i DATA_PATH -o
PREDICTION_OUTPUT_PATH -y MODEL_TYPE [-s SOURCE_DIC_PATH]
[--target_dic_path TARGET_DIC_PATH] -a MODEL_ARCH
[--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET]
[--share_embed SHARE_EMBED] [--dnn_dims DNN_DIMS]
[-c CLASS_NUM]
PaddlePaddle DSSM infer
optional arguments:
-h, --help show this help message and exit
--model_path MODEL_PATH
path of model parameters file
-i DATA_PATH, --data_path DATA_PATH
path of the dataset to infer
-o PREDICTION_OUTPUT_PATH, --prediction_output_path PREDICTION_OUTPUT_PATH
path to output the prediction
-y MODEL_TYPE, --model_type MODEL_TYPE
model type, 0 for classification, 1 for pairwise rank,
2 for regression (default: classification)
-s SOURCE_DIC_PATH, --source_dic_path SOURCE_DIC_PATH
path of the source's word dic
--target_dic_path TARGET_DIC_PATH
path of the target's word dic, if not set, the
`source_dic_path` will be used
-a MODEL_ARCH, --model_arch MODEL_ARCH
model architecture, 1 for CNN, 0 for FC, 2 for RNN
--share_network_between_source_target SHARE_NETWORK_BETWEEN_SOURCE_TARGET
whether to share network parameters between source and
target
--share_embed SHARE_EMBED
whether to share word embedding between source and
target
--dnn_dims DNN_DIMS dimentions of dnn layers, default is '256,128,64,32',
which means create a 4-layer dnn, demention of each
layer is 256, 128, 64 and 32
-c CLASS_NUM, --class_num CLASS_NUM
number of categories for classification task.
```
Important parameters are The parameters of the script `infer.py` can be listed by running `python infer.py --help`. Some important parameters are:
- `data_path` Path for the data to predict - `data_path` Path for the data to predict
- `prediction_output_path` Prediction output path - `prediction_output_path` Prediction output path
......
...@@ -9,83 +9,81 @@ from utils import logger, ModelType, ModelArch, load_dic ...@@ -9,83 +9,81 @@ from utils import logger, ModelType, ModelArch, load_dic
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer") parser = argparse.ArgumentParser(description="PaddlePaddle DSSM infer")
parser.add_argument( parser.add_argument(
'--model_path', "--model_path", type=str, required=True, help="The path of trained model.")
type=str,
required=True,
help="path of model parameters file")
parser.add_argument( parser.add_argument(
'-i', "-i",
'--data_path', "--data_path",
type=str, type=str,
required=True, required=True,
help="path of the dataset to infer") help="The path of the data for inferring.")
parser.add_argument( parser.add_argument(
'-o', "-o",
'--prediction_output_path', "--prediction_output_path",
type=str, type=str,
required=True, required=True,
help="path to output the prediction") help="The path to save the predictions.")
parser.add_argument( parser.add_argument(
'-y', "-y",
'--model_type', "--model_type",
type=int, type=int,
required=True, required=True,
default=ModelType.CLASSIFICATION_MODE, default=ModelType.CLASSIFICATION_MODE,
help=("model type, %d for classification, %d for pairwise rank, " help=("The model type: %d for classification, %d for pairwise rank, "
"%d for regression (default: classification)") % "%d for regression (default: classification).") %
(ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE, (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE)) ModelType.REGRESSION_MODE))
parser.add_argument( parser.add_argument(
'-s', "-s",
'--source_dic_path', "--source_dic_path",
type=str, type=str,
required=False, required=False,
help="path of the source's word dic") help="The path of the source's word dictionary.")
parser.add_argument( parser.add_argument(
'--target_dic_path', "--target_dic_path",
type=str, type=str,
required=False, required=False,
help=("path of the target's word dictionary, " help=("The path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used")) "if this parameter is not set, the `source_dic_path` will be used."))
parser.add_argument( parser.add_argument(
'-a', "-a",
'--model_arch', "--model_arch",
type=int, type=int,
required=True, required=True,
default=ModelArch.CNN_MODE, default=ModelArch.CNN_MODE,
help="model architecture, %d for CNN, %d for FC, %d for RNN" % help="model architecture, %d for CNN, %d for FC, %d for RNN" %
(ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE)) (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
parser.add_argument( parser.add_argument(
'--share_network_between_source_target', "--share_network_between_source_target",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share network parameters between source and target") help="whether to share network parameters between source and target")
parser.add_argument( parser.add_argument(
'--share_embed', "--share_embed",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share word embedding between source and target") help="whether to share word embedding between source and target")
parser.add_argument( parser.add_argument(
'--dnn_dims', "--dnn_dims",
type=str, type=str,
default='256,128,64,32', default="256,128,64,32",
    help=("dimentions of dnn layers, default is '256,128,64,32', " help=("The dimensions of dnn layers, default is `256,128,64,32`, "
          "which means create a 4-layer dnn, " "which means a dnn with 4 layers with "
          "demention of each layer is 256, 128, 64 and 32")) "dimensions 256, 128, 64 and 32 will be created."))
parser.add_argument( parser.add_argument(
'-c', "-c",
'--class_num', "--class_num",
type=int, type=int,
default=0, default=0,
help="number of categories for classification task.") help="The number of categories for classification task.")
args = parser.parse_args() args = parser.parse_args()
args.model_type = ModelType(args.model_type) args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch) args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification(): if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task." assert args.class_num > 1, ("The parameter class_num should be set "
"in classification task.")
layer_dims = map(int, args.dnn_dims.split(',')) layer_dims = map(int, args.dnn_dims.split(","))
args.target_dic_path = args.source_dic_path if not args.target_dic_path \ args.target_dic_path = args.source_dic_path if not args.target_dic_path \
else args.target_dic_path else args.target_dic_path
...@@ -94,8 +92,6 @@ paddle.init(use_gpu=False, trainer_count=1) ...@@ -94,8 +92,6 @@ paddle.init(use_gpu=False, trainer_count=1)
class Inferer(object): class Inferer(object):
def __init__(self, param_path): def __init__(self, param_path):
logger.info("create DSSM model")
prediction = DSSM( prediction = DSSM(
dnn_dims=layer_dims, dnn_dims=layer_dims,
vocab_sizes=[ vocab_sizes=[
...@@ -110,14 +106,13 @@ class Inferer(object): ...@@ -110,14 +106,13 @@ class Inferer(object):
is_infer=True)() is_infer=True)()
# load parameter # load parameter
logger.info("load model parameters from %s" % param_path) logger.info("Load the trained model from %s." % param_path)
self.parameters = paddle.parameters.Parameters.from_tar( self.parameters = paddle.parameters.Parameters.from_tar(
open(param_path, 'r')) open(param_path, "r"))
self.inferer = paddle.inference.Inference( self.inferer = paddle.inference.Inference(
output_layer=prediction, parameters=self.parameters) output_layer=prediction, parameters=self.parameters)
def infer(self, data_path): def infer(self, data_path):
logger.info("infer data...")
dataset = reader.Dataset( dataset = reader.Dataset(
train_path=data_path, train_path=data_path,
test_path=None, test_path=None,
...@@ -125,19 +120,20 @@ class Inferer(object): ...@@ -125,19 +120,20 @@ class Inferer(object):
target_dic_path=args.target_dic_path, target_dic_path=args.target_dic_path,
model_type=args.model_type, ) model_type=args.model_type, )
infer_reader = paddle.batch(dataset.infer, batch_size=1000) infer_reader = paddle.batch(dataset.infer, batch_size=1000)
logger.warning('write predictions to %s' % args.prediction_output_path) logger.warning("Write predictions to %s." % args.prediction_output_path)
output_f = open(args.prediction_output_path, 'w') output_f = open(args.prediction_output_path, "w")
for id, batch in enumerate(infer_reader()): for id, batch in enumerate(infer_reader()):
res = self.inferer.infer(input=batch) res = self.inferer.infer(input=batch)
predictions = [' '.join(map(str, x)) for x in res] predictions = [" ".join(map(str, x)) for x in res]
assert len(batch) == len(predictions), ( assert len(batch) == len(predictions), (
"predict error, %d inputs, " "Error! %d inputs are given, "
"but %d predictions") % (len(batch), len(predictions)) "but only %d predictions are returned.") % (len(batch),
output_f.write('\n'.join(map(str, predictions)) + '\n') len(predictions))
output_f.write("\n".join(map(str, predictions)) + "\n")
if __name__ == '__main__': if __name__ == "__main__":
inferer = Inferer(args.model_path) inferer = Inferer(args.model_path)
inferer.infer(args.data_path) inferer.infer(args.data_path)
...@@ -13,26 +13,33 @@ class DSSM(object): ...@@ -13,26 +13,33 @@ class DSSM(object):
class_num=None, class_num=None,
share_embed=False, share_embed=False,
is_infer=False): is_infer=False):
''' """
        @dnn_dims: list of int :param dnn_dims: The dimension of each layer in the semantic vector
dimentions of each layer in semantic vector generator. generator.
@vocab_sizes: 2-d tuple :type dnn_dims: list of int
size of both left and right items. :param vocab_sizes: The size of left and right items.
@model_type: int :type vocab_sizes: A list having 2 elements.
type of task, should be 'rank: 0', 'regression: 1' or 'classification: 2' :param model_type: The type of task to train the DSSM model. The value
        @model_arch: int should be "classification: 0", "rank: 1" or
        model architecture "regression: 2".
@share_semantic_generator: bool :type model_type: int
whether to share the semantic vector generator for both left and right. :param model_arch: A value indicating the model architecture to use.
@share_embed: bool :type model_arch: int
whether to share the embeddings between left and right. :param share_semantic_generator: A flag indicating whether to share the
@class_num: int semantic vector between the left and
number of categories. the right item.
''' :type share_semantic_generator: bool
        ''' :param share_embed: A flag indicating whether to share the embeddings
between the left and the right item.
:type share_embed: bool
:param class_num: The number of categories.
:type class_num: int
"""
        assert len(vocab_sizes) == 2, ( assert len(vocab_sizes) == 2, (
            "vocab_sizes specify the sizes left and right inputs, " "The vocab_sizes specifies the sizes of the left and right inputs. "
"and dim should be 2.") "Its dimension should be 2.")
        assert len(dnn_dims) > 1, "more than two layers is needed." assert len(dnn_dims) > 1, ("In the DNN model, at least two layers "
"are needed.")
self.dnn_dims = dnn_dims self.dnn_dims = dnn_dims
self.vocab_sizes = vocab_sizes self.vocab_sizes = vocab_sizes
...@@ -42,91 +49,89 @@ class DSSM(object): ...@@ -42,91 +49,89 @@ class DSSM(object):
self.model_arch = ModelArch(model_arch) self.model_arch = ModelArch(model_arch)
self.class_num = class_num self.class_num = class_num
self.is_infer = is_infer self.is_infer = is_infer
logger.warning("build DSSM model with config of %s, %s" % logger.warning("Build DSSM model with config of %s, %s" %
(self.model_type, self.model_arch)) (self.model_type, self.model_arch))
        logger.info("vocabulary sizes: %s" % str(self.vocab_sizes)) logger.info("The vocabulary sizes are: %s" % str(self.vocab_sizes))
# bind model architecture # bind model architecture
_model_arch = { _model_arch = {
'cnn': self.create_cnn, "cnn": self.create_cnn,
'fc': self.create_fc, "fc": self.create_fc,
'rnn': self.create_rnn, "rnn": self.create_rnn,
} }
def _model_arch_creater(emb, prefix=''): def _model_arch_creater(emb, prefix=""):
sent_vec = _model_arch.get(str(model_arch))(emb, prefix) sent_vec = _model_arch.get(str(model_arch))(emb, prefix)
dnn = self.create_dnn(sent_vec, prefix) dnn = self.create_dnn(sent_vec, prefix)
return dnn return dnn
self.model_arch_creater = _model_arch_creater self.model_arch_creater = _model_arch_creater
# build model type
_model_type = { _model_type = {
'classification': self._build_classification_model, "classification": self._build_classification_model,
'rank': self._build_rank_model, "rank": self._build_rank_model,
'regression': self._build_regression_model, "regression": self._build_regression_model,
} }
print 'model type: ', str(self.model_type) print("model type: ", str(self.model_type))
self.model_type_creater = _model_type[str(self.model_type)] self.model_type_creater = _model_type[str(self.model_type)]
def __call__(self): def __call__(self):
return self.model_type_creater() return self.model_type_creater()
def create_embedding(self, input, prefix=''): def create_embedding(self, input, prefix=""):
''' """
Create an embedding table whose name has a `prefix`. Create word embedding. The `prefix` is added in front of the name of
        ''' embedding's learnable parameter.
logger.info("create embedding table [%s] which dimention is %d" % """
        logger.info("Create embedding table [%s] whose dimension is %d. " %
(prefix, self.dnn_dims[0])) (prefix, self.dnn_dims[0]))
emb = paddle.layer.embedding( emb = paddle.layer.embedding(
input=input, input=input,
size=self.dnn_dims[0], size=self.dnn_dims[0],
param_attr=ParamAttr(name='%s_emb.w' % prefix)) param_attr=ParamAttr(name="%s_emb.w" % prefix))
return emb return emb
def create_fc(self, emb, prefix=''): def create_fc(self, emb, prefix=""):
''' """
        A multi-layer fully connected neural networks. A multi-layer fully connected neural network.
@emb: paddle.layer :param emb: The output of the embedding layer
output of the embedding layer :type emb: paddle.layer
@prefix: str :param prefix: A prefix will be added to the layers' names.
prefix of layers' names, used to share parameters between :type prefix: str
more than one `fc` parts. """
'''
_input_layer = paddle.layer.pooling( _input_layer = paddle.layer.pooling(
input=emb, pooling_type=paddle.pooling.Max()) input=emb, pooling_type=paddle.pooling.Max())
fc = paddle.layer.fc( fc = paddle.layer.fc(
input=_input_layer, input=_input_layer,
size=self.dnn_dims[1], size=self.dnn_dims[1],
param_attr=ParamAttr(name='%s_fc.w' % prefix), param_attr=ParamAttr(name="%s_fc.w" % prefix),
bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.)) bias_attr=ParamAttr(name="%s_fc.b" % prefix, initial_std=0.))
return fc return fc
def create_rnn(self, emb, prefix=''): def create_rnn(self, emb, prefix=""):
''' """
A GRU sentence vector learner. A GRU sentence vector learner.
''' """
gru = paddle.networks.simple_gru( gru = paddle.networks.simple_gru(
input=emb, input=emb,
size=self.dnn_dims[1], size=self.dnn_dims[1],
mixed_param_attr=ParamAttr(name='%s_gru_mixed.w' % prefix), mixed_param_attr=ParamAttr(name="%s_gru_mixed.w" % prefix),
mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix), mixed_bias_param_attr=ParamAttr(name="%s_gru_mixed.b" % prefix),
gru_param_attr=ParamAttr(name='%s_gru.w' % prefix), gru_param_attr=ParamAttr(name="%s_gru.w" % prefix),
gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix)) gru_bias_attr=ParamAttr(name="%s_gru.b" % prefix))
sent_vec = paddle.layer.last_seq(gru) sent_vec = paddle.layer.last_seq(gru)
return sent_vec return sent_vec
def create_cnn(self, emb, prefix=''): def create_cnn(self, emb, prefix=""):
''' """
A multi-layer CNN. A multi-layer CNN.
@emb: paddle.layer :param emb: The word embedding.
output of the embedding layer :type emb: paddle.layer
@prefix: str :param prefix: The prefix will be added to of layers' names.
prefix of layers' names, used to share parameters between :type prefix: str
more than one `cnn` parts. """
'''
def create_conv(context_len, hidden_size, prefix): def create_conv(context_len, hidden_size, prefix):
key = "%s_%d_%d" % (prefix, context_len, hidden_size) key = "%s_%d_%d" % (prefix, context_len, hidden_size)
...@@ -135,15 +140,15 @@ class DSSM(object): ...@@ -135,15 +140,15 @@ class DSSM(object):
context_len=context_len, context_len=context_len,
hidden_size=hidden_size, hidden_size=hidden_size,
# set parameter attr for parameter sharing # set parameter attr for parameter sharing
context_proj_param_attr=ParamAttr(name=key + 'contex_proj.w'), context_proj_param_attr=ParamAttr(name=key + "contex_proj.w"),
fc_param_attr=ParamAttr(name=key + '_fc.w'), fc_param_attr=ParamAttr(name=key + "_fc.w"),
fc_bias_attr=ParamAttr(name=key + '_fc.b'), fc_bias_attr=ParamAttr(name=key + "_fc.b"),
pool_bias_attr=ParamAttr(name=key + '_pool.b')) pool_bias_attr=ParamAttr(name=key + "_pool.b"))
return conv return conv
logger.info('create a sequence_conv_pool which context width is 3') logger.info("create a sequence_conv_pool which context width is 3")
conv_3 = create_conv(3, self.dnn_dims[1], "cnn") conv_3 = create_conv(3, self.dnn_dims[1], "cnn")
logger.info('create a sequence_conv_pool which context width is 4') logger.info("create a sequence_conv_pool which context width is 4")
conv_4 = create_conv(4, self.dnn_dims[1], "cnn") conv_4 = create_conv(4, self.dnn_dims[1], "cnn")
return conv_3, conv_4 return conv_3, conv_4
...@@ -160,8 +165,8 @@ class DSSM(object): ...@@ -160,8 +165,8 @@ class DSSM(object):
input=_input_layer, input=_input_layer,
size=dim, size=dim,
act=paddle.activation.Tanh(), act=paddle.activation.Tanh(),
param_attr=ParamAttr(name='%s.w' % name), param_attr=ParamAttr(name="%s.w" % name),
bias_attr=ParamAttr(name='%s.b' % name, initial_std=0.)) bias_attr=ParamAttr(name="%s.b" % name, initial_std=0.))
_input_layer = fc _input_layer = fc
return _input_layer return _input_layer
...@@ -178,7 +183,7 @@ class DSSM(object): ...@@ -178,7 +183,7 @@ class DSSM(object):
is_classification=False) is_classification=False)
def _build_rank_model(self): def _build_rank_model(self):
''' """
Build a pairwise rank model, and the cost is returned. Build a pairwise rank model, and the cost is returned.
A pairwise rank model has 3 inputs: A pairwise rank model has 3 inputs:
...@@ -187,26 +192,26 @@ class DSSM(object): ...@@ -187,26 +192,26 @@ class DSSM(object):
- right_target sentence - right_target sentence
- label, 1 if left_target should be sorted in front of - label, 1 if left_target should be sorted in front of
right_target, otherwise 0. right_target, otherwise 0.
''' """
logger.info("build rank model") logger.info("build rank model")
assert self.model_type.is_rank() assert self.model_type.is_rank()
source = paddle.layer.data( source = paddle.layer.data(
name='source_input', name="source_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
left_target = paddle.layer.data( left_target = paddle.layer.data(
name='left_target_input', name="left_target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
right_target = paddle.layer.data( right_target = paddle.layer.data(
name='right_target_input', name="right_target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
if not self.is_infer: if not self.is_infer:
label = paddle.layer.data( label = paddle.layer.data(
name='label_input', type=paddle.data_type.integer_value(1)) name="label_input", type=paddle.data_type.integer_value(1))
prefixs = '_ _ _'.split( prefixs = "_ _ _".split(
) if self.share_semantic_generator else 'source target target'.split() ) if self.share_semantic_generator else "source target target".split()
        embed_prefixs = '_ _'.split( embed_prefixs = "_ _ _".split(
) if self.share_embed else 'source target target'.split() ) if self.share_embed else "source target target".split()
word_vecs = [] word_vecs = []
for id, input in enumerate([source, left_target, right_target]): for id, input in enumerate([source, left_target, right_target]):
...@@ -218,9 +223,9 @@ class DSSM(object): ...@@ -218,9 +223,9 @@ class DSSM(object):
x = self.model_arch_creater(input, prefix=prefixs[id]) x = self.model_arch_creater(input, prefix=prefixs[id])
semantics.append(x) semantics.append(x)
# cossim score of source and left_target # The cosine similarity score of source and left_target.
left_score = paddle.layer.cos_sim(semantics[0], semantics[1]) left_score = paddle.layer.cos_sim(semantics[0], semantics[1])
# cossim score of source and right target # The cosine similarity score of source and right target.
right_score = paddle.layer.cos_sim(semantics[0], semantics[2]) right_score = paddle.layer.cos_sim(semantics[0], semantics[2])
if not self.is_infer: if not self.is_infer:
...@@ -233,34 +238,33 @@ class DSSM(object): ...@@ -233,34 +238,33 @@ class DSSM(object):
return right_score return right_score
def _build_classification_or_regression_model(self, is_classification): def _build_classification_or_regression_model(self, is_classification):
''' """
Build a classification/regression model, and the cost is returned. Build a classification/regression model, and the cost is returned.
A Classification has 3 inputs: The classification/regression task expects 3 inputs:
- source sentence - source sentence
- target sentence - target sentence
- classification label - classification label
''' """
if is_classification: if is_classification:
# prepare inputs.
assert self.class_num assert self.class_num
source = paddle.layer.data( source = paddle.layer.data(
name='source_input', name="source_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[0]))
target = paddle.layer.data( target = paddle.layer.data(
name='target_input', name="target_input",
type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1])) type=paddle.data_type.integer_value_sequence(self.vocab_sizes[1]))
label = paddle.layer.data( label = paddle.layer.data(
name='label_input', name="label_input",
type=paddle.data_type.integer_value(self.class_num) type=paddle.data_type.integer_value(self.class_num)
if is_classification else paddle.data_type.dense_vector(1)) if is_classification else paddle.data_type.dense_vector(1))
prefixs = '_ _'.split( prefixs = "_ _".split(
) if self.share_semantic_generator else 'source target'.split() ) if self.share_semantic_generator else "source target".split()
embed_prefixs = '_ _'.split( embed_prefixs = "_ _".split(
) if self.share_embed else 'source target'.split() ) if self.share_embed else "source target".split()
word_vecs = [] word_vecs = []
for id, input in enumerate([source, target]): for id, input in enumerate([source, target]):
......
...@@ -9,120 +9,129 @@ from utils import TaskType, load_dic, logger, ModelType, ModelArch, display_args ...@@ -9,120 +9,129 @@ from utils import TaskType, load_dic, logger, ModelType, ModelArch, display_args
parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example") parser = argparse.ArgumentParser(description="PaddlePaddle DSSM example")
parser.add_argument( parser.add_argument(
'-i', "-i",
'--train_data_path', "--train_data_path",
type=str, type=str,
required=False, required=False,
help="path of training dataset") help="The path of training data.")
parser.add_argument( parser.add_argument(
'-t', "-t",
'--test_data_path', "--test_data_path",
type=str, type=str,
required=False, required=False,
help="path of testing dataset") help="The path of testing data.")
parser.add_argument( parser.add_argument(
'-s', "-s",
'--source_dic_path', "--source_dic_path",
type=str, type=str,
required=False, required=False,
help="path of the source's word dic") help="The path of the source's word dictionary.")
parser.add_argument( parser.add_argument(
'--target_dic_path', "--target_dic_path",
type=str, type=str,
required=False, required=False,
help=("path of the target's word dictionary, " help=("The path of the target's word dictionary, "
"if not set, the `source_dic_path` will be used")) "if this parameter is not set, the `source_dic_path` will be used"))
parser.add_argument( parser.add_argument(
'-b', "-b",
'--batch_size', "--batch_size",
type=int, type=int,
default=32, default=32,
help="size of mini-batch (default:32)") help="The size of mini-batch (default:32).")
parser.add_argument( parser.add_argument(
'-p', "-p",
'--num_passes', "--num_passes",
type=int, type=int,
default=10, default=10,
help="number of passes to run(default:10)") help="The number of passes to run(default:10).")
parser.add_argument( parser.add_argument(
'-y', "-y",
'--model_type', "--model_type",
type=int, type=int,
required=True, required=True,
default=ModelType.CLASSIFICATION_MODE, default=ModelType.CLASSIFICATION_MODE,
help="model type, %d for classification, %d for pairwise rank, %d for regression (default: classification)" help=("model type, %d for classification, %d for pairwise rank, "
% (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE, "%d for regression (default: classification).") %
ModelType.REGRESSION_MODE)) (ModelType.CLASSIFICATION_MODE, ModelType.RANK_MODE,
ModelType.REGRESSION_MODE))
parser.add_argument( parser.add_argument(
'-a', "-a",
'--model_arch', "--model_arch",
type=int, type=int,
required=True, required=True,
default=ModelArch.CNN_MODE, default=ModelArch.CNN_MODE,
help="model architecture, %d for CNN, %d for FC, %d for RNN" % help="The model architecture, %d for CNN, %d for FC, %d for RNN." %
(ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE)) (ModelArch.CNN_MODE, ModelArch.FC_MODE, ModelArch.RNN_MODE))
parser.add_argument( parser.add_argument(
'--share_network_between_source_target', "--share_network_between_source_target",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share network parameters between source and target") help="Whether to share network parameters between source and target.")
parser.add_argument( parser.add_argument(
'--share_embed', "--share_embed",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to share word embedding between source and target") help="Whether to share word embedding between source and target.")
parser.add_argument( parser.add_argument(
'--dnn_dims', "--dnn_dims",
type=str, type=str,
default='256,128,64,32', default="256,128,64,32",
    help="dimentions of dnn layers, default is '256,128,64,32', which means create a 4-layer dnn, demention of each layer is 256, 128, 64 and 32" help=("The dimensions of dnn layers, default is '256,128,64,32', "
    ) "which means create a 4-layer dnn. The dimension of each layer is "
"'256, 128, 64 and 32'."))
parser.add_argument( parser.add_argument(
'--num_workers', type=int, default=1, help="num worker threads, default 1") "--num_workers",
type=int,
default=1,
help="The number of worker threads, default 1.")
parser.add_argument( parser.add_argument(
'--use_gpu', "--use_gpu",
type=distutils.util.strtobool, type=distutils.util.strtobool,
default=False, default=False,
help="whether to use GPU devices (default: False)") help="Whether to use GPU devices (default: False)")
parser.add_argument( parser.add_argument(
'-c', "-c",
'--class_num', "--class_num",
type=int, type=int,
default=0, default=0,
help="number of categories for classification task.") help="The number of categories for classification task.")
parser.add_argument( parser.add_argument(
'--model_output_prefix', "--model_output_prefix",
type=str, type=str,
default="./", default="./",
help="prefix of the path for model to store, (default: ./)") help="The prefix of the path to store the trained models (default: ./).")
parser.add_argument( parser.add_argument(
'-g', "-g",
'--num_batches_to_log', "--num_batches_to_log",
type=int, type=int,
default=100, default=100,
    help="number of batches to output train log, (default: 100)") help=("The log period. Every num_batches_to_log batches, "
"a training log will be printed. (default: 100)"))
parser.add_argument( parser.add_argument(
'-e', "-e",
'--num_batches_to_test', "--num_batches_to_test",
type=int, type=int,
default=200, default=200,
    help="number of batches to test, (default: 200)") help=("The test period. Every num_batches_to_test batches, "
                                                       "the specified test samples will be tested (default: 200)."))
parser.add_argument( parser.add_argument(
'-z', "-z",
'--num_batches_to_save_model', "--num_batches_to_save_model",
type=int, type=int,
default=400, default=400,
help="number of batches to output model, (default: 400)") help=("Every num_batches_to_save_model batches, "
"a trained model will be saved (default: 400)."))
# arguments check.
args = parser.parse_args() args = parser.parse_args()
args.model_type = ModelType(args.model_type) args.model_type = ModelType(args.model_type)
args.model_arch = ModelArch(args.model_arch) args.model_arch = ModelArch(args.model_arch)
if args.model_type.is_classification(): if args.model_type.is_classification():
assert args.class_num > 1, "--class_num should be set in classification task." assert args.class_num > 1, ("The parameter class_num should be set in "
"classification task.")
layer_dims = [int(i) for i in args.dnn_dims.split(',')] layer_dims = [int(i) for i in args.dnn_dims.split(",")]
args.target_dic_path = args.source_dic_path if not args.target_dic_path else args.target_dic_path args.target_dic_path = args.source_dic_path if not \
args.target_dic_path else args.target_dic_path
def train(train_data_path=None, def train(train_data_path=None,
...@@ -138,15 +147,15 @@ def train(train_data_path=None, ...@@ -138,15 +147,15 @@ def train(train_data_path=None,
class_num=None, class_num=None,
num_workers=1, num_workers=1,
use_gpu=False): use_gpu=False):
''' """
Train the DSSM. Train the DSSM.
''' """
default_train_path = './data/rank/train.txt' default_train_path = "./data/rank/train.txt"
default_test_path = './data/rank/test.txt' default_test_path = "./data/rank/test.txt"
default_dic_path = './data/vocab.txt' default_dic_path = "./data/vocab.txt"
if not model_type.is_rank(): if not model_type.is_rank():
default_train_path = './data/classification/train.txt' default_train_path = "./data/classification/train.txt"
default_test_path = './data/classification/test.txt' default_test_path = "./data/classification/test.txt"
use_default_data = not train_data_path use_default_data = not train_data_path
...@@ -200,19 +209,19 @@ def train(train_data_path=None, ...@@ -200,19 +209,19 @@ def train(train_data_path=None,
feeding = {} feeding = {}
if model_type.is_classification() or model_type.is_regression(): if model_type.is_classification() or model_type.is_regression():
feeding = {'source_input': 0, 'target_input': 1, 'label_input': 2} feeding = {"source_input": 0, "target_input": 1, "label_input": 2}
else: else:
feeding = { feeding = {
'source_input': 0, "source_input": 0,
'left_target_input': 1, "left_target_input": 1,
'right_target_input': 2, "right_target_input": 2,
'label_input': 3 "label_input": 3
} }
def _event_handler(event): def _event_handler(event):
''' """
Define batch handler Define batch handler
''' """
if isinstance(event, paddle.event.EndIteration): if isinstance(event, paddle.event.EndIteration):
# output train log # output train log
if event.batch_id % args.num_batches_to_log == 0: if event.batch_id % args.num_batches_to_log == 0:
...@@ -249,7 +258,7 @@ def train(train_data_path=None, ...@@ -249,7 +258,7 @@ def train(train_data_path=None,
logger.info("Training has finished.") logger.info("Training has finished.")
if __name__ == '__main__': if __name__ == "__main__":
display_args(args) display_args(args)
train( train(
train_data_path=args.train_data_path, train_data_path=args.train_data_path,
......
...@@ -8,7 +8,7 @@ logger.setLevel(logging.INFO) ...@@ -8,7 +8,7 @@ logger.setLevel(logging.INFO)
def mode_attr_name(mode): def mode_attr_name(mode):
return mode.upper() + '_MODE' return mode.upper() + "_MODE"
def create_attrs(cls): def create_attrs(cls):
...@@ -17,9 +17,9 @@ def create_attrs(cls): ...@@ -17,9 +17,9 @@ def create_attrs(cls):
def make_check_method(cls): def make_check_method(cls):
''' """
create methods for classes. create methods for classes.
''' """
def method(mode): def method(mode):
def _method(self): def _method(self):
...@@ -28,7 +28,7 @@ def make_check_method(cls): ...@@ -28,7 +28,7 @@ def make_check_method(cls):
return _method return _method
for id, mode in enumerate(cls.modes): for id, mode in enumerate(cls.modes):
setattr(cls, 'is_' + mode, method(mode)) setattr(cls, "is_" + mode, method(mode))
def make_create_method(cls): def make_create_method(cls):
...@@ -41,10 +41,10 @@ def make_create_method(cls): ...@@ -41,10 +41,10 @@ def make_create_method(cls):
return _method return _method
for id, mode in enumerate(cls.modes): for id, mode in enumerate(cls.modes):
setattr(cls, 'create_' + mode, method(mode)) setattr(cls, "create_" + mode, method(mode))
def make_str_method(cls, type_name='unk'): def make_str_method(cls, type_name="unk"):
def _str_(self): def _str_(self):
for mode in cls.modes: for mode in cls.modes:
if self.mode == getattr(cls, mode_attr_name(mode)): if self.mode == getattr(cls, mode_attr_name(mode)):
...@@ -53,9 +53,9 @@ def make_str_method(cls, type_name='unk'): ...@@ -53,9 +53,9 @@ def make_str_method(cls, type_name='unk'):
def _hash_(self): def _hash_(self):
return self.mode return self.mode
setattr(cls, '__str__', _str_) setattr(cls, "__str__", _str_)
setattr(cls, '__repr__', _str_) setattr(cls, "__repr__", _str_)
setattr(cls, '__hash__', _hash_) setattr(cls, "__hash__", _hash_)
cls.__name__ = type_name cls.__name__ = type_name
...@@ -65,7 +65,7 @@ def _init_(self, mode, cls): ...@@ -65,7 +65,7 @@ def _init_(self, mode, cls):
elif isinstance(mode, cls): elif isinstance(mode, cls):
self.mode = mode.mode self.mode = mode.mode
else: else:
        raise Exception("wrong mode type, get type: %s, value: %s" % raise Exception("Wrong mode type. Got type: %s, value: %s." %
(type(mode), mode)) (type(mode), mode))
...@@ -77,21 +77,21 @@ def build_mode_class(cls): ...@@ -77,21 +77,21 @@ def build_mode_class(cls):
class TaskType(object): class TaskType(object):
modes = 'train test infer'.split() modes = "train test infer".split()
def __init__(self, mode): def __init__(self, mode):
_init_(self, mode, TaskType) _init_(self, mode, TaskType)
class ModelType: class ModelType:
modes = 'classification rank regression'.split() modes = "classification rank regression".split()
def __init__(self, mode): def __init__(self, mode):
_init_(self, mode, ModelType) _init_(self, mode, ModelType)
class ModelArch: class ModelArch:
modes = 'fc cnn rnn'.split() modes = "fc cnn rnn".split()
def __init__(self, mode): def __init__(self, mode):
_init_(self, mode, ModelArch) _init_(self, mode, ModelArch)
...@@ -103,22 +103,16 @@ build_mode_class(ModelArch) ...@@ -103,22 +103,16 @@ build_mode_class(ModelArch)
def sent2ids(sent, vocab): def sent2ids(sent, vocab):
''' """
    transform a sentence to a list of ids. Transform a sentence into a list of word ids.
"""
@sent: str
a sentence.
@vocab: dict
a word dic
'''
return [vocab.get(w, UNK) for w in sent.split()] return [vocab.get(w, UNK) for w in sent.split()]
def load_dic(path): def load_dic(path):
''' """
    word dic format: The format of the word dictionary: each line is a word.
each line is a word """
'''
dic = {} dic = {}
with open(path) as f: with open(path) as f:
for id, line in enumerate(f): for id, line in enumerate(f):
...@@ -128,13 +122,6 @@ def load_dic(path): ...@@ -128,13 +122,6 @@ def load_dic(path):
def display_args(args): def display_args(args):
    logger.info("arguments passed by command line:") logger.info("The arguments passed by the command line are:")
for k, v in sorted(v for v in vars(args).items()): for k, v in sorted(v for v in vars(args).items()):
logger.info("{}:\t{}".format(k, v)) logger.info("{}:\t{}".format(k, v))
if __name__ == '__main__':
t = TaskType(1)
t = TaskType.create_train()
print t
print 'is', t.is_train()
...@@ -33,7 +33,6 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True): ...@@ -33,7 +33,6 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True):
num_classes=dict_size, num_classes=dict_size,
param_attr=paddle.attr.Param(name="nce_w"), param_attr=paddle.attr.Param(name="nce_w"),
bias_attr=paddle.attr.Param(name="nce_b"), bias_attr=paddle.attr.Param(name="nce_b"),
act=paddle.activation.Sigmoid(),
num_neg_samples=25, num_neg_samples=25,
neg_distribution=None) neg_distribution=None)
else: else:
...@@ -41,7 +40,7 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True): ...@@ -41,7 +40,7 @@ def ngram_lm(hidden_size, emb_size, dict_size, gram_num=4, is_train=True):
size=dict_size, size=dict_size,
input=paddle.layer.trans_full_matrix_projection( input=paddle.layer.trans_full_matrix_projection(
hidden_layer, param_attr=paddle.attr.Param(name="nce_w")), hidden_layer, param_attr=paddle.attr.Param(name="nce_w")),
act=paddle.activation.Sigmoid(), act=paddle.activation.Softmax(),
bias_attr=paddle.attr.Param(name="nce_b")) bias_attr=paddle.attr.Param(name="nce_b"))
......