diff --git a/cmake/util.cmake b/cmake/util.cmake
index c9b48e5f8fc3bcb2160b9f999cb36886ae479f94..50af3c38cdaef6eee35d0138a7c9d1147ca400ff 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -95,7 +95,6 @@ function(link_paddle_exe TARGET_NAME)
         paddle_parameter
         paddle_proto
         paddle_cuda
-        paddle_test_main
         ${METRIC_LIBS}
         ${EXTERNAL_LIBS}
         ${CMAKE_THREAD_LIBS_INIT}
@@ -130,8 +129,9 @@ endfunction()
 # Rest Arguemnts: not used.
 function(link_paddle_test TARGET_NAME)
     link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
-        ${GTEST_LIBRARIES})
+    target_link_libraries(${TARGET_NAME}
+                          paddle_test_main  # NOTE(review): presumably supplies the tests' main() - confirm
+                          ${GTEST_LIBRARIES})
 endfunction()
 
 # add_unittest_without_exec
diff --git a/demo/model_zoo/embedding/pre_DictAndModel.sh b/demo/model_zoo/embedding/pre_DictAndModel.sh
index f97ef2610734449c88fdfca6216b1cab57472b84..f61c65a935c76032a06613cfe0b50f1c90bc50d9 100755
--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -14,9 +14,19 @@
 # limitations under the License.
 set -e
 set -x
+BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
 
-# download the dictionary and pretrained model 
-for file in baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb
-do 
-  wget http://paddlepaddle.bj.bcebos.com/model_zoo/embedding/$file
+DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
+ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
+          f88c8325ee6da6187f1080e8fe66c1cd
+          927cf70f27f860aff1a5703ebf7f1584
+          a52e43655cd25d279777ed509a1ae27b
+          b92c67fe9ff70fea53596080e351ac80)
+# Fetch each item and compare its MD5 against the ITEM_MD5 entry at the same index.
+for ((i=0; i<${#ITEM_MD5[@]}; i++))
+do
+  FILENAME=${DOWNLOAD_ITEMS[${i}]}
+  REAL_MD5=$(wget "${BASE_URL}/${FILENAME}" -O - | tee "${FILENAME}" | md5sum | cut -d ' ' -f 1)
+  EXPECTED_MD5=${ITEM_MD5[${i}]}
+  [ "${EXPECTED_MD5}" = "${REAL_MD5}" ] || { echo "MD5 mismatch for ${FILENAME}" >&2; exit 1; }
 done
diff --git a/demo/recommendation/evaluate.py b/demo/recommendation/evaluate.py
new file mode 100755
index 0000000000000000000000000000000000000000..3afa7a1e9db5fefb1bbf5aaa174b8168afae4058
--- /dev/null
+++ b/demo/recommendation/evaluate.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+
+
+def get_best_pass(log_filename):
+    """Return the (cost, pass_id) strings of the lowest-cost test result."""
+    with open(log_filename, 'r') as f:
+        text = f.read()
+    # re.S lets '.' cross newlines between "Test", the cost and the pass id.
+    results = re.findall(
+        r'Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)', text, re.S)
+    return sorted(results, key=lambda result: float(result[0]))[0]
+
+
+log_filename = sys.argv[1]
+best_cost, best_pass = get_best_pass(log_filename)
+predict_error = math.sqrt(float(best_cost)) / 2
+print 'Best pass is %s, error is %s, which means predict get error as %f' % (
+    best_pass, best_cost, predict_error)
+# Build the output/pass-<id> path for the best pass and report it.
+evaluate_pass = "output/pass-%s" % best_pass
+print "evaluating from pass %s" % evaluate_pass
diff --git a/doc/faq/index_cn.rst b/doc/faq/index_cn.rst
index ea0ef25f00d4cb3232d5c6ee1f1e33abd2dadaee..7d425a05d46131d84ba895d0fefc3a592a9a36e1 100644
--- a/doc/faq/index_cn.rst
+++ b/doc/faq/index_cn.rst
@@ -72,7 +72,7 @@ PaddlePaddle支持非常多的优化算法(Optimizer)，不同的优化算法需
 减少数据载入的耗时
 ++++++++++++++++++
 
-使用 :code:`pydataprovider`时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
+使用\ :code:`pydataprovider`\ 时，可以减少缓存池的大小，同时设置内存缓存功能，即可以极大的加速数据载入流程。
 :code:`DataProvider` 缓存池的减小，和之前减小通过减小缓存池来减小内存占用的原理一致。
 
 ..  literalinclude:: src/reduce_min_pool_size.py
diff --git a/doc/howto/deep_model/rnn/index_cn.rst b/doc/howto/deep_model/rnn/index_cn.rst
index 9e805ca85191b793c8798a239927a318c70b96f5..9ecab5594cff47cde4700b7ce0f58013a960a16e 100644
--- a/doc/howto/deep_model/rnn/index_cn.rst
+++ b/doc/howto/deep_model/rnn/index_cn.rst
@@ -4,6 +4,7 @@ RNN相关模型
 ..  toctree::
   :maxdepth: 1
 
+  rnn_config_cn.rst
   recurrent_group_cn.md
   hierarchical_layer_cn.rst
   hrnn_rnn_api_compare_cn.rst
diff --git a/doc/howto/deep_model/rnn/rnn_cn.md b/doc/howto/deep_model/rnn/rnn_cn.md
deleted file mode 100644
index 5ec05b2cab9ba85f9f6e9644375ee14f647a413c..0000000000000000000000000000000000000000
--- a/doc/howto/deep_model/rnn/rnn_cn.md
+++ /dev/null
@@ -1,226 +0,0 @@
-RNN 配置
-=================
-
-本教程将指导你如何在 PaddlePaddle 中配置循环神经网络（RNN）。PaddlePaddle 高度支持灵活和高效的循环神经网络配置。 在本教程中，您将了解如何：
-
--   准备用来学习循环神经网络的序列数据。
--   配置循环神经网络架构。
--   使用学习完成的循环神经网络模型生成序列。
-
-我们将使用 vanilla 循环神经网络和 sequence to sequence 模型来指导你完成这些步骤。sequence to sequence 模型的代码可以在`demo / seqToseq`找到。
-
-准备序列数据
----------------------
-
-PaddlePaddle 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。 它们都是序列，它们的大小是`src_dict`，`trg_dict`和`trg_dict`：
-
-``` sourceCode
-settings.input_types = [
-  integer_value_sequence(len(settings.src_dict)),
-  integer_value_sequence(len(settings.trg_dict)),
-  integer_value_sequence(len(settings.trg_dict))]
-```
-
-在`process`函数中，每个`yield`函数将返回三个整数列表。每个整数列表被视为一个整数序列：
-
-``` sourceCode
-yield src_ids, trg_ids, trg_ids_next
-```
-
-有关如何编写数据提供程序的更多细节描述，请参考 [PyDataProvider2](../../ui/data_provider/index.html)。完整的数据提供文件在 `demo/seqToseq/dataprovider.py`。
-
-配置循环神经网络架构
------------------------------------------------
-
-### 简单门控循环神经网络(Gated Recurrent Neural Network)
-
-循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
-
-![image](../../../tutorials/sentiment_analysis/bi_lstm.jpg)
-
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t* = 1 执行以下操作。
-
-*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>),*y*<sub>*t*</sub> = *f*<sub>*y*</sub>(*x*<sub>*t*</sub>)
-
-其中 *f*<sub>*x*</sub>(.) 称为**单步函数**（即单时间步执行的函数，step function），而 *f*<sub>*y*</sub>(.) 称为**输出函数**。在 vanilla 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to sequence 模型演示如何配置复杂的循环神经网络模型。在本节中，我们将使用简单的 vanilla 循环神经网络作为使用`recurrent_group`配置简单循环神经网络的例子。 注意，如果你只需要使用简单的RNN，GRU或LSTM，那么推荐使用`grumemory`和`lstmemory`，因为它们的计算效率比`recurrent_group`更高。
-
-对于 vanilla RNN，在每个时间步长，**单步函数**为：
-
-*x*<sub>*t* + 1</sub> = *W*<sub>*x*</sub>*x*<sub>*t*</sub> + *W*<sub>*i*</sub>*I*<sub>*t*</sub> + *b*
-
-其中 *x*<sub>*t*</sub> 是RNN状态，并且 *I*<sub>*t*</sub> 是输入，*W*<sub>*x*</sub> 和 *W*<sub>*i*</sub> 分别是RNN状态和输入的变换矩阵。*b* 是偏差。它的**输出函数**只需要*x*<sub>*t*</sub>作为输出。
-
-`recurrent_group`是构建循环神经网络的最重要的工具。 它定义了**单步函数**，**输出函数**和循环神经网络的输入。注意，这个函数的`step`参数需要实现`step function`（单步函数）和`output function`（输出函数）：
-
-
-``` sourceCode
-def simple_rnn(input,
-               size=None,
-               name=None,
-               reverse=False,
-               rnn_bias_attr=None,
-               act=None,
-               rnn_layer_attr=None):
-    def __rnn_step__(ipt):
-       out_mem = memory(name=name, size=size)
-       rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
-                                      full_matrix_projection(out_mem)],
-                             name = name,
-                             bias_attr = rnn_bias_attr,
-                             act = act,
-                             layer_attr = rnn_layer_attr,
-                             size = size)
-       return rnn_out
-    return recurrent_group(name='%s_recurrent_group' % name,
-                           step=__rnn_step__,
-                           reverse=reverse,
-                           input=input)
-```
-
-PaddlePaddle 使用“Memory”（记忆模块）实现单步函数。**Memory**是在PaddlePaddle中构造循环神经网络时最重要的概念。 Memory是在单步函数中循环使用的状态，例如*x*<sub>*t* + 1</sub> = *f*<sub>*x*</sub>(*x*<sub>*t*</sub>)。 一个Memory包含**输出**和**输入**。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有**boot layer(引导层)**，其输出被用作Memory的初始值。 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，`rnn_out`层的名称与`out_mem`的名称相同。这意味着`rnn_out` (*x*<sub>*t* + 1</sub>)的输出被用作`out_mem`Memory的**输出**。
-
-Memory也可以是序列。在这种情况下，在每个时间步中，我们有一个序列作为循环神经网络的状态。这在构造非常复杂的循环神经网络时是有用的。 其他高级功能包括定义多个Memory，以及使用子序列来定义分级循环神经网络架构。
-
-我们在函数的结尾返回`rnn_out`。 这意味着 `rnn_out` 层的输出被用作门控循环神经网络的**输出**函数。
-
-### Sequence to Sequence Model with Attention
-
-我们将使用 sequence to sequence model with attention 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
-
-![image](../../../tutorials/text_generation/encoder-decoder-attention-model.png)
-
-在这个模型中，源序列 *S* = {*s*<sub>1</sub>, …, *s*<sub>*T*</sub>} 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态 *H*<sub>*S*</sub> = {*H*<sub>1</sub>, …, *H*<sub>*T*</sub>} 被称为 *编码向量*。解码器是门控循环神经网络。当解读每一个*y*<sub>*t*</sub>时, 这个门控循环神经网络生成一系列权重 *W*<sub>*S*</sub><sup>*t*</sup> = {*W*<sub>1</sub><sup>*t*</sup>, …, *W*<sub>*T*</sub><sup>*t*</sup>}, 用于计算编码向量的加权和。加权和用来生成*y*<sub>*t*</sub>。
-
-模型的编码器部分如下所示。它叫做`grumemory`来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比 `recurrent_group` 更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 [Layers](../../ui/api/trainer_config_helpers/layers_index.html) 了解更多细节。
-
-我们还将编码向量投射到 `decoder_size` 维空间。这通过获得反向循环网络的第一个实例，并将其投射到 `decoder_size` 维空间完成：
-
-``` sourceCode
-# 定义源语句的数据层
-src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
-# 计算每个词的词向量
-src_embedding = embedding_layer(
-    input=src_word_id,
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_source_language_embedding'))
-# 应用前向循环神经网络
-src_forward = grumemory(input=src_embedding, size=encoder_size)
-# 应用反向递归神经网络（reverse=True表示反向循环神经网络）
-src_backward = grumemory(input=src_embedding,
-                          size=encoder_size,
-                          reverse=True)
-# 将循环神经网络的前向和反向部分混合在一起
-encoded_vector = concat_layer(input=[src_forward, src_backward])
-
-# 投射编码向量到 decoder_size
-encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
-                           size = decoder_size)
-
-# 计算反向RNN的第一个实例
-backward_first = first_seq(input=src_backward)
-
-# 投射反向RNN的第一个实例到 decoder size
-decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
-```
-
-解码器使用 `recurrent_group` 来定义循环神经网络。单步函数和输出函数在 `gru_decoder_with_attention` 中定义：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-trg_embedding = embedding_layer(
-    input=data_layer(name='target_language_word',
-                     size=target_dict_dim),
-    size=word_vector_dim,
-    param_attr=ParamAttr(name='_target_language_embedding'))
-group_inputs.append(trg_embedding)
-
-# 对于配备有注意力机制的解码器，在训练中，
-# 目标向量（groudtruth）是数据输入，
-# 而源序列的编码向量可以被无边界的memory访问
-# StaticInput 意味着不同时间步的输入都是相同的值，
-# 否则它以一个序列输入，不同时间步的输入是不同的。
-# 所有输入序列应该有相同的长度。
-decoder = recurrent_group(name=decoder_group_name,
-                          step=gru_decoder_with_attention,
-                          input=group_inputs)
-```
-
-单步函数的实现如下所示。首先，它定义解码网络的**Memory**。然后定义 attention，门控循环单元单步函数和输出函数：
-
-``` sourceCode
-def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
-    # 定义解码器的Memory
-    # Memory的输出定义在 gru_step 内
-    # 注意 gru_step 应该与它的Memory名字相同
-    decoder_mem = memory(name='gru_decoder',
-                         size=decoder_size,
-                         boot_layer=decoder_boot)
-    # 计算 attention 加权编码向量
-    context = simple_attention(encoded_sequence=enc_vec,
-                               encoded_proj=enc_proj,
-                               decoder_state=decoder_mem)
-    # 混合当前词向量和attention加权编码向量
-    decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
-                                           full_matrix_projection(current_word)],
-                                 size = decoder_size * 3)
-    # 定义门控循环单元循环神经网络单步函数
-    gru_step = gru_step_layer(name='gru_decoder',
-                              input=decoder_inputs,
-                              output_mem=decoder_mem,
-                              size=decoder_size)
-    # 定义输出函数
-    out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
-                      size=target_dict_dim,
-                      bias_attr=True,
-                      act=SoftmaxActivation())
-    return out
-```
-
-生成序列
------------------
-
-训练模型后，我们可以使用它来生成序列。通常的做法是使用**beam search** 生成序列。以下代码片段定义 beam search 算法。注意，`beam_search` 函数假设 `step` 的输出函数返回的是下一个时刻输出词的 softmax 归一化概率向量。我们对模型进行了以下更改。
-
--   使用 `GeneratedInput` 来表示 trg\_embedding。 `GeneratedInput` 将上一时间步所生成的词的向量来作为当前时间步的输入。
--   使用 `beam_search` 函数。这个函数需要设置：
-    -   `bos_id`: 开始标记。每个句子都以开始标记开头。
-    -   `eos_id`: 结束标记。每个句子都以结束标记结尾。
-    -   `beam_size`: beam search 算法中的beam大小。
-    -   `max_length`: 生成序列的最大长度。
--   使用 `seqtext_printer_evaluator` 根据索引矩阵和字典打印文本。这个函数需要设置：
-    -   `id_input`: 数据的整数ID，用于标识生成的文件中的相应输出。
-    -   `dict_file`: 用于将词ID转换为词的字典文件。
-    -   `result_file`: 生成结果文件的路径。
-
-代码如下：
-
-``` sourceCode
-group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
-              StaticInput(input=encoded_proj,is_seq=True)]
-# 在生成时，解码器基于编码源序列和最后生成的目标词预测下一目标词。
-# 编码源序列（编码器输出）必须由只读Memory的 StaticInput 指定。
-# 这里， GeneratedInputs 自动获取上一个生成的词，并在最开始初始化为起始词，如 <s>。
-trg_embedding = GeneratedInput(
-    size=target_dict_dim,
-    embedding_name='_target_language_embedding',
-    embedding_size=word_vector_dim)
-group_inputs.append(trg_embedding)
-beam_gen = beam_search(name=decoder_group_name,
-                       step=gru_decoder_with_attention,
-                       input=group_inputs,
-                       bos_id=0, # Beginnning token.
-                       eos_id=1, # End of sentence token.
-                       beam_size=beam_size,
-                       max_length=max_length)
-
-seqtext_printer_evaluator(input=beam_gen,
-                          id_input=data_layer(name="sent_id", size=1),
-                          dict_file=trg_dict_path,
-                          result_file=gen_trans_file)
-outputs(beam_gen)
-```
-
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 [Semantic Role Labeling Demo](../../demo/semantic_role_labeling/index.html) 了解更多详细信息。
-
-完整的配置文件在`demo/seqToseq/seqToseq_net.py`。
diff --git a/doc/howto/deep_model/rnn_config_cn.rst b/doc/howto/deep_model/rnn/rnn_config_cn.rst
similarity index 86%
rename from doc/howto/deep_model/rnn_config_cn.rst
rename to doc/howto/deep_model/rnn/rnn_config_cn.rst
index e6d8c1133a5e8a481c9bf5340c4641343804dcbe..ac2bd0775f4ab2e0a0c37462e2c23001123b152b 100644
--- a/doc/howto/deep_model/rnn_config_cn.rst
+++ b/doc/howto/deep_model/rnn/rnn_config_cn.rst
@@ -1,4 +1,4 @@
-RNN 配置
+RNN配置
 ========
 
 本教程将指导你如何在 PaddlePaddle
@@ -20,7 +20,7 @@ PaddlePaddle
 不需要对序列数据进行任何预处理，例如填充。唯一需要做的是将相应类型设置为输入。例如，以下代码段定义了三个输入。
 它们都是序列，它们的大小是\ ``src_dict``\ ，\ ``trg_dict``\ 和\ ``trg_dict``\ ：
 
-.. code:: sourcecode
+.. code:: python
 
     settings.input_types = [
       integer_value_sequence(len(settings.src_dict)),
@@ -29,12 +29,11 @@ PaddlePaddle
 
 在\ ``process``\ 函数中，每个\ ``yield``\ 函数将返回三个整数列表。每个整数列表被视为一个整数序列：
 
-.. code:: sourcecode
+.. code:: python
 
     yield src_ids, trg_ids, trg_ids_next
 
-有关如何编写数据提供程序的更多细节描述，请参考
-`PyDataProvider2 <../../ui/data_provider/index.html>`__\ 。完整的数据提供文件在
+有关如何编写数据提供程序的更多细节描述，请参考 :ref:`api_pydataprovider2` 。完整的数据提供文件在
 ``demo/seqToseq/dataprovider.py``\ 。
 
 配置循环神经网络架构
@@ -45,18 +44,17 @@ PaddlePaddle
 
 循环神经网络在每个时间步骤顺序地处理序列。下面列出了 LSTM 的架构的示例。
 
-.. figure:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
-   :alt: image
+.. image:: ../../../tutorials/sentiment_analysis/bi_lstm.jpg
+      :align: center
 
-   image
+一般来说，循环网络从 :math:`t=1` 到 :math:`t=T` 或者反向地从 :math:`t=T` 到 :math:`t=1` 执行以下操作。
 
-一般来说，循环网络从 *t* = 1 到 *t* = *T* 或者反向地从 *t* = *T* 到 *t*
-= 1 执行以下操作。
+.. math::
 
-*x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ ),\ *y*\ \ *t*\  = *f*\ \ *y*\ (*x*\ \ *t*\ )
+    x_{t+1} = f_x(x_t), y_t = f_y(x_t)
 
-其中 *f*\ \ *x*\ (.) 称为\ **单步函数**\ （即单时间步执行的函数，step
-function），而 *f*\ \ *y*\ (.) 称为\ **输出函数**\ 。在 vanilla
+其中 :math:`f_x(.)` 称为\ **单步函数**\ （即单时间步执行的函数，step
+function），而 :math:`f_y(.)` 称为\ **输出函数**\ 。在 vanilla
 循环神经网络中，单步函数和输出函数都非常简单。然而，PaddlePaddle
 可以通过修改这两个函数来实现复杂的网络配置。我们将使用 sequence to
 sequence
@@ -67,16 +65,17 @@ vanilla
 
 对于 vanilla RNN，在每个时间步长，\ **单步函数**\ 为：
 
-*x*\ \ *t* + 1 = *W*\ \ *x*\ \ *x*\ \ *t*\  + *W*\ \ *i*\ \ *I*\ \ *t*\  + *b*
+.. math::
 
-其中 *x*\ \ *t*\  是RNN状态，并且 *I*\ \ *t*\  是输入，\ *W*\ \ *x*\  和
-*W*\ \ *i*\  分别是RNN状态和输入的变换矩阵。\ *b*
-是偏差。它的\ **输出函数**\ 只需要\ *x*\ \ *t*\ 作为输出。
+    x_{t+1} = W_x x_t + W_i I_t + b
+
+其中 :math:`x_t` 是RNN状态，并且 :math:`I_t` 是输入，:math:`W_x` 和
+:math:`W_i` 分别是RNN状态和输入的变换矩阵。:math:`b` 是偏差。它的\ **输出函数**\ 只需要 :math:`x_t` 作为输出。
 
 ``recurrent_group``\ 是构建循环神经网络的最重要的工具。
 它定义了\ **单步函数**\ ，\ **输出函数**\ 和循环神经网络的输入。注意，这个函数的\ ``step``\ 参数需要实现\ ``step function``\ （单步函数）和\ ``output function``\ （输出函数）：
 
-.. code:: sourcecode
+.. code:: python
 
     def simple_rnn(input,
                    size=None,
@@ -102,7 +101,7 @@ vanilla
 
 PaddlePaddle
 使用“Memory”（记忆模块）实现单步函数。\ **Memory**\ 是在PaddlePaddle中构造循环神经网络时最重要的概念。
-Memory是在单步函数中循环使用的状态，例如\ *x*\ \ *t* + 1 = *f*\ \ *x*\ (*x*\ \ *t*\ )。
+Memory是在单步函数中循环使用的状态，例如 :math:`x_{t+1} = f_x(x_t)` 。
 一个Memory包含\ **输出**\ 和\ **输入**\ 。当前时间步处的Memory的输出作为下一时间步Memory的输入。Memory也可以具有\ **boot
 layer(引导层)**\ ，其输出被用作Memory的初始值。
 在我们的例子中，门控循环单元的输出被用作输出Memory。请注意，\ ``rnn_out``\ 层的名称与\ ``out_mem``\ 的名称相同。这意味着\ ``rnn_out``
@@ -120,30 +119,25 @@ Sequence to Sequence Model with Attention
 我们将使用 sequence to sequence model with attention
 作为例子演示如何配置复杂的循环神经网络模型。该模型的说明如下图所示。
 
-.. figure:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
-   :alt: image
-
-   image
+.. image:: ../../../tutorials/text_generation/encoder-decoder-attention-model.png
+      :align: center
 
-在这个模型中，源序列 *S* = {*s*\ 1, …, \ *s*\ \ *T*\ }
+在这个模型中，源序列 :math:`S = \{s_1, \dots, s_T\}` 
 用双向门控循环神经网络编码。双向门控循环神经网络的隐藏状态
-*H*\ \ *S*\  = {*H*\ 1, …, \ *H*\ \ *T*\ } 被称为
-*编码向量*\ 。解码器是门控循环神经网络。当解读每一个\ *y*\ \ *t*\ 时,
-这个门控循环神经网络生成一系列权重
-*W*\ \ *S*\ \ *t*\  = {*W*\ 1\ *t*\ , …, \ *W*\ \ *T*\ \ *t*\ },
-用于计算编码向量的加权和。加权和用来生成\ *y*\ \ *t*\ 。
+:math:`H_S = \{H_1, \dots, H_T\}` 被称为
+*编码向量*\ 。解码器是门控循环神经网络。当解读每一个 :math:`y_t` 时,
+这个门控循环神经网络生成一系列权重  :math:`W_S^t = \{W_1^t, \dots, W_T^t\}` ,
+用于计算编码向量的加权和。加权和用来生成 :math:`y_t` 。
 
 模型的编码器部分如下所示。它叫做\ ``grumemory``\ 来表示门控循环神经网络。如果网络架构简单，那么推荐使用循环神经网络的方法，因为它比
 ``recurrent_group``
-更快。我们已经实现了大多数常用的循环神经网络架构，可以参考
-`Layers <../../ui/api/trainer_config_helpers/layers_index.html>`__
-了解更多细节。
+更快。我们已经实现了大多数常用的循环神经网络架构，可以参考 :ref:`api_trainer_config_helpers_layers` 了解更多细节。
 
 我们还将编码向量投射到 ``decoder_size``
 维空间。这通过获得反向循环网络的第一个实例，并将其投射到
 ``decoder_size`` 维空间完成：
 
-.. code:: sourcecode
+.. code:: python
 
     # 定义源语句的数据层
     src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
@@ -174,7 +168,7 @@ Sequence to Sequence Model with Attention
 解码器使用 ``recurrent_group`` 来定义循环神经网络。单步函数和输出函数在
 ``gru_decoder_with_attention`` 中定义：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -198,7 +192,7 @@ Sequence to Sequence Model with Attention
 单步函数的实现如下所示。首先，它定义解码网络的\ **Memory**\ 。然后定义
 attention，门控循环单元单步函数和输出函数：
 
-.. code:: sourcecode
+.. code:: python
 
     def gru_decoder_with_attention(enc_vec, enc_proj, current_word):
         # 定义解码器的Memory
@@ -253,7 +247,7 @@ attention，门控循环单元单步函数和输出函数：
 
 代码如下：
 
-.. code:: sourcecode
+.. code:: python
 
     group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
                   StaticInput(input=encoded_proj,is_seq=True)]
@@ -279,9 +273,6 @@ attention，门控循环单元单步函数和输出函数：
                               result_file=gen_trans_file)
     outputs(beam_gen)
 
-注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅
-`Semantic Role Labeling
-Demo <../../demo/semantic_role_labeling/index.html>`__
-了解更多详细信息。
+注意，这种生成技术只用于类似解码器的生成过程。如果你正在处理序列标记任务，请参阅 :ref:`semantic_role_labeling` 了解更多详细信息。
 
 完整的配置文件在\ ``demo/seqToseq/seqToseq_net.py``\ 。
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index 6a14ce8ae75c3dd372184ea6ea9f6034a3dbf919..bd3d0ec292057037414792b1ac176d12605b90d5 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -7,10 +7,11 @@
 ..  toctree::
   :maxdepth: 1
 
+  usage/cmd_parameter/index_cn.rst
   usage/concepts/use_concepts_cn.rst
   usage/cluster/cluster_train_cn.md
-  usage/cluster/k8s/k8s_cn.md
-  usage/cluster/k8s/k8s_distributed_cn.md
+  usage/k8s/k8s_cn.md
+  usage/k8s/k8s_distributed_cn.md
 
 开发标准
 --------
diff --git a/doc/howto/index_en.rst b/doc/howto/index_en.rst
index 983dc743eb453a0210bc5fb3c7e4525fa838d428..1fbfcd260b912078f00ed5b720ed607db725c4e2 100644
--- a/doc/howto/index_en.rst
+++ b/doc/howto/index_en.rst
@@ -7,8 +7,10 @@ Usage
 ..  toctree::
   :maxdepth: 1
 
-  usage/cmd_parameter/index_en.md
+  usage/cmd_parameter/index_en.rst
   usage/cluster/cluster_train_en.md
+  usage/k8s/k8s_en.md
+  usage/k8s/k8s_aws_en.md
 
 Development
 ------------
diff --git a/doc/howto/usage/cmd_parameter/arguments_cn.md b/doc/howto/usage/cmd_parameter/arguments_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..833e21dd19ef3c01f5ef990bd12c3fc3b41ba483
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/arguments_cn.md
@@ -0,0 +1,409 @@
+# 参数概述
+
+虽然Paddle看起来包含了众多参数，但是大部分参数是为开发者提供的，或者已经在集群提交环境中自动设置，因此用户并不需要关心它们。在此，根据这些参数的使用场合，我们将它们划分为不同的类别。例如，`通用`类别中的参数可用于所有场合。某些参数只可用于特定的层中，而有些参数需要在集群多机训练中使用等。
+
+<html>
+<table border="2" frame="border">
+<thead>
+<tr>
+<th scope="col" class="left"></th>
+<th scope="col" class="left">参数</th>
+<th scope="col" class="left">本地训练</th>
+<th scope="col" class="left">集群训练</th>
+<th scope="col" class="left">本地测试</th>
+<th scope="col" class="left">集群测试</th>
+</tr>
+</thead>
+
+<tbody>
+<tr>
+<td class="left" rowspan="9">通用</td>
+<td class="left">job</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">use_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">local</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">config_args</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">num_passes</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">trainer_count</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">version</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">show_layer_stat</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan="15">训练</td><td class="left">dot_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_parameter_stats_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">init_model_path</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">load_missing_parameter_strategy</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">saving_period_by_batches</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">use_old_updater</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">enable_grad_share</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">grad_share_block_num</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_error_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_clipping</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">save_only_one</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">allow_inefficient_sparse_update</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">start_pass</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">训练/测试</td><td class="left">save_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">训练过程中测试</td><td class="left">test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">average_test_period</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "5">测试</td><td class="left">model_list</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_wait</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">test_pass</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">predict_output_dir</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">distribute_test</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">Auc/正负对验证(PnpairValidation)</td><td class="left">predict_file</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "6">GPU</td><td class="left">gpu_id</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">parallel_nn</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">allow_only_one_model_on_one_gpu</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cuda_dir</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">cudnn_conv_workspace_limit_in_mb</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "4">递归神经网络(RNN)</td>
+<td class="left">beam_size</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rnn_use_batch</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">prev_batch_state</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">diy_beam_search_prob_so</td>
+<td class="left"></td><td class="left"></td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">度量学习(metric learning)</td><td class="left">external</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">data_server_port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "16">参数服务器(PServer)</td><td class="left">start_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">pservers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">port_num</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">ports_num_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">nics</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">rdma_tcp</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">small_messages</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">loadsave_parameters_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left">√</td>
+</tr>
+
+<tr>
+<td class="left">log_period_server</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">pserver_num_threads</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_send_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">sock_recv_buf_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">num_gradient_servers</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">parameter_block_size_for_sparse</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "3">异步随机梯度下降(Async SGD)</td><td class="left">async_count</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_min</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">async_lagged_ratio_default</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "8">性能调优(Performance Tuning)</td><td class="left">log_barrier_abstract</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_lowest_nodes</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">log_barrier_show_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_batches</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_ratio</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_unbalance_degree</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">check_sparse_distribution_in_pserver</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">show_check_sparse_distribution_log</td>
+<td class="left"></td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">数据提供器(Data Provider)</td><td class="left">memory_threshold_on_load_data</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left" rowspan = "2">随机数</td><td class="left">seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">thread_local_rand_use_global_seed</td>
+<td class="left">√</td><td class="left">√</td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">单元测试</td><td class="left">checkgrad_eps</td>
+<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
+</tr>
+
+<tr>
+<td class="left">矩阵/向量</td><td class="left">enable_parallel_vector</td>
+<td class="left">√</td><td class="left">√</td><td class="left">√</td><td class="left">√</td>
+</tr>
+
+</tbody>
+
+</table>
+</html>
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_cn.md b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..dbf7c6f00b8ba5c62d86fb2143221a27330b9506
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_cn.md
@@ -0,0 +1,336 @@
+# 细节描述
+
+## 通用
+
+* `--job`
+  - 工作模式，包括: **train, test, checkgrad**，其中checkgrad主要为开发者使用，使用者不需要关心。
+  - 类型: string (默认: train)
+
+* `--config`
+  - 用于指定网络配置文件。
+  - 类型: string (默认: null).
+
+* `--use_gpu`
+  - 训练过程是否使用GPU，设置为true使用GPU模式，否则使用CPU模式。
+  - 类型: bool (默认: 1).
+
+* `--local`
+  - 训练过程是否为本地模式，设置为true使用本地训练或者使用集群上的一个节点，否则使用多机训练。
+  - 类型: bool (默认: 1).
+
+* `--trainer_count`
+  - 指定一台机器上使用的线程数。例如，trainer_count = 4, 意思是在GPU模式下使用4个GPU，或者在CPU模式下使用4个线程。每个线程（或GPU）分配到当前数据块样本数的四分之一。也就是说，如果在训练配置中设置batch_size为512，每个线程分配到128个样本用于训练。
+  - 类型: int32 (默认: 1).
+
+* `--num_passes`
+  - 当模式为`--job=train`时, 该参数的意思是训练num_passes轮。每轮会将数据集中的所有训练样本使用一次。当模式为`--job=test`时，意思是使用第test_pass个模型到第 num_passes-1 个模型测试数据。
+  - 类型: int32 (默认: 100).
+
+* `--config_args`
+  - 传递给配置文件的参数。格式: key1=value1,key2=value2.
+  - 类型: string (默认: null).
+
+* `--version`
+  - 是否打印版本信息。
+  - 类型: bool (默认: 0).
+
+* `--show_layer_stat`
+  - 是否显示**每个批次数据**中每层的数值统计.
+  - 类型: bool (默认: 0).
+
+## 训练
+
+* `--log_period`
+  - 每log_period个批次打印日志进度.
+  - 类型: int32 (默认: 100).
+
+* `--dot_period`
+  - 每dot_period个批次输出符号'.'.
+  - 类型: int32 (默认: 1).
+
+* `--saving_period`
+  - 每saving_period轮保存训练参数.
+  - 类型: int32 (默认: 1).
+
+* `--save_dir`
+  - 保存模型参数的目录，需要明确指定，但不需要提前创建。
+  - 类型: string (默认: null).
+
+* `--start_pass`
+  - 从start_pass轮开始训练，会加载上一轮的参数。
+  - 类型: int32 (默认: 0).
+
+* `--show_parameter_stats_period`
+  - 在训练过程中每show_parameter_stats_period个批次输出参数统计。默认不显示。
+  - 类型: int32 (默认: 0).
+
+* `--save_only_one`
+  - 只保存最后一轮的参数，而之前的参数将会被删除。
+  - 类型: bool (默认: 0).
+
+* `--load_missing_parameter_strategy`
+  - 当模型参数不存在时，指定加载的方式。目前支持fail/rand/zero三种操作.
+    - `fail`: 程序直接退出.
+    - `rand`: 根据网络配置中的**initial\_strategy**采用均匀分布或者高斯分布初始化。均匀分布的范围是: **[mean - std, mean + std]**, 其中mean和std是训练配置中的参数.
+    - `zero`: 所有参数置为零.
+  - 类型: string (默认: fail).
+
+* `--init_model_path`
+   - 初始化模型的路径。如果设置该参数，start\_pass将不起作用。同样也可以在测试模式中指定模型路径。
+   - 类型: string (默认: null).
+
+* `--saving_period_by_batches`
+   - 在一轮中每saving_period_by_batches个批次保存一次参数。
+   - 类型: int32 (默认: 0).
+
+* `--log_error_clipping`
+  - 当在网络层配置中设置**error_clipping_threshold**时，该参数指示是否打印错误截断日志。如果为true，**每批次**的反向传播将会打印日志信息。该截断会影响**输出的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--log_clipping`
+  - 当在训练配置中设置**gradient_clipping_threshold**时，该参数指示是否打印日志截断信息。该截断会影响**权重更新的梯度**.
+  - 类型: bool (默认: 0).
+
+* `--use_old_updater`
+  - 是否使用旧的RemoteParameterUpdater。 默认使用ConcurrentRemoteParameterUpdater，主要为开发者使用，使用者通常无需关心.
+  - 类型: bool (默认: 0).
+
+* `--enable_grad_share`
+  - 启用梯度参数的阈值，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 100 \* 1024 \* 1024).
+
+* `--grad_share_block_num`
+  - 梯度参数的分块数目，在多CPU训练时共享该参数.
+  - 类型: int32 (默认: 64).
+
+## 测试
+
+* `--test_pass`
+  - 加载test_pass轮的模型用于测试.
+  - 类型: int32 (默认: -1).
+
+* `--test_period`
+   - 如果为0，每轮结束时对所有测试数据进行测试；如果不为0，每test_period个批次对所有测试数据进行测试.
+  - 类型: int32 (默认: 0).
+
+* `--test_wait`
+  - 指示当指定轮的测试模型不存在时，是否需要等待该轮模型参数。如果在训练期间同时发起另外一个进程进行测试，可以使用该参数.
+  - 类型: bool (默认: 0).
+
+* `--model_list`
+  - 测试时指定的存储模型列表的文件.
+  - 类型: string (默认: "", null).
+
+* `--predict_output_dir`
+  - 保存网络层输出结果的目录。该参数在网络配置的Outputs()中指定，默认为null，意思是不保存结果。在测试阶段，如果你想要保存某些层的特征图，请指定该目录。需要注意的是，网络层的输出是经过激活函数之后的值.
+  - 类型: string (默认: "", null).
+
+* `--average_test_period`
+  - 使用`average_test_period`个批次的参数平均值进行测试。该参数必须能被FLAGS_log_period整除，默认为0，意思是不使用平均参数执行测试.
+  - 类型: int32 (默认: 0).
+
+* `--distribute_test`
+  - 在分布式环境中测试，将多台机器的测试结果合并.
+  - 类型: bool (默认: 0).
+
+* `--predict_file`
+  - 保存预测结果的文件名。该参数默认为null，意思是不保存结果。目前该参数仅用于AucValidationLayer和PnpairValidationLayer层，每轮都会保存预测结果.
+  - 类型: string (默认: "", null).
+
+## GPU
+
+* `--gpu_id`
+  - 指示使用哪个GPU核.
+  - 类型: int32 (默认: 0).
+
+* `--allow_only_one_model_on_one_gpu`
+  - 如果为true，一个GPU设备上不允许配置多个模型.
+  - 类型: bool (默认: 1).
+
+* `--parallel_nn`
+  - 指示是否使用多线程来计算一个神经网络。如果为false，设置gpu_id指定使用哪个GPU核（训练配置中的设备属性将会无效）。如果为true，GPU核在训练配置中指定（gpu_id无效）.
+  - 类型: bool (默认: 0).
+
+* `--cudnn_dir`
+  - 选择路径来动态加载NVIDIA CuDNN库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cuda_dir`
+  - 选择路径来动态加载NVIDIA CUDA库，例如，/usr/local/cuda/lib64. [默认]: LD_LIBRARY_PATH
+  - 类型: string (默认: "", null)
+
+* `--cudnn_conv_workspace_limit_in_mb`
+  - 指定cuDNN的最大工作空间容限，单位是MB，默认为4096MB=4GB. 
+  - 类型: int32 (默认: 4096MB=4GB)
+
+## 自然语言处理(NLP): RNN/LSTM/GRU
+* `--rnn_use_batch`
+  - 指示在简单的RecurrentLayer层的计算中是否使用批处理方法.
+  - 类型: bool (默认: 0).
+
+* `--prev_batch_state`
+  - 标识是否为连续的batch计算.
+  - 类型: bool (默认: 0).
+
+* `--beam_size`
+  - 集束搜索使用广度优先搜索的方式构建查找树。在树的每一层上，都会产生当前层状态的所有继承结果，按启发式损失的大小递增排序。然而，每层上只能保存固定数目个最好的状态，该数目是提前定义好的，称之为集束大小.
+  - 类型: int32 (默认: 1).
+
+* `--diy_beam_search_prob_so`
+  - 用户可以自定义beam search的方法，编译成动态库，供PaddlePaddle加载。 该参数用于指定动态库路径.
+  - 类型: string (默认: "", null).
+
+## 度量学习(Metric Learning)
+* `--external`
+   - 指示是否使用外部机器进行度量学习.
+   - 类型: bool (默认: 0).
+
+* `--data_server_port`
+  - 数据服务器(data server)的监听端口，主要用在度量学习中.
+  - 类型: int32 (默认: 21134).
+
+## 数据支持(DataProvider)
+
+* `--memory_threshold_on_load_data`
+  - 内存容限阈值，当超过该阈值时，停止加载数据.
+  - 类型: double (默认: 1.0).
+
+## 单元测试
+
+* `--checkgrad_eps`
+  - 使用checkgrad模式时的参数变化大小.
+  - 类型: double (默认: 1e-05).
+
+## 参数服务器和分布式通信
+
+* `--start_pserver`
+  - 指示是否开启参数服务器(parameter server).
+  - 类型: bool (默认: 0).
+
+* `--pservers`
+  - 参数服务器的IP地址，以逗号间隔.
+  - 类型: string (默认: "127.0.0.1").
+
+* `--port`
+  - 参数服务器的监听端口.
+  - 类型: int32 (默认: 20134).
+
+* `--ports_num`
+  - 发送参数的端口号，根据默认端口号递增.
+  - 类型: int32 (默认: 1).
+
+* `--trainer_id`
+  - 在分布式训练中，每个训练节点必须指定一个唯一的id号，从0到num_trainers-1。0号训练节点是主训练节点。使用者无需关心这个参数.
+  - 类型: int32 (默认: 0).
+
+* `--num_gradient_servers`
+  - 梯度服务器的数量，该参数在集群提交环境中自动设置.
+  - 类型: int32 (默认: 1).
+
+* `--small_messages`
+  - 如果消息数据太小，建议将该参数设为true，启动快速应答，无延迟.
+  - 类型: bool (默认: 0).
+
+* `--sock_send_buf_size`
+  - 限制套接字发送缓冲区的大小。如果仔细设置的话，可以有效减小网络的阻塞.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--sock_recv_buf_size`
+  - 限制套接字接收缓冲区的大小.
+  - 类型: int32 (默认: 1024 \* 1024 \* 40).
+
+* `--parameter_block_size`
+  - 参数服务器的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--parameter_block_size_for_sparse`
+  - 参数服务器稀疏更新的参数分块大小。如果未设置，将会自动计算出一个合适的值.
+  - 类型: int32 (默认: 0).
+
+* `--log_period_server`
+  - 在参数服务器终端每log_period_server个批次打印日志进度.
+  - 类型: int32 (默认: 500).
+
+* `--loadsave_parameters_in_pserver`
+  - 在参数服务器上加载和保存参数，只有当设置了sparse_remote_update参数时才有效.
+  - 类型: bool (默认: 0).
+
+* `--pserver_num_threads`
+  - 同步执行操作的线程数.
+  - 类型: int32 (默认: 1).
+
+* `--ports_num_for_sparse`
+  - 发送参数的端口号，根据默认值递增(port + ports_num)，用于稀疏训练中.
+  - 类型: int32 (默认: 0).
+
+* `--nics`
+  - 参数服务器的网络设备名称，已经在集群提交环境中完成设置.
+  - 类型: string (默认: "xgbe0,xgbe1").
+
+* `--rdma_tcp`
+  - 使用rdma还是tcp传输协议，该参数已经在集群提交环境中完成设置.
+  - 类型: string (默认: "tcp").
+
+## 异步随机梯度下降(Async SGD)
+* `--async_count`
+  - 定义异步训练的长度，如果为0，则使用同步训练.
+  - 类型: int32 (默认: 0).
+
+* `--async_lagged_ratio_min`
+  - 控制`config_.async_lagged_grad_discard_ratio()`的最小值.
+  - 类型: double (默认: 1.0).
+
+* `--async_lagged_ratio_default`
+  - 如果在网络配置中未设置async_lagged_grad_discard_ratio，则使用该参数作为默认值.
+  - 类型: double (默认: 1.5).
+
+## 性能调优(Performance Tuning)
+
+* `--log_barrier_abstract`
+  - 如果为true，则显示阻隔性能的摘要信息.
+  - 类型: bool (默认: 1).
+
+* `--log_barrier_show_log`
+  - 如果为true，则总会显示阻隔摘要信息，即使间隔很小.
+  - 类型: bool (默认: 0).
+
+* `--log_barrier_lowest_nodes`
+  - 最少显示多少个节点.
+  - 类型: int32 (默认: 5).
+
+* `--check_sparse_distribution_in_pserver`
+  - 指示是否检查所有参数服务器上的稀疏参数的分布是均匀的.
+  - 类型: bool (默认: 0).
+
+* `--show_check_sparse_distribution_log`
+  - 指示是否显示参数服务器上的稀疏参数分布的日志细节.
+  - 类型: bool (默认: 0).
+
+* `--allow_inefficient_sparse_update`
+  - 指示是否允许低效率的稀疏更新.
+  - 类型: bool (默认: 0).
+
+* `--check_sparse_distribution_batches`
+  - 每运行多少个批次执行一次稀疏参数分布的检查.
+  - 类型: int32 (默认: 100).
+
+* `--check_sparse_distribution_ratio`
+  - 如果检查到分配在不同参数服务器上的参数的分布不均匀次数大于check_sparse_distribution_ratio *  check_sparse_distribution_batches次，程序停止.
+  - 类型: double (默认: 0.6).
+
+* `--check_sparse_distribution_unbalance_degree`
+  - 不同参数服务器上数据大小的最大值与最小值的比率.
+  - 类型: double (默认: 2).
+
+## 矩阵/向量/随机数
+* `--enable_parallel_vector`
+  - 启动并行向量的阈值.
+  - 类型: int32 (默认: 0).
+
+* `--seed`
+  - 随机数的种子。设置为0时使用srand(time).
+  - 类型: int32 (默认: 1)
+
+* `--thread_local_rand_use_global_seed`
+  - 是否将全局种子应用于本地线程的随机数.
+  - 类型: bool (默认: 0).
diff --git a/doc/howto/usage/cmd_parameter/detail_introduction_en.md b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
index 27b2faf1d8a9367ff9498a76d363791ab7fbe61c..aa69a3bd5423c4f3223242bdafda251271925f2d 100644
--- a/doc/howto/usage/cmd_parameter/detail_introduction_en.md
+++ b/doc/howto/usage/cmd_parameter/detail_introduction_en.md
@@ -73,7 +73,7 @@
   - type: bool (default: 0).
 
 * `--load_missing_parameter_strategy`
-  - Specify the loading operation when model file is missing. Now support fail/rand/zere three operations.
+  - Specify the loading operation when model file is missing. Now support fail/rand/zero three operations.
     - `fail`: program will exit.
     - `rand`: uniform or normal distribution according to **initial\_strategy** in network config. Uniform range is: **[mean - std, mean + std]**, where mean and std are configures in trainer config.
     - `zero`: all parameters are zero.
@@ -118,11 +118,11 @@
   - type: int32 (default: 0).
 
 * `--test_wait`
-  - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
+  - Whether to wait for parameter per pass if not exist. It can be used when the user launches another process to perform testing during the training process.
   - type: bool (default: 0).
 
 * `--model_list`
-  - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
+  - File that saves the model list when testing. 
   - type: string (default: "", null).
 
 * `--predict_output_dir`
@@ -212,7 +212,7 @@
   - type: bool (default: 0).
 
 * `--pservers`
-  - Comma separated IP addresses of pservers. It is set automatically in cluster submitting environment.
+  - Comma separated IP addresses of pservers.
   - type: string (default: "127.0.0.1").
 
 * `--port`
diff --git a/doc/howto/usage/cmd_parameter/index_cn.rst b/doc/howto/usage/cmd_parameter/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..4c8729821110b9aec99351fc0a83a1ba75a8a2bb
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_cn.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+设置命令行参数
+===============
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_cn.md
+  arguments_cn.md
+  detail_introduction_cn.md
diff --git a/doc/howto/usage/cmd_parameter/index_en.md b/doc/howto/usage/cmd_parameter/index_en.md
deleted file mode 100644
index 2a96e7e976c43fd69befccd78753cee431ef61bc..0000000000000000000000000000000000000000
--- a/doc/howto/usage/cmd_parameter/index_en.md
+++ /dev/null
@@ -1,8 +0,0 @@
-```eval_rst
-..  _cmd_line_index:
-```
-# Set Command-line Parameters
-
-* [Use Case](use_case_en.md)
-* [Arguments](arguments_en.md)
-* [Detailed Descriptions](detail_introduction_en.md)
diff --git a/doc/howto/usage/cmd_parameter/index_en.rst b/doc/howto/usage/cmd_parameter/index_en.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0e3c72d27aca063f1b6f1c23e55718dba373c40a
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/index_en.rst
@@ -0,0 +1,11 @@
+..  _cmd_line_index:
+
+Set Command-line Parameters
+===========================
+
+..  toctree::
+  :maxdepth: 1
+
+  use_case_en.md
+  arguments_en.md
+  detail_introduction_en.md
diff --git a/doc/howto/usage/cmd_parameter/use_case_cn.md b/doc/howto/usage/cmd_parameter/use_case_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..db8c39d950771726346ff9c9481990abc13036cf
--- /dev/null
+++ b/doc/howto/usage/cmd_parameter/use_case_cn.md
@@ -0,0 +1,182 @@
+# 使用案例
+
+## 本地训练
+
+本地训练的实验，诸如图像分类，自然语言处理等，通常都会使用下面这些命令行参数。
+
+```
+paddle train \
+  --use_gpu=1/0 \                        #1:GPU,0:CPU(默认为1)
+  --config=network_config \
+  --save_dir=output \
+  --trainer_count=COUNT \                #(默认为1)
+  --test_period=M \                      #(默认为0) 
+  --num_passes=N \                       #(默认为100)
+  --log_period=K \                       #(默认为100)
+  --dot_period=1000 \                    #(默认为1)
+  #[--show_parameter_stats_period=100] \ #(默认为0)
+  #[--saving_period_by_batches=200] \    #(默认为0)
+```
+根据你的任务，可以选择是否使用参数`show_parameter_stats_period`和`saving_period_by_batches`。
+
+### 1) 将命令参数传给网络配置
+
+`config_args`是一个很有用的参数，用于将参数传递给网络配置。
+
+```
+--config_args=generating=1,beam_size=5,layer_num=10 \
+```
+`get_config_arg`可用于在网络配置中解析这些参数，如下所示：
+
+```
+generating = get_config_arg('generating', bool, False)
+beam_size = get_config_arg('beam_size', int, 3)
+layer_num = get_config_arg('layer_num', int, 8)
+```
+
+`get_config_arg`:
+
+```
+get_config_arg(name, type, default_value)
+```
+- name: `--config_args`中指定的名字
+- type: 值类型，包括bool, int, str, float等
+- default_value: 默认值
+
+### 2) 使用模型初始化网络
+
+增加如下参数：
+
+```
+--init_model_path=model_path
+--load_missing_parameter_strategy=rand
+```
+
+## 本地测试
+
+方法一：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --init_model_path=model_path \
+```
+- 使用init\_model\_path指定测试的模型
+- 只能测试单个模型
+
+方法二：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \ 
+             --config=network_config \
+             --trainer_count=COUNT \ 
+             --model_list=model.list \
+```
+- 使用model_list指定测试的模型列表
+- 可以测试多个模型，文件model.list如下所示：
+
+```
+./alexnet_pass1
+./alexnet_pass2
+```
+
+方法三：
+
+```
+paddle train --job=test \
+             --use_gpu=1/0 \
+             --config=network_config \
+             --trainer_count=COUNT \
+             --save_dir=model \
+             --test_pass=M \
+             --num_passes=N \
+```
+这种方式必须使用Paddle存储的模型路径格式，如：`model/pass-%5d`。测试的模型包括从第M轮到第N-1轮存储的所有模型。例如，M=12，N=14这种写法将会测试模型`model/pass-00012`和`model/pass-00013`。
+
+## 稀疏训练
+
+当输入是维度很高的稀疏数据时，通常使用稀疏训练来加速计算过程。例如，输入数据的字典维数是1百万，但是每个样本仅包含几个词。在Paddle中，稀疏矩阵的乘积应用于前向传播过程，而稀疏更新在反向传播之后的权重更新时进行。
+
+### 1) 本地训练
+
+用户需要在网络配置中指定**sparse\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+### 2) 集群训练
+
+在集群上训练一个稀疏模型需要加上下面的参数。同时用户需要在网络配置中指定**sparse\_remote\_update=True**。请参照网络配置的文档了解更详细的信息。
+
+```
+--ports_num_for_sparse=1    #(默认为0)
+```
+
+## parallel_nn
+用户可以设置`parallel_nn`来混合使用GPU和CPU计算网络层的参数。也就是说，你可以将网络配置成某些层使用GPU计算，而其他层使用CPU计算。另一种方式是将网络层划分到不同的GPU上去计算，这样可以减小GPU内存，或者采用并行计算来加速某些层的更新。
+
+如果你想使用这些特性，你需要在网络配置中指定设备的ID号(表示为deviceId)，并且加上下面的命令行参数:
+
+```
+--parallel_nn=true
+```
+### 案例一：GPU和CPU混合使用
+请看下面的例子：
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+default_device(0)
+
+fc1=fc_layer(...)
+fc2=fc_layer(...)
+fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
+
+```
+- default_device(0): 设置默认设备号为0。这意味着除了指定device=-1的层之外，其他所有层都会使用GPU计算，每层使用的GPU号依赖于参数trainer\_count和gpu\_id(默认为0)。在此，fc1和fc2层在GPU上计算。
+
+- device=-1: fc3层使用CPU计算。
+
+- trainer_count:
+  - trainer_count=1: 如果未设置gpu\_id，那么fc1和fc2层将会使用第1个GPU来计算。否则使用gpu\_id指定的GPU。
+
+  - trainer_count>1: 在trainer\_count个GPU上使用数据并行来计算某一层。例如，trainer\_count=2意味着0号和1号GPU将会使用数据并行来计算fc1和fc2层。
+
+### 案例二：在不同设备上指定层
+
+```
+#command line:
+paddle train --use_gpu=true --parallel_nn=true --trainer_count=COUNT
+
+#network:
+fc2=fc_layer(input=l1, layer_attr=ExtraAttr(device=0), ...)
+fc3=fc_layer(input=l1, layer_attr=ExtraAttr(device=1), ...)
+fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
+```
+在本例中，我们假设一台机器上有4个GPU。
+
+- trainer_count=1:
+  - 使用0号GPU计算fc2层。
+  - 使用1号GPU计算fc3层。
+  - 使用CPU计算fc4层。
+
+- trainer_count=2:
+  - 使用0号和1号GPU计算fc2层。
+  - 使用2号和3号GPU计算fc3层。
+  - 使用CPU两线程计算fc4层。
+
+- trainer_count=4:
+  - 运行失败（注意到我们已经假设机器上有4个GPU），因为参数`allow_only_one_model_on_one_gpu`默认设置为真。
+
+**当`device!=-1`时设备ID号的分配：**
+
+```
+(deviceId + gpu_id + threadId * numLogicalDevices_) % numDevices_
+
+deviceId:             在层中指定
+gpu_id:               默认为0
+threadId:             线程ID号，范围: 0,1,..., trainer_count-1
+numDevices_:          机器的设备(GPU)数目
+numLogicalDevices_:   min(max(deviceId + 1), numDevices_)
+```
diff --git a/doc/howto/usage/cmd_parameter/use_case_en.md b/doc/howto/usage/cmd_parameter/use_case_en.md
index 4d7bb33f36fe258ee24796eedc9296065923e58f..e287f0c4b9617cbc6504596512bf408c56dc10f9 100644
--- a/doc/howto/usage/cmd_parameter/use_case_en.md
+++ b/doc/howto/usage/cmd_parameter/use_case_en.md
@@ -134,14 +134,14 @@ fc2=fc_layer(...)
 fc3=fc_layer(...,layer_attr=ExtraAttr(device=-1))
 
 ```
-- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer l1 and l2 are computed on the GPU.
+- default_device(0): set default device ID to 0. This means that except the layers with device=-1, all layers will use a GPU, and the specific GPU used for each layer depends on trainer\_count and gpu\_id (0 by default). Here, layer fc1 and fc2 are computed on the GPU.
 
-- device=-1: use the CPU for layer l3.
+- device=-1: use the CPU for layer fc3.
 
 - trainer_count:
-  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers l1 and l2. Otherwise use the GPU with gpu\_id.
+  - trainer_count=1: if gpu\_id is not set, then use the first GPU to compute layers fc1 and fc2. Otherwise use the GPU with gpu\_id.
 
-  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer l1 and l2.
+  - trainer_count>1: use trainer\_count GPUs to compute one layer using data parallelism. For example, trainer\_count=2 means that GPUs 0 and 1 will use data parallelism to compute layer fc1 and fc2.
 
 ### Case 2: Specify Layers in Different Devices
 
@@ -157,14 +157,14 @@ fc4=fc_layer(input=fc2, layer_attr=ExtraAttr(device=-1), ...)
 In this case, we assume that there are 4 GPUs in one machine.
 
 - trainer_count=1:
-  - Use GPU 0 to compute layer l2.
-  - Use GPU 1 to compute layer l3.
-  - Use CPU to compute layer l4.
+  - Use GPU 0 to compute layer fc2.
+  - Use GPU 1 to compute layer fc3.
+  - Use CPU to compute layer fc4.
 
 - trainer_count=2:
-  - Use GPU 0 and 1 to compute layer l2.
-  - Use GPU 2 and 3 to compute layer l3.
-  - Use CPU to compute l4 in two threads.
+  - Use GPU 0 and 1 to compute layer fc2.
+  - Use GPU 2 and 3 to compute layer fc3.
+  - Use CPU to compute fc4 in two threads.
 
 - trainer_count=4:
   - It will fail (note, we have assumed that there are 4 GPUs in machine), because argument `allow_only_one_model_on_one_gpu` is true by default.
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..422dc3bd811ae8b31dbdd6fa8637d6e44b29ac76
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -0,0 +1,666 @@
+# Kubernetes on AWS
+
+## Create AWS Account and IAM Account
+
+To use AWS, we need to sign up for an AWS account on Amazon's Web site.
+An AWS account allows us to login to the AWS Console Web interface to
+create IAM users and user groups.  Usually, we create a user group with
+privileges required to run PaddlePaddle, and we create users for
+those who are going to run PaddlePaddle and add these users into the
+group.  IAM users can identify themselves using passwords and tokens,
+where passwords allow users to log in to the AWS Console, and tokens
+make it easy for users to submit and inspect jobs from the command
+line.
+
+To sign up for an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
+To create users and user groups under an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
+
+Please be aware that this tutorial needs the following privileges in
+the user group:
+
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+
+
+At the time of writing this tutorial, we noticed that Chinese AWS users
+might suffer from authentication problems when running this tutorial.
+Our solution is that we create a VM instance with the default Amazon
+AMI and in the same zone as our cluster runs, so we can SSH to this VM
+instance as a tunneling server and control our cluster and jobs from
+it.
+
+
+## PaddlePaddle on AWS
+
+Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
+
+
+### Download kube-aws and kubectl
+
+#### kube-aws
+
+Import the CoreOS Application Signing Public Key:
+
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+
+Validate the key fingerprint:
+
+```
+gpg2 --fingerprint FC8A365E
+```
+The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+
+Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+
+Validate the tarball's GPG signature:
+
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+
+Extract the binary:
+
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+
+Add kube-aws to your path:
+
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+
+
+#### kubectl
+
+Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
+
+Extract the tarball and then prepend the Kubernetes binaries directory to PATH:
+
+```
+export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
+
+```
+
+User credentials and security tokens will be generated later in the user directory, not in `~/.kube/config`. They will be necessary to use the CLI or the HTTP Basic Auth.
+
+
+### Configure AWS Credentials
+
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+
+
+And then configure your AWS account information:
+
+```
+aws configure
+
+```
+
+
+Fill in the required fields (You can get your AWS access key id and AWS secret access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+
+
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secret Access Key: YOUR_SECRET_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+
+```
+
+Test that your credentials work by describing any instances you may already have running on your account:
+
+```
+aws ec2 describe-instances
+```
+
+### Define Cluster Parameters
+
+#### EC2 key pair
+
+The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
+
+After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+
+#### KMS key
+
+Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
+
+You can create a KMS key in the AWS console, or with the aws command line tool:
+
+```
+$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+
+You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+
+And then you need to add several inline policies in your user permission.
+
+kms inline policy:
+
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:xxxxxxxxx:key/*"
+            ]
+        }
+    ]
+}
+```
+cloudformation inline policy:
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+
+
+#### External DNS name
+
+When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+
+#### S3 bucket
+
+You need to create an S3 bucket before startup the Kubernetes cluster.
+
+#### Initialize an asset directory
+
+Create a directory on your local machine to hold the generated assets:
+
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+
+Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
+
+```
+$ kube-aws init \
+--cluster-name=my-cluster-name \
+--external-dns-name=my-cluster-endpoint \
+--region=us-west-1 \
+--availability-zone=us-west-1c \
+--key-name=key-pair-name \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+
+#### Render contents of the asset directory
+
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+
+```
+$ kube-aws render credentials --generate-ca
+```
+
+The next command generates the default set of cluster assets in your asset directory.
+
+```
+$ kube-aws render stack
+```
+
+Here's what the directory structure looks like:
+
+```
+$ tree
+.
+├── cluster.yaml
+├── credentials
+│   ├── admin-key.pem
+│   ├── admin.pem
+│   ├── apiserver-key.pem
+│   ├── apiserver.pem
+│   ├── ca-key.pem
+│   ├── ca.pem
+│   ├── worker-key.pem
+│   ├── worker.pem
+│   ├── etcd-key.pem
+│   ├── etcd.pem
+│   ├── etcd-client-key.pem
+│   └── etcd-client.pem
+├── kubeconfig
+├── stack-template.json
+└── userdata
+    ├── cloud-config-controller
+    └── cloud-config-worker
+```
+
+These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
+
+
+### Kubernetes Cluster Start Up
+
+#### Create the instances defined in the CloudFormation template
+
+Now for the exciting part, creating your cluster:
+
+```
+$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+```
+
+#### Configure DNS
+
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
+
+#### Access the cluster
+
+Once the API server is running, you should see:
+
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS                     AGE
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+```
+
+
+### Setup PaddlePaddle Environment on AWS
+
+Now, we've created a cluster with following network capability:
+
+1. All Kubernetes nodes can communicate with each other.
+
+1. All Docker containers on Kubernetes nodes can communicate with each other.
+
+1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
+
+1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
+
+
+For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+
+
+1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
+
+1. Create the Elastic File System in AWS console, and attach the new VPC with it.
+<center>![](src/create_efs.png)</center>
+
+
+1. Modify the Kubernetes security group under ec2/Security Groups, add additional inbound policy "All TCP TCP 0 - 65535 0.0.0.0/0" for Kubernetes default VPC security group. 
+<center>![](src/add_security_group.png)</center>
+
+
+1. Follow the EC2 mount instruction to mount the disk onto all the Kubernetes nodes, we recommend to mount EFS disk onto ~/efs.
+<center>![](src/efs_mount.png)</center>
+
+
+Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+
+
+
+### Core Concept of PaddlePaddle Training on AWS
+
+Now we've set up a 3-node distributed Kubernetes cluster, and on each node we've attached the EFS volume. In this training demo, we will create three Kubernetes pods and schedule them on the 3 nodes. Each pod contains a PaddlePaddle container. When the container gets created, it will start the pserver and trainer processes, load the training data from the EFS volume, and start the distributed training task.
+
+#### Use Kubernetes Job
+
+We use a Kubernetes job to represent one run of distributed training. After the job finishes, Kubernetes will destroy the job's containers and release all related resources.
+
+We can write a yaml file to describe the Kubernetes job. The file contains a lot of configuration information, for example PaddlePaddle's node number, the `paddle pserver` open port number, the network card info, etc. This information is passed into the container as environment variables for the processes to use.
+
+For each distributed training run, the user first decides on the PaddlePaddle node number, then uploads the pre-divided training data and configuration file onto the EFS volume, and finally creates the Kubernetes job yaml file and submits it to the Kubernetes cluster to start the training job.
+
+#### Create PaddlePaddle Node
+
+After the Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number). Kubernetes will allocate these pods onto the cluster's nodes. A pod represents a PaddlePaddle node. When a pod is successfully allocated onto a physical/virtual machine, Kubernetes will start up the container in the pod, and this container will use the environment variables in the yaml file to start up the `paddle pserver` and `paddle trainer` processes.
+
+
+#### Start up Training
+
+After the container gets started, it starts up the distributed training by using scripts. The `paddle train` process needs to know the other nodes' ip addresses and its own trainer_id. Since PaddlePaddle currently doesn't have the ability to do service discovery, in the start up script each node will use the job pod's name to query all the pod info from the Kubernetes apiserver (the apiserver's endpoint is an environment variable in the container by default).
+
+With the pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod IP, and assign each PaddlePaddle node its index in the sorted list as its trainer_id. The workflow of the start-up script is as follows:
+
+1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
+1. Copy the training data from EFS sharing volume into container.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+
+
+###Start PaddlePaddle Training Demo on AWS
+
+Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
+
+1. Build PaddlePaddle Docker image.
+1. Divide the training data file and upload it onto the EFS sharing volume.
+1. Create the training job yaml file, and start up the job.
+1. Check the result after training.
+
+####Build PaddlePaddle Docker Image
+
+The PaddlePaddle Docker image needs to provide the runtime environment for `paddle pserver` and `paddle train`, so a container using this image should have two main functions:
+
+1. Copy the training data into container.
+1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+
+
+The official `paddledev/paddle:cpu-latest` image already includes the PaddlePaddle binary but lacks the above functionality, so we will create the startup script on top of this image to do the work above. The detailed Dockerfile is as follows:
+
+```
+FROM paddledev/paddle:cpu-latest
+
+MAINTAINER zjsxzong89@gmail.com
+
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash"," -c","/root/start.sh"]
+```
+
+At this point, we copy our `start.sh` and `start_paddle.py` files into the container, and then execute the `start_paddle.py` script to start up the training. All the steps, such as assigning the trainer_id and getting the other nodes' IPs, are implemented in `start_paddle.py`.
+
+`start_paddle.py` will start parsing the parameters.
+
+```
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+
+Then we use the function `getPodList()` to query all the pod information for the job name through the Kubernetes api server. When all the pods are in the running status, we use `getIdMap(podlist)` to get the trainer_id.
+
+```
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+
+In the function `getIdMap(podlist)`, we use podlist to get the IP address of each pod, sort the addresses, and then use each address's index in the sorted list as the trainer_id.
+
+```
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+
+After getting `idMap`, we use function `startPaddle(idMap, train_args_dict)` to generate `paddle pserver` and `paddle train` start up parameters and then start up the processes.
+
+In the function `startPaddle`, the most important work is to generate the `paddle pserver` and `paddle train` start-up parameters. For example, when building the `paddle train` parameters, we read values such as `PADDLE_NIC`, `PADDLE_PORT` and `PADDLE_PORTS_NUM`, and get the `trainer_id` from `idMap`.
+
+```
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
+
+Use `docker build` to build the Docker image:
+
+```
+docker build -t your_repo/paddle:mypaddle .
+```
+
+And then push the built image onto docker registry.
+
+```
+docker push  your_repo/paddle:mypaddle
+```
+
+####Upload Training Data File
+
+Here we will use PaddlePaddle's official recommendation demo as the content for this training. We put the training data files into a directory named after the job, which is located in the EFS sharing volume. The tree structure of the directory looks like:
+
+```
+efs
+└── paddle-cluster-job
+    ├── data
+    │   ├── 0
+    │   │
+    │   ├── 1
+    │   │
+    │   └── 2
+    ├── output
+    └── recommendation
+```
+
+The `paddle-cluster-job` directory is named after the job for this training. This training includes 3 PaddlePaddle nodes; we store the pre-divided data under the `paddle-cluster-job/data` directory, where the directories 0, 1 and 2 correspond to the trainer_id of each of the 3 nodes. The training data is in the recommendation directory, and the training results and logs will be in the output directory.
+
+
+####Create Kubernetes Job
+
+Kubernetes uses a yaml file to describe the job details, and then a command line tool is used to create the job in the Kubernetes cluster.
+
+In yaml file, we describe the Docker image we use for this training, the node number we need to startup, the volume mounting information and all the necessary parameters we need for `paddle pserver` and `paddle train` processes.
+
+The yaml file content is as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/admin/efs
+      containers:
+      - name: trainer
+        image: drinkcode/paddle:k8s-job
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+        ports:
+        - name: jobport
+          hostPort: 30001
+          containerPort: 30001
+      restartPolicy: Never
+
+```
+
+In the yaml file, the metadata's name is the job's name. `parallelism, completions` means this job will simultaneously start up 3 PaddlePaddle nodes, and the job will be finished when there are 3 finished pods. For the data store volume, we declare a volume named jobpath; it mounts /home/admin/efs on the host machine into the container at the path /home/jobpath. So in the container, data written under /home/jobpath is actually stored on the EFS sharing volume.
+
+`env` field represents container's environment variables, we pass the PaddlePaddle parameters into containers by using the `env` field.
+
+`JOB_PATH` represents the sharing volume path, `JOB_NAME` represents the job name, and `TRAIN_CONFIG_DIR` represents the training data file directory. We can use these three parameters to get the file path for this training.
+
+`CONF_PADDLE_NIC` represents `paddle pserver` process's `--nics` parameters, the NIC name.
+
+`CONF_PADDLE_PORT` represents the `paddle pserver` process's `--port` parameter, and `CONF_PADDLE_PORTS_NUM` represents the `--ports_num` parameter.
+
+`CONF_PADDLE_PORTS_NUM_SPARSE` represents the sparse updated port number, `--ports_num_for_sparse` parameter.
+
+`CONF_PADDLE_GRADIENT_NUM` represents the training node number, `--num_gradient_servers` parameter.
+
+After we create the yaml file, we can use Kubernetes command line tool to create the job onto the cluster.
+
+```
+kubectl create -f job.yaml
+```
+
+After we execute the above command, Kubernetes will create 3 pods and then pull the PaddlePaddle image, then start up the containers for training.
+
+
+
+####Check Training Results
+
+During the training, we can see the logs and models on the EFS sharing volume; the output directory contains the training results. (Caution: the node_0, node_1, node_2 directories represent PaddlePaddle nodes and their trainer_id, not Kubernetes nodes.)
+
+```
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+
+We can always check the container training status through logs, for example:
+
+```
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0 
+    --log_period=50 --dot_period=10 --saving_period=1 
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+
+It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2-core, 8 GB EC2 machines (m3.large).
+
+
+###Kubernetes Cluster Tear Down
+
+
+If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stuck in the following steps), and then use the following command:
+
+```
+kube-aws destroy
+```
+It's an async call, so it might take about 5 minutes to tear down the whole cluster.
+
+If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
+
+
+
+## For Experts with Kubernetes and AWS
+
+Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
+
+### Some Presumptions
+
+* Instances run on CoreOS, using the official AMI.
+* Kubernetes nodes use instance storage; no EBS volume gets mounted.  Etcd runs on an additional node.
+* For networking, we use Flannel network at this moment, we will use Calico solution later on.
+* When you create a service with Type=LoadBalancer, Kubernetes will create an ELB, and create a security group for the ELB.
diff --git a/doc/howto/usage/cluster/k8s/k8s_cn.md b/doc/howto/usage/k8s/k8s_cn.md
similarity index 99%
rename from doc/howto/usage/cluster/k8s/k8s_cn.md
rename to doc/howto/usage/k8s/k8s_cn.md
index 2575701053ca12cc3af45682af6cd682a88bb987..ab07cb9cd5b135ddea82b3360720537f1dc5a801 100644
--- a/doc/howto/usage/cluster/k8s/k8s_cn.md
+++ b/doc/howto/usage/k8s/k8s_cn.md
@@ -1,4 +1,4 @@
-# Kubernetes 单机训练
+# Kubernetes单机训练
 
 在这篇文档里，我们介绍如何在 Kubernetes 集群上启动一个单机使用CPU的Paddle训练作业。在下一篇中，我们将介绍如何启动分布式训练作业。
 
diff --git a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md b/doc/howto/usage/k8s/k8s_distributed_cn.md
similarity index 99%
rename from doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
rename to doc/howto/usage/k8s/k8s_distributed_cn.md
index 53d0b4676c6a3a2dc8c58e231756638cc0b67765..b63b8437a0114a0165971933912da83c2dd770a6 100644
--- a/doc/howto/usage/cluster/k8s/k8s_distributed_cn.md
+++ b/doc/howto/usage/k8s/k8s_distributed_cn.md
@@ -1,4 +1,4 @@
-# Kubernetes 分布式训练
+# Kubernetes分布式训练
 
 前一篇文章介绍了如何在Kubernetes集群上启动一个单机PaddlePaddle训练作业 (Job)。在这篇文章里，我们介绍如何在Kubernetes集群上进行分布式PaddlePaddle训练作业。关于PaddlePaddle的分布式训练，文章 [Cluster Training](https://github.com/baidu/Paddle/blob/develop/doc/cluster/opensource/cluster_train.md)介绍了一种通过SSH远程分发任务，进行分布式训练的方法，与此不同的是，本文将介绍在Kubernetes容器管理平台上快速构建PaddlePaddle容器集群，进行分布式训练的方案。
 
@@ -22,7 +22,7 @@
 
 首先，我们需要拥有一个Kubernetes集群，在这个集群中所有node与pod都可以互相通信。关于Kubernetes集群搭建，可以参考[官方文档](http://kubernetes.io/docs/getting-started-guides/kubeadm/)，在以后的文章中我们也会介绍AWS上搭建的方案。本文假设大家能找到几台物理机，并且可以按照官方文档在上面部署Kubernetes。在本文的环境中，Kubernetes集群中所有node都挂载了一个[MFS](http://moosefs.org/)（Moose filesystem，一种分布式文件系统）共享目录，我们通过这个目录来存放训练文件与最终输出的模型。关于MFS的安装部署，可以参考[MooseFS documentation](https://moosefs.com/documentation.html)。在训练之前，用户将配置与训练数据切分好放在MFS目录中，训练时，程序从此目录拷贝文件到容器内进行训练，将结果保存到此目录里。整体的结构图如下：
 
-![paddle on kubernetes结构图](k8s-paddle-arch.png)
+![paddle on kubernetes结构图](src/k8s-paddle-arch.png)
 
 上图描述了一个3节点的分布式训练场景，Kubernetes集群的每个node上都挂载了一个MFS目录，这个目录可以通过volume的形式挂载到容器中。Kubernetes为这次训练创建了3个pod并且调度到了3个node上运行，每个pod包含一个PaddlePaddle容器。在容器创建后，会启动pserver与trainer进程，读取volume中的数据进行这次分布式训练。
 
diff --git a/doc/howto/usage/k8s/k8s_en.md b/doc/howto/usage/k8s/k8s_en.md
new file mode 100644
index 0000000000000000000000000000000000000000..0c3ab05b708e7a924577c26496b8c55126e76c62
--- /dev/null
+++ b/doc/howto/usage/k8s/k8s_en.md
@@ -0,0 +1,201 @@
+# Paddle On Kubernetes
+
+>In this article, we will introduce how to run a Paddle training job on a single CPU machine using Kubernetes. In the next article, we will introduce how to run a Paddle training job on a distributed cluster.
+
+## Build Docker Image
+
+In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+
+Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
+And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+  
+### Run Docker Container
+
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+
+### Download Training Data
+
+Go into the `/root/paddle/demo/quick_start/data` directory and use `get_data.sh` to download the training data.
+Then go into the `/root/paddle/demo/quick_start` directory and use `preprocess.sh` to pre-process the training data.
+
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+
+```
+
+### Modify Startup Script
+
+After downloading the data, modify the `/root/paddle/demo/quick_start/train.sh` file so that its contents are as follows (one more `cd` command is added):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+
+### Commit Docker Image
+
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+
+## Use Kubernetes For Training
+
+>We will use a Kubernetes job for the training process. The following steps show how to do the training with Kubernetes.
+
+### Create Yaml Files
+
+The output inside the container will be destroyed when the job finishes (the container stops running), so we need to mount a volume from the local disk when creating the container in order to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
+
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+
+### Start Paddle Job
+
+Use the above yaml file to start the Kubernetes job.
+
+```
+$ kubectl  create -f paddle.yaml
+```
+
+Get the detailed status of the job:
+
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+
+### Get Training Result
+
+We can use kubectl command to take a look at the status of related pod.
+
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+
+We can also ssh to Kubernetes node to take a look at the training result.
+
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
diff --git a/doc/howto/usage/cluster/k8s/Dockerfile b/doc/howto/usage/k8s/src/Dockerfile
similarity index 100%
rename from doc/howto/usage/cluster/k8s/Dockerfile
rename to doc/howto/usage/k8s/src/Dockerfile
diff --git a/doc/howto/usage/k8s/src/add_security_group.png b/doc/howto/usage/k8s/src/add_security_group.png
new file mode 100644
index 0000000000000000000000000000000000000000..50eed4c6573a18d6ae0f9df9bd6a3cae05493e3c
Binary files /dev/null and b/doc/howto/usage/k8s/src/add_security_group.png differ
diff --git a/doc/howto/usage/k8s/src/create_efs.png b/doc/howto/usage/k8s/src/create_efs.png
new file mode 100644
index 0000000000000000000000000000000000000000..f4d448d1518e11a11d535efb9c3a78b56cc13149
Binary files /dev/null and b/doc/howto/usage/k8s/src/create_efs.png differ
diff --git a/doc/howto/usage/k8s/src/efs_mount.png b/doc/howto/usage/k8s/src/efs_mount.png
new file mode 100644
index 0000000000000000000000000000000000000000..0f9e3cab98445707e5e9baa18ddabe15cdf04576
Binary files /dev/null and b/doc/howto/usage/k8s/src/efs_mount.png differ
diff --git a/doc/howto/usage/cluster/k8s/job.yaml b/doc/howto/usage/k8s/src/job.yaml
similarity index 100%
rename from doc/howto/usage/cluster/k8s/job.yaml
rename to doc/howto/usage/k8s/src/job.yaml
diff --git a/doc/howto/usage/cluster/k8s/k8s-paddle-arch.png b/doc/howto/usage/k8s/src/k8s-paddle-arch.png
similarity index 100%
rename from doc/howto/usage/cluster/k8s/k8s-paddle-arch.png
rename to doc/howto/usage/k8s/src/k8s-paddle-arch.png
diff --git a/doc/howto/usage/k8s/src/managed_policy.png b/doc/howto/usage/k8s/src/managed_policy.png
new file mode 100644
index 0000000000000000000000000000000000000000..c7ecda555b81d7750e9292a9ab72d2f517f76a2a
Binary files /dev/null and b/doc/howto/usage/k8s/src/managed_policy.png differ
diff --git a/doc/howto/usage/cluster/k8s/start.sh b/doc/howto/usage/k8s/src/start.sh
similarity index 100%
rename from doc/howto/usage/cluster/k8s/start.sh
rename to doc/howto/usage/k8s/src/start.sh
diff --git a/doc/howto/usage/cluster/k8s/start_paddle.py b/doc/howto/usage/k8s/src/start_paddle.py
similarity index 100%
rename from doc/howto/usage/cluster/k8s/start_paddle.py
rename to doc/howto/usage/k8s/src/start_paddle.py
diff --git a/doc/tutorials/embedding_model/index_cn.md b/doc/tutorials/embedding_model/index_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..fe800308d8d7a03619ec8e13fd8dc4aa7a8ed8be
--- /dev/null
+++ b/doc/tutorials/embedding_model/index_cn.md
@@ -0,0 +1,138 @@
+# 中文词向量模型的使用 #
+----------
+本文档介绍如何在PaddlePaddle平台上,使用预训练的标准格式词向量模型。
+
+在此感谢 @lipeng 提出的代码需求，并给出的相关模型格式的定义。
+
+## 介绍 ###
+### 中文字典 ###
+我们的字典使用内部的分词工具对百度知道和百度百科的语料进行分词后产生。分词风格如下： "《红楼梦》"将被分为 "《"，"红楼梦"，"》"，和 "《红楼梦》"。字典采用UTF8编码，输出有2列：词本身和词频。字典共包含 3206325个词和3个特殊标记：
+  - `<s>`: 分词序列的开始
+  - `<e>`: 分词序列的结束
+  - `<unk>`: 未知词
+
+### 中文词向量的预训练模型 ###
+遵循文章 [A Neural Probabilistic Language Model](http://www.jmlr.org/papers/volume3/bengio03a/bengio03a.pdf)中介绍的方法，模型采用 n-gram 语言模型，结构如下图：6元上下文作为输入层->全连接层->softmax层 。对应于字典，我们预训练得到4种不同维度的词向量，分别为：32维、64维、128维和256维。
+<center>![](./neural-n-gram-model.png)</center>
+<center>Figure 1. neural-n-gram-model</center>
+
+### 下载和数据抽取 ###
+运行以下的命令下载和获取我们的字典和预训练模型：
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    ./pre_DictAndModel.sh
+
+## 中文短语改写的例子 ##
+以下示范如何使用预训练的中文字典和词向量进行短语改写。
+
+### 数据的准备和预处理 ###
+首先，运行以下的命令下载数据集。该数据集（utf8编码）包含20个训练样例，5个测试样例和2个生成式样例。
+
+    cd $PADDLE_ROOT/demo/seqToseq/data
+    ./paraphrase_data.sh
+
+第二步，将数据处理成规范格式，在训练数集上训练生成词向量字典（数据将保存在 `$PADDLE_SOURCE_ROOT/demo/seqToseq/data/pre-paraphrase`）:
+
+    cd $PADDLE_ROOT/demo/seqToseq/
+    python preprocess.py -i data/paraphrase [--mergeDict]
+
+- 其中，如果使用`--mergeDict`选项，源语言短语和目标语言短语的字典将被合并（源语言和目标语言共享相同的编码字典）。本实例中，源语言和目标语言都是相同的语言，因此可以使用该选项。
+
+
+### 使用用户指定的词向量字典 ###
+使用如下命令，从预训练模型中，根据用户指定的字典，抽取对应的词向量构成新的词表:
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python extract_para.py --preModel PREMODEL --preDict PREDICT --usrModel USRMODEL --usrDict USRDICT -d DIM
+
+- `--preModel PREMODEL`: 预训练词向量字典模型的路径
+- `--preDict PREDICT`:  预训练模型使用的字典的路径
+- `--usrModel USRMODEL`: 抽取出的新词表的保存路径
+- `--usrDict USRDICT`: 用户指定新的字典的路径，用于构成新的词表
+- `-d DIM`: 参数（词向量）的维度
+
+此处，你也可以简单的运行以下的命令：
+
+    cd $PADDLE_ROOT/demo/seqToseq/data/
+    ./paraphrase_model.sh
+
+运行成功以后，你将会看到以下的模型结构：
+
+    paraphrase_model
+    |--- _source_language_embedding
+    |--- _target_language_embedding
+
+### 在PaddlePaddle平台训练模型 ###
+首先，配置模型文件，配置如下（可以参考保存在 `demo/seqToseq/paraphrase/train.conf`的配置）:
+
+    from seqToseq_net import *
+    is_generating = False
+
+    ################## Data Definition #####################
+    train_conf = seq_to_seq_data(data_dir = "./data/pre-paraphrase",
+                                 job_mode = job_mode)
+
+    ############## Algorithm Configuration ##################
+    settings(
+          learning_method = AdamOptimizer(),
+          batch_size = 50,
+          learning_rate = 5e-4)
+
+    ################# Network configure #####################
+    gru_encoder_decoder(train_conf, is_generating, word_vector_dim = 32)
+
+这个配置与`demo/seqToseq/translation/train.conf` 基本相同
+
+然后，使用以下命令进行模型训练:
+
+    cd $PADDLE_SOURCE_ROOT/demo/seqToseq/paraphrase
+    ./train.sh
+
+其中，`train.sh` 与`demo/seqToseq/translation/train.sh` 基本相同，只有2个配置不一样:
+
+- `--init_model_path`: 初始化模型的路径配置为`data/paraphrase_model`
+- `--load_missing_parameter_strategy`：如果参数模型文件缺失，除词向量模型外的参数将使用正态分布随机初始化
+
+如果用户想要了解详细的数据集的格式、模型的结构和训练过程，请查看 [Text generation Tutorial](../text_generation/index_cn.md).
+
+## 可选功能 ##
+###  观测词向量
+PaddlePaddle 平台为想观测词向量的用户提供了将二进制词向量模型转换为文本模型的功能:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --b2t -i INPUT -o OUTPUT -d DIM
+
+- `-i INPUT`: 输入的（二进制）词向量模型名称
+- `-o OUTPUT`: 输出的文本模型名称
+- `-d DIM`: （词向量）参数维度
+
+运行完以上命令，用户可以在输出的文本模型中看到:
+
+    0,4,32156096
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+
+- 其中，第一行是`PaddlePaddle` 输出文件的格式说明，包含3个属性：
+  - `PaddlePaddle`的版本号，本例中为0
+  - 浮点数占用的字节数，本例中为4
+  - 总计的参数个数，本例中为32,156,096
+- 其余行是（词向量）参数行（假设词向量维度为32）
+  - 每行打印32个参数以','分隔
+  - 共有32,156,096/32 = 1,004,877行，也就是说，模型共包含1,004,877个被向量化的词
+
+### 词向量模型的修正
+`PaddlePaddle` 为想修正词向量模型的用户提供了将文本词向量模型转换为二进制模型的命令:
+
+    cd $PADDLE_ROOT/demo/model_zoo/embedding
+    python paraconvert.py --t2b -i INPUT -o OUTPUT
+
+- `-i INPUT`: 输入的文本词向量模型名称
+- `-o OUTPUT`: 输出的二进制词向量模型名称
+
+请注意，输入的文本格式如下:
+
+    -0.7845433,1.1937413,-0.1704215,0.4154715,0.9566584,-0.5558153,-0.2503305, ......
+    0.0000909,0.0009465,-0.0008813,-0.0008428,0.0007879,0.0000183,0.0001984, ......
+    ......
+- 输入文本中没有头部（格式说明）行
+- （输入文本）每行存储一个词，以逗号','分隔
diff --git a/doc/tutorials/gan/gan.png b/doc/tutorials/gan/gan.png
index 001ed6cc19e8911f9b10f63211c9658160b3a06e..0eafd7cb49b545f412f8e775804bcd0b22c42454 100644
Binary files a/doc/tutorials/gan/gan.png and b/doc/tutorials/gan/gan.png differ
diff --git a/doc/tutorials/gan/index_en.md b/doc/tutorials/gan/index_en.md
index 99c8d730117a469c89abb218eeacf66103c0cbed..ac9ed37b2264778869f92c0910b1cb946fb4427f 100644
--- a/doc/tutorials/gan/index_en.md
+++ b/doc/tutorials/gan/index_en.md
@@ -4,9 +4,7 @@ This demo implements GAN training described in the original [GAN paper](https://
 
 The high-level structure of GAN is shown in Figure. 1 below. It is composed of two major parts: a generator and a discriminator, both of which are based on neural networks. The generator takes in some kind of noise with a known distribution and transforms it into an image. The discriminator takes in an image and determines whether it is artificially generated by the generator or a real image. So the generator and the discriminator are in a competitive game in which generator is trying to generate image to look as real as possible to fool the discriminator, while the discriminator is trying to distinguish between real and fake images. 
 
-<p align="center">
-    <img src="./gan.png" width="500" height="300"> 
-</p>
+<center>![](./gan.png)</center>
 <p align="center">
     Figure 1. GAN-Model-Structure
     <a href="https://ishmaelbelghazi.github.io/ALI/">figure credit</a>
@@ -111,9 +109,7 @@ $python gan_trainer.py -d uniform --useGpu 1
 ```
 The generated samples can be found in ./uniform_samples/ and one example is shown below as Figure 2. One can see that it roughly recovers the 2D uniform distribution. 
 
-<p align="center">
-    <img src="./uniform_sample.png" width="300" height="300"> 
-</p>
+<center>![](./uniform_sample.png)</center>
 <p align="center">
     Figure 2. Uniform Sample
 </p>
@@ -135,9 +131,7 @@ To train the GAN model on mnist data, one can use the following command:
 $python gan_trainer.py -d mnist --useGpu 1
 ```
 The generated sample images can be found at ./mnist_samples/ and one example is shown below as Figure 3. 
-<p align="center">
-    <img src="./mnist_sample.png" width="300" height="300"> 
-</p>
+<center>![](./mnist_sample.png)</center>
 <p align="center">
     Figure 3. MNIST Sample
 </p>
diff --git a/doc/tutorials/gan/uniform_sample.png b/doc/tutorials/gan/uniform_sample.png
index 4a96c45cae82673f5a1df986f2643a8026da7937..e716c48e782019a757bed0cb443f2ed97386cbe2 100644
Binary files a/doc/tutorials/gan/uniform_sample.png and b/doc/tutorials/gan/uniform_sample.png differ
diff --git a/doc/tutorials/index_cn.md b/doc/tutorials/index_cn.md
index 97014d537655d21871295699381c5dd2106d0b56..6a27004d58d24cc466d930322be8cdbb2f434c74 100644
--- a/doc/tutorials/index_cn.md
+++ b/doc/tutorials/index_cn.md
@@ -2,6 +2,7 @@
 
 * [快速入门](quick_start/index_cn.rst)
 * [个性化推荐](rec/ml_regression_cn.rst)
+* [图像分类](image_classification/index_cn.md)
 * [情感分析](sentiment_analysis/index_cn.md)
 * [语义角色标注](semantic_role_labeling/index_cn.md)
 * [机器翻译](text_generation/index_cn.md)
@@ -9,3 +10,4 @@
 ## 常用模型
 
 * [ResNet模型](imagenet_model/resnet_model_cn.md)
+* [词向量模型](embedding_model/index_cn.md)
diff --git a/doc/tutorials/index_en.md b/doc/tutorials/index_en.md
index cce9d3a176a5e5c87e97c16362ec8a202e8eb80a..77331a703b6f0fdf92921ebcc476325b7327e976 100644
--- a/doc/tutorials/index_en.md
+++ b/doc/tutorials/index_en.md
@@ -7,6 +7,7 @@ There are several examples and demos here.
 * [Sentiment Analysis](sentiment_analysis/index_en.md)
 * [Semantic Role Labeling](semantic_role_labeling/index_en.md)
 * [Text Generation](text_generation/index_en.md)
+* [Image Auto-Generation](gan/index_en.md)
 
 ## Model Zoo
 * [ImageNet: ResNet](imagenet_model/resnet_model_en.md)
diff --git a/paddle/CMakeLists.txt b/paddle/CMakeLists.txt
index 2daea052b01adc87f42e15cdcfec92301b7edae9..503024cff338dac42a6a8a32463472dc6b6451d9 100644
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
@@ -1,6 +1,7 @@
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
+add_subdirectory(testing)
 add_subdirectory(math)
 add_subdirectory(parameter)
 add_subdirectory(gserver)
diff --git a/paddle/api/Paddle.swig b/paddle/api/Paddle.swig
index 3365927f9b59936244230bed439808fa7ead2c61..068ba286c07d8854a1a7c7042224a679b50b4957 100644
--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@@ -178,6 +178,7 @@ namespace std {
 %newobject ParameterOptimizer::create;
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
+%newobject ParameterUpdater::createRemoteUpdater;
 
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide
diff --git a/paddle/api/PaddleAPI.h b/paddle/api/PaddleAPI.h
index 09c891871a5ca8571216d211203fe8643fc3a63f..81c9eed0bccd5ad63f524cdb011fc73cd568f465 100644
--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -803,6 +803,8 @@ private:
 
 public:
   static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount);
   ~ParameterUpdater();
 
   /**
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 7cd8ed7e3907489a60f37090df6f51492def2612..75b0ae7cb6cc8c9ad0f8fe69963b7439a44bf55e 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,15 +15,25 @@ limitations under the License. */
 #include "PaddleAPI.h"
 
 #include "PaddleAPIPrivate.h"
+#include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
 
 ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
 
 ParameterUpdater *ParameterUpdater::createLocalUpdater(
     OptimizationConfig *config) {
-  auto param = new ParameterUpdater();
-  param->m->updater.reset(new paddle::SgdThreadUpdater(config->m->getConfig()));
-  return param;
+  auto updater = new ParameterUpdater();  // returned to the caller as raw ptr
+  updater->m->updater.reset(
+      new paddle::SgdThreadUpdater(config->m->getConfig()));
+  return updater;
+}
+
+ParameterUpdater *ParameterUpdater::createRemoteUpdater(
+    OptimizationConfig *config, int passCount) {
+  auto updater = new ParameterUpdater();  // returned to the caller as raw ptr
+  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr));  // TODO confirm 3rd arg
+  return updater;
 }
 
 ParameterUpdater::~ParameterUpdater() { delete m; }
diff --git a/paddle/cuda/include/hl_sequence.h b/paddle/cuda/include/hl_sequence.h
index 9bcd25b0623e569052e08c0befc8e09f937fa4bd..9f9d8f972e3a4c62e5caedcf85054be5681b96c1 100644
--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
 extern void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
 
-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weightData      padding data.
- * @param[out]  output          output sequence.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- * @param[in]   isPadding       trainable padding.
- *
- */
-extern void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding);
-
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- *
- */
-extern void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart);
-
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  weightGrad      weight gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   weightDim       input sequence dimension.
- * @param[in]   totalPad        number of extra timesteps.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- *
- */
-extern void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad);
-
 /**
  * @brief   Memory copy from sequence to batch.
  *
diff --git a/paddle/cuda/include/stub/hl_sequence_stub.h b/paddle/cuda/include/stub/hl_sequence_stub.h
index d6b07556f8958a62bd47f0b47b75bbebafeb58d3..05e51bce9e1df6fc6ef1cad891b44a9172da185d 100644
--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
 inline void hl_max_sequence_backward(
     real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
 
-inline void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding) {}
-
-inline void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {}
-
-inline void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {}
-
 inline void hl_sequence2batch_copy(real* batch,
                                    real* sequence,
                                    const int* batchIndex,
diff --git a/paddle/cuda/src/hl_cuda_sequence.cu b/paddle/cuda/src/hl_cuda_sequence.cu
index 4e33ac443c1f78b7fa50a15784875cbadfcf7497..ba823de2720336851bf9c49d8162360af93e8601 100644
--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
   CHECK_SYNC("hl_max_sequence_backward failed");
 }
 
-template <bool padding>
-__global__ void KeContextProjectionForward(real* input,
-                                           const int* sequence,
-                                           real* weightData,
-                                           real* output,
-                                           int inputDim,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  output += seqStart * inputDim * contextLength;
-  input += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        // i + contextStart;
-        if ((i + contextStart) < 0) {
-          if (padding) {
-            value = weightData[i * inputDim + idx];
-          } else {
-            continue;
-          }
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          if (padding) {
-            value =
-              weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
-                         inputDim + idx];
-          } else {
-            continue;
-          }
-        } else {
-          value = input[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          output + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          output_r[idx] += value;
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_forward(real* input,
-                                   const int* sequence,
-                                   real* weightData,
-                                   real* output,
-                                   int numSequences,
-                                   int inputDim,
-                                   int contextLength,
-                                   int contextStart,
-                                   int beginPad,
-                                   bool isPadding) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK(!isPadding || weightData);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-
-  if (isPadding) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  } else  {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  }
-  CHECK_SYNC("hl_context_projection_forward failed");
-}
-
-__global__ void KeContextProjectionBackwardData(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-
-  int instances = seqEnd - seqStart + contextLength - 1;
-  outputGrad += seqStart * inputDim * contextLength;
-  inputGrad += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        if ((i + contextStart) < 0) {
-          continue;
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          continue;
-        } else {
-          // value = 0;
-          value = inputGrad[(i + contextStart) * inputDim + idx];
-        }
-
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          outputGrad + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          value += output_r[idx];
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-        inputGrad[(i + contextStart) * inputDim + idx] = value;
-      }
-    }
-    idx += blockSize;
-  }
-}
-
-void hl_context_projection_backward_data(real* outputGrad,
-                                         const int* sequence,
-                                         real* inputGrad,
-                                         int numSequences,
-                                         int inputDim,
-                                         int contextLength,
-                                         int contextStart) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(inputGrad);
-
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
-  CHECK_SYNC("hl_context_projection_backward_data failed");
-}
-
-template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {
-  __shared__ real sum_s[THREADS_Y][THREADS_X];
-  int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int padId = blockIdx.x / padOfBlock;
-  int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
-  int instanceId;
-  real value = 0;
-  real* output_r;
-
-  sum_s[idy][idx] = 0.0f;
-  if (weightIdx < weightDim) {
-    for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
-      int seqStart = sequence[seqId];
-      int seqEnd = sequence[seqId+1];
-      output_r = outputGrad + seqStart * weightDim * contextLength;
-
-      if (contextStart < 0) {
-        if (padId + contextStart < 0) {
-          instanceId = padId;
-        } else {
-          // beginPad > 0;
-          instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
-        }
-      } else {
-        if (padId + (seqEnd - seqStart) < contextStart) {
-          continue;
-        } else {
-          // beginPad == 0;
-          instanceId = padId + (seqEnd - seqStart) - contextStart;
-        }
-      }
-
-      int outx = (instanceId - contextLength) < 0 ?
-                 instanceId : (contextLength - 1);
-      int outy = (instanceId - contextLength) < 0 ?
-                 0 : (instanceId - (contextLength - 1));
-      output_r += outy * weightDim * contextLength + outx * weightDim;
-      for (int j = outy; j < seqEnd - seqStart; j++) {
-        value += output_r[weightIdx];
-        if (j - outy == outx) break;
-        output_r += (contextLength - 1) * weightDim;
-      }
-    }
-    sum_s[idy][idx] = value;
-  }
-  __syncthreads();
-
-  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
-    if (idy < stride) {
-      sum_s[idy][idx] += sum_s[idy + stride][idx];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-
-  if (weightIdx < weightDim) {
-    if (idy == 0) {
-      weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
-    }
-  }
-}
-
-void hl_context_projection_backward_weight(real* outputGrad,
-                                           const int* sequence,
-                                           real* weightGrad,
-                                           int numSequences,
-                                           int weightDim,
-                                           int totalPad,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(weightGrad);
-
-  int threadsX = 32;
-  int threadsY = 32;
-  int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
-  dim3 threads(threadsX, threadsY);
-  dim3 grid(blocksX, 1);
-
-  KeContextProjectionBackwardWeight<32, 32>
-    <<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, weightGrad, numSequences, weightDim,
-     contextLength, contextStart, beginPad);
-  CHECK_SYNC("hl_context_projection_backward_weight failed");
-}
-
 template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                 real* table,
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 1de887b7dda611c400d86778a66e311d7934655d..cfa45e117c1d45e36ed0f2b4c11b4806b3730127 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -1,11 +1,11 @@
-file(GLOB h_files . *_op.h)
-file(GLOB cpp_files . *_op.cpp)
+file(GLOB h_files . *Op.h)
+file(GLOB cpp_files . *Op.cpp)
 
 list(APPEND h_files Function.h)
 list(APPEND cpp_files Function.cpp)
 
 if(WITH_GPU)
-    file(GLOB cu_files . *_op_gpu.cu)
+    file(GLOB cu_files . *OpGpu.cu)
     cuda_compile(cu_objs ${cu_files})
 endif()
 
@@ -16,10 +16,15 @@ add_library(paddle_test_main STATIC TestMain.cpp)
 add_dependencies(paddle_test_main ${external_project_dependencies})
 
 if(WITH_GPU)
+if(WITH_TESTING)
     # TODO:
-    # file(GLOB test_files . *_op_test.cpp)
+    # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    add_simple_unittest(cross_map_normal_op_test)
+    add_simple_unittest(CrossMapNormalOpTest)
+    add_unittest(ContextProjectionOpTest
+        ContextProjectionOpTest.cpp
+        ../gserver/tests/TestUtil.cpp)
+endif()
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..bd367a859e10c0522206cd0215970922905905ed
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -0,0 +1,373 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "ContextProjectionOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+
+namespace paddle {
+
+template <>  // CPU specialization of the context-projection forward pass
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
+                                               const CpuMatrix* input_mat,
+                                               const CpuMatrix* weight_mat,
+                                               const CpuIVector& seq_vec,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  const int* starts = seq_vec.getData();  // sequence boundaries (row indices)
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  auto w_mat = const_cast<CpuMatrix*>(weight_mat);  // may be null
+  auto in_mat = const_cast<CpuMatrix*>(input_mat);
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {  // j-th context position
+      int begin = starts[i] + context_start + j;  // source rows for position j
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];  // destination rows in out_mat
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {  // source window starts before sequence i
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
+        if (w_mat) {  // padding from weight row j onward
+          MatrixPtr sub = w_mat->subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_begin = starts[i] + pad_size;  // skip the padded leading rows
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {  // source window runs past sequence i
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (w_mat) {  // padding from weight rows at/after begin_pad
+          MatrixPtr sub = w_mat->subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_end = starts[i + 1] - pad_size;  // skip the padded trailing rows
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;  // no input rows left for this position
+      MatrixPtr src = in_mat->subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * in_mat->getWidth());  // column offset j*width
+    }
+  }
+}
+
+/** Checks the argument shapes, then runs ContextProjectionForward<Device>.
+ * \param inputs[0] input value, 2-D; same width as inputs[1].
+ * \param inputs[1] input weight, 2-D; its data pointer may be null.
+ * \param inputs[2] input sequence, 1-D; its data is read as int.
+ * \param outputs[0] output value, 2-D; width = input width * context_length.
+ */
+template <DeviceType Device>
+class ContextProjectionForwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(3, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());  // this function takes no in-out arguments
+
+    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 2);
+    CHECK_EQ(inputs[2].dims_.size(), 1);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+
+    auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    const auto w_mat =
+        !inputs[1].getData()
+            ? nullptr  // weight has no data: pass a null matrix
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
+
+    ContextProjectionForward<Device>(out_mat.get(),
+                                     in_mat.get(),
+                                     w_mat.get(),
+                                     seq_vec,
+                                     context_length_,
+                                     context_start_,
+                                     begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+};
+
+template <>  // CPU specialization of the context-projection backward pass
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
+                                                CpuMatrix* in_grad_mat,
+                                                CpuMatrix* w_grad_mat,
+                                                const CpuIVector& seq_vec,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {  // unused
+  CHECK(out_grad_mat);  // the two other grad matrices may each be null
+  size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
+                                 : w_grad_mat ? w_grad_mat->getWidth() : 0;
+  const int* starts = seq_vec.getData();  // sequence boundaries (row indices)
+  size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {  // j-th context position
+      int begin = starts[i] + context_start + j;  // source rows for position j
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];  // matching rows in out_grad_mat
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {  // source window starts before sequence i
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {  // weight-grad rows from j onward
+          MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_begin = starts[i] + pad_size;  // skip the padded leading rows
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {  // source window runs past sequence i
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {  // weight-grad rows at/after begin_pad
+          MatrixPtr mat =
+              out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat->subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_end = starts[i + 1] - pad_size;  // skip the padded trailing rows
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;  // no input rows left for this position
+      if (!in_grad_mat) continue;  // no input grad to update
+      MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
+      MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      src->addAtOffset(*dst, j * input_dim);  // receiver: in_grad_mat slice
+    }
+  }
+}
+
+/** Checks the argument shapes, then runs ContextProjectionBackward<Device>.
+ * \param inputs[0] input grad, 2-D; its data pointer may be null.
+ * \param inputs[1] weight grad, 2-D; its data pointer may be null.
+ * \param inputs[2] input sequence, 1-D; its data is read as int.
+ * \param outputs[0] output grad, 2-D; width = input width * context_length.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    is_padding_ = config.get<bool>("is_padding");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(3, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());  // this function takes no in-out arguments
+
+    CHECK(outputs[0].getData() && inputs[2].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 2);
+    CHECK_EQ(inputs[2].dims_.size(), 1);
+
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto in_grad_mat =
+        !inputs[0].getData()
+            ? nullptr  // no input-grad data: pass a null matrix
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    auto w_grad_mat =
+        !inputs[1].getData()
+            ? nullptr  // no weight-grad data: pass a null matrix
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
+
+    ContextProjectionBackward<Device>(out_grad_mat.get(),
+                                      in_grad_mat ? in_grad_mat.get() : nullptr,
+                                      w_grad_mat ? w_grad_mat.get() : nullptr,
+                                      seq_vec,
+                                      context_length_,
+                                      context_start_,
+                                      begin_pad_,
+                                      is_padding_,
+                                      total_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  bool is_padding_;
+  size_t total_pad_;
+};
+
+/** Checks argument shapes, then runs ContextProjectionBackwardData<Device>.
+ * \param inputs[0] input grad, 2-D; its data pointer must be non-null.
+ * \param inputs[1] input sequence, 1-D; its data is read as int.
+ * \param outputs[0] output grad, 2-D; width = input width * context_length.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardDataFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());  // this function takes no in-out arguments
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+
+    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
+                                          in_grad_mat.get(),
+                                          seq_vec,
+                                          context_length_,
+                                          context_start_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+};
+
+/**
+ * \param inputs[0]  gradient buffer of the padding weight.
+ * \param inputs[1]  sequence start positions.
+ * \param outputs[0] gradient buffer of the projection output.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardWeightFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    using Matrix = typename MatrixT<Device>::type;
+    using Sequence = typename SequenceT<Device>::type;
+
+    CHECK_EQ(2, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());
+
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+
+    const auto grad_out = std::make_shared<Matrix>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto grad_w = std::make_shared<Matrix>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    Sequence seq(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+
+    ContextProjectionBackwardWeight<Device>(
+        grad_out.get(), grad_w.get(), seq,
+        context_length_, context_start_,
+        total_pad_, begin_pad_);
+  }
+
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  size_t total_pad_;
+};
+
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    CPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    CPU,
+                    ContextProjectionBackwardFunc);
+#ifndef PADDLE_ONLY_CPU  // GPU registrations are left out of CPU-only builds
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    GPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    GPU,
+                    ContextProjectionBackwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardData,  // GPU only in this file
+                    GPU,
+                    ContextProjectionBackwardDataFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,  // GPU only in this file
+                    GPU,
+                    ContextProjectionBackwardWeightFunc);
+#endif  // PADDLE_ONLY_CPU
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
new file mode 100644
index 0000000000000000000000000000000000000000..93eb050fde35f474750f3c2efa72b7471f654b75
--- /dev/null
+++ b/paddle/function/ContextProjectionOp.h
@@ -0,0 +1,85 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "Function.h"
+
+namespace paddle {
+
+/**
+ * \brief   Context Projection Forward.
+ *
+ * \param[in,out] output          output data (the GPU kernel adds into it).
+ * \param[in]   input             input data.
+ * \param[in]   weight            padding weight rows; may be null.
+ * \param[in]   sequence          sequence start positions.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         beginning pad position.
+ * \note    With a null weight the GPU specialization applies no padding.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionForward(typename MatrixT<Device>::type* output,
+                              const typename MatrixT<Device>::type* input,
+                              const typename MatrixT<Device>::type* weight,
+                              const typename SequenceT<Device>::type& sequence,
+                              size_t context_length,
+                              int context_start,
+                              size_t begin_pad);
+
+/**
+ * \brief   Context Projection Backward.
+ *
+ * \param[in]     out_grad        output gradient.
+ * \param[in,out] in_grad         input gradient; may be null to skip it.
+ * \param[in,out] w_grad          padding weight gradient; may be null.
+ * \param[in]     seq_vec         sequence start positions.
+ * \param[in]     context_length  consecutive rows for concatenation.
+ * \param[in]     context_start   context start position.
+ * \param[in]     begin_pad       beginning pad position.
+ * \param[in]     is_padding      padding flag; gates the w_grad update on GPU.
+ * \param[in]     total_pad       total number of pad rows.
+ */
+template <DeviceType Device>
+void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
+                               typename MatrixT<Device>::type* in_grad,
+                               typename MatrixT<Device>::type* w_grad,
+                               const typename SequenceT<Device>::type& seq_vec,
+                               size_t context_length,
+                               int context_start,
+                               size_t begin_pad,
+                               bool is_padding,
+                               size_t total_pad);
+
+template <DeviceType Device>  // input-gradient half of the backward pass
+void ContextProjectionBackwardData(
+    typename MatrixT<Device>::type* out_grad,          // output gradient
+    typename MatrixT<Device>::type* in_grad,           // input gradient
+    const typename SequenceT<Device>::type& sequence,  // sequence starts
+    size_t context_length,
+    int context_start);
+
+template <DeviceType Device>  // padding-weight half of the backward pass
+void ContextProjectionBackwardWeight(
+    typename MatrixT<Device>::type* out_grad,         // output gradient
+    typename MatrixT<Device>::type* w_grad,           // weight gradient
+    const typename SequenceT<Device>::type& seq_vec,  // sequence starts
+    size_t context_length,
+    int context_start,
+    size_t total_pad,  // note: total_pad comes before begin_pad here
+    size_t begin_pad);
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
new file mode 100644
index 0000000000000000000000000000000000000000..1ec7058f96c8200728e5add051d5fa6a77a97e36
--- /dev/null
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -0,0 +1,401 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "hl_base.h"
+#include "ContextProjectionOp.h"
+
+namespace paddle {
+
+template <bool padding>  // padding: take out-of-sequence rows from weight
+__global__ void KeContextProjectionForward(const real* input,
+                                           const int* sequence,
+                                           const real* weight,
+                                           real* output,
+                                           int input_dim,
+                                           int context_length,
+                                           int context_start,
+                                           int begin_pad) {
+  int idx = threadIdx.x;  // column handled by this thread (strided below)
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;  // each block processes one sequence
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+  // Offsets below move input/output to the first row of this sequence.
+  int instances = seq_end - seq_start + context_length - 1;
+  output += seq_start * input_dim * context_length;
+  input += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {  // tiles of columns
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        // i + context_start is the source row, relative to the sequence start.
+        if ((i + context_start) < 0) {  // before the sequence
+          if (padding) {
+            value = weight[i * input_dim + idx];
+          } else {
+            continue;
+          }
+        } else if ((i + context_start) >= (seq_end - seq_start)) {  // past end
+          if (padding) {
+            value =
+              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
+                         input_dim + idx];
+          } else {
+            continue;
+          }
+        } else {
+          value = input[(i + context_start) * input_dim + idx];
+        }
+        // (outy, outx): first output row / context slot that receives value.
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          output + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          output_r[idx] += value;  // accumulates; output is not cleared here
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;  // row + 1, slot - 1
+        }
+      }
+    }
+    idx += block_size;
+  }
+}
+
+/**
+ * @brief   Launches the context projection forward kernel.
+ *
+ * @param[in]      input           input rows of all sequences.
+ * @param[in]      sequence        sequence start positions.
+ * @param[in]      weight          padding rows; nullptr disables padding.
+ * @param[in,out]  output          output rows; results are added to them.
+ * @param[in]      num_sequences   number of sequences (one block each).
+ * @param[in]      input_dim       width of an input row.
+ * @param[in]      context_length  context length.
+ * @param[in]      context_start   context start.
+ * @param[in]      begin_pad       number of extra timesteps added at the
+ *                                 beginning.
+ *
+ */
+void hl_context_projection_forward(const real* input,
+                                   const int* sequence,
+                                   const real* weight,
+                                   real* output,
+                                   size_t num_sequences,
+                                   size_t input_dim,
+                                   size_t context_length,
+                                   int context_start,
+                                   size_t begin_pad) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(output);
+
+  // One thread block per sequence; its threads stride over the columns.
+  const int kBlockSize = 128;
+  const dim3 threads(kBlockSize, 1);
+  const dim3 grid(static_cast<int>(num_sequences), 1);
+
+  // The kernel is instantiated without padding when no weight is supplied.
+  if (weight == nullptr) {
+    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  } else {
+    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  }
+  CHECK_SYNC("hl_context_projection_forward failed");
+}
+
+/// GPU specialization: unwraps the matrices and the sequence vector into raw
+/// pointers and calls hl_context_projection_forward; a null weight is
+/// forwarded as nullptr.
+template <>
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
+                                               const GpuMatrix* input,
+                                               const GpuMatrix* weight,
+                                               const GpuIVector& sequence,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  CHECK(input && output);
+  const real* pad_rows = weight ? weight->getData() : nullptr;
+  const size_t num_sequences = sequence.getSize() - 1;
+  hl_context_projection_forward(
+      input->getData(), sequence.getData(), pad_rows, output->getData(),
+      num_sequences, input->getWidth(),
+      context_length, context_start, begin_pad);
+}
+
+__global__ void KeContextProjectionBackwardData(real* out_grad,
+                                                const int* sequence,
+                                                real* in_grad,
+                                                int input_dim,
+                                                int context_length,
+                                                int context_start) {
+  int idx = threadIdx.x;  // column handled by this thread (strided below)
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;  // each block processes one sequence
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+  // Sums out_grad entries back into the in_grad rows of this sequence.
+  int instances = seq_end - seq_start + context_length - 1;
+  out_grad += seq_start * input_dim * context_length;
+  in_grad += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {  // tiles of columns
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        if ((i + context_start) < 0) {  // pad row before the sequence: skip
+          continue;
+        } else if ((i + context_start) >= (seq_end - seq_start)) {  // past end
+          continue;
+        } else {
+          // Start from the stored gradient so the new terms are added to it.
+          value = in_grad[(i + context_start) * input_dim + idx];
+        }
+        // (outy, outx): first output row / context slot that used this row.
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          out_grad + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          value += output_r[idx];
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;  // row + 1, slot - 1
+        }
+        in_grad[(i + context_start) * input_dim + idx] = value;
+      }
+    }
+    idx += block_size;
+  }
+}
+
+/**
+ * @brief   Launches the kernel that back-propagates to the input gradient.
+ *
+ * @param[in]      out_grad        output gradient.
+ * @param[in]      sequence        sequence start positions.
+ * @param[in,out]  input_grad      input gradient, accumulated in place.
+ * @param[in]      num_sequences   number of sequences (one block each).
+ * @param[in]      input_dim       width of an input row.
+ * @param[in]      context_length  context length.
+ * @param[in]      context_start   context start.
+ *
+ */
+void hl_context_projection_backward_data(real* out_grad,
+                                         const int* sequence,
+                                         real* input_grad,
+                                         size_t num_sequences,
+                                         size_t input_dim,
+                                         size_t context_length,
+                                         int context_start) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(input_grad);
+
+  // One thread block per sequence; its threads stride over the columns.
+  const int kBlockSize = 128;
+  const dim3 threads(kBlockSize, 1);
+  const dim3 grid(static_cast<int>(num_sequences), 1);
+
+  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  CHECK_SYNC("hl_context_projection_backward_data failed");
+}
+
+/// GPU specialization: passes the raw buffers of both gradients to
+/// hl_context_projection_backward_data.
+template <>
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+                                                    GpuMatrix* in_grad,
+                                                    const GpuIVector& sequence,
+                                                    size_t context_length,
+                                                    int context_start) {
+  CHECK(in_grad && out_grad);
+  const size_t num_sequences = sequence.getSize() - 1;
+  hl_context_projection_backward_data(
+      out_grad->getData(), sequence.getData(), in_grad->getData(),
+      num_sequences, in_grad->getWidth(),
+      context_length, context_start);
+}
+
+template<int THREADS_X, int THREADS_Y>
+__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+                                                  const int* sequence,
+                                                  real* w_grad,
+                                                  int num_sequences,
+                                                  int w_dim,
+                                                  int context_length,
+                                                  int context_start,
+                                                  int begin_pad) {
+  __shared__ real sum_s[THREADS_Y][THREADS_X];  // one partial sum per thread
+  int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;  // blocks per pad row
+  const int idx = threadIdx.x;
+  const int idy = threadIdx.y;
+  int padId = blockIdx.x / pad_of_block;  // pad (weight) row of this block
+  int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);  // column
+  int instanceId;
+  real value = 0;
+  real* output_r;
+  // Thread row idy handles sequences idy, idy + THREADS_Y, ...
+  sum_s[idy][idx] = 0.0f;
+  if (weight_idx < w_dim) {
+    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
+      int seq_start = sequence[seqId];
+      int seq_end = sequence[seqId+1];
+      output_r = out_grad + seq_start * w_dim * context_length;
+      // Map this pad row to an instance index within the current sequence.
+      if (context_start < 0) {
+        if (padId + context_start < 0) {
+          instanceId = padId;
+        } else {
+          // begin_pad > 0; padId is at or past -context_start here.
+          instanceId = (padId - begin_pad) +
+            (seq_end - seq_start) - context_start;
+        }
+      } else {
+        if (padId + (seq_end - seq_start) < context_start) {
+          continue;
+        } else {
+          // begin_pad == 0;
+          instanceId = padId + (seq_end - seq_start) - context_start;
+        }
+      }
+
+      int outx = (instanceId - context_length) < 0 ?
+                 instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0 ?
+                 0 : (instanceId - (context_length - 1));
+      output_r += outy * w_dim * context_length + outx * w_dim;
+      for (int j = outy; j < seq_end - seq_start; j++) {
+        value += output_r[weight_idx];
+        if (j - outy == outx) break;
+        output_r += (context_length - 1) * w_dim;  // row + 1, slot - 1
+      }
+    }
+    sum_s[idy][idx] = value;
+  }
+  __syncthreads();  // every partial sum is written before the reduction
+  // Halving reduction of the partial sums along threadIdx.y.
+  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
+    if (idy < stride) {
+      sum_s[idy][idx] += sum_s[idy + stride][idx];
+    }
+    __syncthreads();
+  }
+  __syncthreads();  // NOTE(review): looks redundant after the loop barrier
+  // Thread row 0 adds the block total to w_grad (w_grad is not cleared here).
+  if (weight_idx < w_dim) {
+    if (idy == 0) {
+      w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
+    }
+  }
+}
+
+/**
+ * @brief   Launches the kernel that accumulates the padding-weight gradient.
+ *
+ * @param[in]      out_grad        output gradient.
+ * @param[in]      sequence        sequence start positions.
+ * @param[in,out]  w_grad          weight gradient, accumulated in place.
+ * @param[in]      num_sequences   number of sequences.
+ * @param[in]      w_dim           width of a weight row.
+ * @param[in]      total_pad       number of extra timesteps (weight rows).
+ * @param[in]      context_length  context length.
+ * @param[in]      context_start   context start.
+ * @param[in]      begin_pad       extra timesteps added at the beginning.
+ *
+ */
+void hl_context_projection_backward_weight(real* out_grad,
+                                           const int* sequence,
+                                           real* w_grad,
+                                           size_t num_sequences,
+                                           size_t w_dim,
+                                           size_t total_pad,
+                                           size_t context_length,
+                                           int context_start,
+                                           size_t begin_pad) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(w_grad);
+
+  const int kThreadsX = 32;  // weight columns covered by one block
+  const int kThreadsY = 32;  // sequences summed concurrently per block
+  const size_t blocks_per_pad = (w_dim + kThreadsX - 1) / kThreadsX;
+  const int num_blocks = total_pad * blocks_per_pad;
+  const dim3 threads(kThreadsX, kThreadsY);
+  const dim3 grid(num_blocks, 1);
+
+  KeContextProjectionBackwardWeight<kThreadsX, kThreadsY>
+    <<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, w_grad, num_sequences, w_dim,
+     context_length, context_start, begin_pad);
+  CHECK_SYNC("hl_context_projection_backward_weight failed");
+}
+
+/// GPU specialization: passes the raw buffers to
+/// hl_context_projection_backward_weight.
+template <>
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+    GpuMatrix* out_grad,
+    GpuMatrix* w_grad,
+    const GpuIVector& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t total_pad,
+    size_t begin_pad) {
+  CHECK(out_grad && w_grad);
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  // The hl_ routine takes total_pad ahead of the context settings, unlike
+  // this function's own parameter order.
+  hl_context_projection_backward_weight(
+      out_grad->getData(), seq_vec.getData(), w_grad->getData(),
+      num_sequences, w_grad->getWidth(), total_pad,
+      context_length, context_start, begin_pad);
+}
+
+/// GPU backward pass: updates in_grad and/or w_grad from out_grad.
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+                                                GpuMatrix* in_grad,
+                                                GpuMatrix* w_grad,
+                                                const GpuIVector& sequence,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  CHECK(out_grad);
+
+  // The input gradient is optional: nothing to do without a buffer.
+  if (in_grad) {
+    ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
+        out_grad, in_grad, sequence, context_length, context_start);
+  }
+
+  // The weight gradient needs both padding enabled and a buffer.
+  if (is_padding && w_grad) {
+    ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(out_grad,
+                                                     w_grad,
+                                                     sequence,
+                                                     context_length,
+                                                     context_start,
+                                                     total_pad,
+                                                     begin_pad);
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..359428fc03d698145cb880bd735c908838f96f56
--- /dev/null
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -0,0 +1,172 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+
+using namespace paddle;  // NOLINT
+
+// Compares CPU and GPU ContextProjectionForward on identical random data.
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  // begin_pad is a size_t throughout the function API; store it as size_t.
+  const size_t begin_pad = std::max(0, -context_start);
+  size_t pad =
+      begin_pad + std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionForward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", begin_pad));
+  CpuMatrix cpu_in(batch_size, input_dim);
+  cpu_in.randomizeUniform();
+  GpuMatrix gpu_in(batch_size, input_dim);
+  gpu_in.copyFrom(cpu_in);
+  auto cpu_weight =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_weight =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_weight->randomizeUniform();
+    gpu_weight->copyFrom(*cpu_weight);
+  }
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+  // Both output buffers start from the same random contents.
+  CpuMatrix cpu_out(batch_size, input_dim * context_length);
+  GpuMatrix gpu_out(batch_size, input_dim * context_length);
+  cpu_out.randomizeUniform();
+  gpu_out.copyFrom(cpu_out);
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  autotest::TensorCheckEqual(cpu_out, gpu_out);
+}
+
+// Compares CPU and GPU ContextProjectionBackward on identical random data.
+void testMatrixProjectionBackward(int context_start,
+                                  int context_length,
+                                  bool is_padding,
+                                  size_t batch_size,
+                                  size_t input_dim) {
+  // context_length and begin_pad are read back as size_t; store them as such.
+  const size_t length = context_length;
+  const size_t begin_pad = std::max(0, -context_start);
+  size_t pad = begin_pad + std::max(0, context_start + context_length - 1);
+  if (pad == 0) is_padding = false;
+
+  FunctionCompare compare("ContextProjectionBackward",
+                          FuncConfig()
+                              .set("context_length", length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", begin_pad)
+                              .set("is_padding", is_padding)
+                              .set("total_pad", pad));
+
+  CpuMatrix cpu_in_grad(batch_size, input_dim);
+  cpu_in_grad.randomizeUniform();
+  GpuMatrix gpu_in_grad(batch_size, input_dim);
+  gpu_in_grad.copyFrom(cpu_in_grad);
+  CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
+  cpu_out_grad.randomizeUniform();
+  GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
+  gpu_out_grad.copyFrom(cpu_out_grad);
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+
+  auto cpu_w_grad =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_w_grad =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_w_grad->randomizeUniform();
+    gpu_w_grad->copyFrom(*cpu_w_grad);
+  }
+
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+
+  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
+  if (is_padding) {
+    autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
+  }
+}
+
+// Sweeps the context window, padding mode, batch size and input width, and
+// runs the CPU-vs-GPU forward and backward comparisons for every
+// combination.
+TEST(ContextProjection, projection) {
+  for (int start : {-5, -3, -1, 0, 3}) {
+    for (int length : {1, 2, 5, 7}) {
+      for (bool padding : {false, true}) {
+        for (int batch : {1, 2, 5, 20, 100}) {
+          for (int dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << start
+                    << " context_length=" << length
+                    << " trainable_padding=" << padding
+                    << " batch_size=" << batch
+                    << " input_dim=" << dim;
+
+            // Forward comparison for this configuration.
+            testMatrixProjectionForward(
+                start, length, padding, batch, dim);
+            // Backward comparison with the same settings.
+            testMatrixProjectionBackward(
+                start, length, padding, batch, dim);
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/paddle/function/cross_map_normal_op.cpp b/paddle/function/CrossMapNormalOp.cpp
similarity index 99%
rename from paddle/function/cross_map_normal_op.cpp
rename to paddle/function/CrossMapNormalOp.cpp
index 74094bc4fc8052aba0ae955217311e28eda7c2a7..96a7a30eebbf0f01fa89ea91110ddb826fd2f64b 100644
--- a/paddle/function/cross_map_normal_op.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "cross_map_normal_op.h"
+#include "CrossMapNormalOp.h"
 #include "paddle/math/Vector.h"
 
 namespace paddle {
diff --git a/paddle/function/cross_map_normal_op.h b/paddle/function/CrossMapNormalOp.h
similarity index 100%
rename from paddle/function/cross_map_normal_op.h
rename to paddle/function/CrossMapNormalOp.h
diff --git a/paddle/function/cross_map_normal_op_gpu.cu b/paddle/function/CrossMapNormalOpGpu.cu
similarity index 99%
rename from paddle/function/cross_map_normal_op_gpu.cu
rename to paddle/function/CrossMapNormalOpGpu.cu
index aae4f461b6f57de6cadfe7c3a6d684c613cc037f..b33dd108348b7789c6e73bfe3b1ffbc448163ef7 100644
--- a/paddle/function/cross_map_normal_op_gpu.cu
+++ b/paddle/function/CrossMapNormalOpGpu.cu
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "hl_base.h"
-#include "cross_map_normal_op.h"
+#include "CrossMapNormalOp.h"
 
 namespace paddle {
 
diff --git a/paddle/function/cross_map_normal_op_test.cpp b/paddle/function/CrossMapNormalOpTest.cpp
similarity index 98%
rename from paddle/function/cross_map_normal_op_test.cpp
rename to paddle/function/CrossMapNormalOpTest.cpp
index 22692691bdb64c23cbd2a479b2afb919672554f7..d65d9310affd7c9b7fee3118c79449870849c243 100644
--- a/paddle/function/cross_map_normal_op_test.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -15,6 +15,8 @@ limitations under the License. */
 #include <gtest/gtest.h>
 #include "FunctionTest.h"
 
+namespace paddle {
+
 TEST(CrossMapNormal, real) {
   for (size_t numSamples : {5, 32}) {
     for (size_t channels : {1, 5, 32}) {
@@ -69,3 +71,5 @@ TEST(CrossMapNormalGrad, real) {
     }
   }
 }
+
+}  // namespace paddle
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index 02880e5ea1acb85d8685f865a5745f7090db03d2..6f82a8d053bc203eed44bd0d8d4c47d23a15268d 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -30,20 +30,48 @@ real FuncConfig::get<real>(const std::string& key) const {
   return it->second.r;
 }
 
+template <>
+int FuncConfig::get<int>(const std::string& key) const {
+  const auto found = valueMap_.find(key);
+  CHECK(valueMap_.end() != found) << "Cannot find value: '" << key << "'";
+  return (*found).second.i;
+}
+
+template <>
+bool FuncConfig::get<bool>(const std::string& key) const {
+  const auto found = valueMap_.find(key);
+  CHECK(valueMap_.end() != found) << "Cannot find value: '" << key << "'";
+  return (*found).second.b;
+}
+
 template <>
 FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
-  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  CHECK_EQ(valueMap_.count(key), 0U) << "Duplicated value: " << key;
   valueMap_[key].s = v;
   return *this;
 }
 
 template <>
 FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
-  CHECK(valueMap_.count(key) == 0) << "Duplicated value: " << key;
+  CHECK_EQ(valueMap_.count(key), 0U) << "Duplicated value: " << key;
   valueMap_[key].r = v;
   return *this;
 }
 
+template <>
+FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
+  CHECK_EQ(valueMap_.count(key), 0U) << "Duplicated value: " << key;
+  valueMap_[key].i = v;
+  return *this;
+}
+
+template <>
+FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
+  CHECK_EQ(valueMap_.count(key), 0U) << "Duplicated value: " << key;
+  valueMap_[key].b = v;
+  return *this;
+}
+
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 
 }  // namespace paddle
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 095584c0b19f7a0b7d8787a0bc6bbdd78d785eed..9e8cbb8e48c30e80c5057fc53c050b67d3957188 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -40,6 +40,19 @@ struct MatrixT<DEVICE_TYPE_GPU> {
   using type = GpuMatrix;
 };
 
+template <DeviceType Device>
+struct SequenceT;  // type = CpuIVector (CPU) or GpuIVector (GPU)
+
+template <>
+struct SequenceT<DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+
+template <>
+struct SequenceT<DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
+
 typedef std::vector<size_t> Dims;
 
 class Tensor {
@@ -59,6 +72,8 @@ public:
   union value {
     size_t s;
     real r;
+    int i;
+    bool b;
   };
 
   template <typename T>
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index a8c5e412bd12df2ea0b4d6bd67072fb7d08591fe..32131037f6de4a9f7a3ebf8f5773eccd65dc2cdb 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -33,25 +33,33 @@ public:
     // init cpu and gpu arguments
     auto initArgs = [=](
         Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (auto arg : inArgs) {
+      for (const auto& arg : inArgs) {
         size_t size = sizeof(real);
-        for (auto dim : arg.dims_) {
+        for (const auto dim : arg.dims_) {
           size *= dim;
         }
-        cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-        gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-        cpuArgs.emplace_back(
-            Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-        gpuArgs.emplace_back(
-            Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-
-        // will use an api to refactor this code.
-        CpuVector cpuVector(size / sizeof(real),
-                            (real*)cpuArgs.back().getData());
-        GpuVector gpuVector(size / sizeof(real),
-                            (real*)gpuArgs.back().getData());
-        cpuVector.uniform(0.001, 1);
-        gpuVector.copyFrom(cpuVector);
+        if (arg.getData()) {
+          // TODO(tianbing): buffers below are allocated but unused here
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          // already init outside; NOTE(review): CPU and GPU args alias it
+        } else {
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          cpuArgs.emplace_back(
+              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
+          gpuArgs.emplace_back(
+              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
+          // will use an api to refactor this code.
+          CpuVector cpuVector(size / sizeof(real),
+                              (real*)cpuArgs.back().getData());
+          GpuVector gpuVector(size / sizeof(real),
+                              (real*)gpuArgs.back().getData());
+          cpuVector.uniform(0.001, 1);
+          gpuVector.copyFrom(cpuVector);
+        }
       }
     };
     initArgs(cpuInputs, gpuInputs, inputs);
@@ -81,6 +89,10 @@ public:
     checkArgs(cpuInouts, gpuInouts);
   }
 
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+
 protected:
   std::shared_ptr<FunctionBase> cpu;
   std::shared_ptr<FunctionBase> gpu;
@@ -95,8 +107,3 @@ protected:
 };
 
 }  // namespace paddle
-
-using paddle::FunctionCompare;
-using paddle::FuncConfig;
-using paddle::Dims;
-using paddle::Tensor;
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index 51c0ae5cc9523debffa4bdfe44fe0df0c56839c2..e947b2b9ecbebda11db5c049e1606a2d5926c28c 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
     CHECK_EQ(inputDim * totalPad, parameter->getSize());
     weight_.reset(new Weight(totalPad, inputDim, parameter));
   }
+  // init forward_ and backward_ functions
+  init();
+}
+
+bool ContextProjection::init() {  // creates forward_/backward_ functions
+  size_t context_length = config_.context_length();
+  int context_start = config_.context_start();
+  bool is_padding = config_.trainable_padding();
+  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
+
+  createFunction(forward_,
+                 "ContextProjectionForward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_));
+  createFunction(backward_,
+                 "ContextProjectionBackward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_)
+                     .set("is_padding", is_padding)
+                     .set("total_pad", total_pad));
+
+  return true;  // always reports success
 }
 
 void ContextProjection::resetState() {
@@ -78,25 +104,29 @@ LayerStatePtr ContextProjection::getState() {
 }
 
 void ContextProjection::forward() {
-  CHECK(in_->value);
+  CHECK(in_->value && out_->value);
   CHECK(in_->sequenceStartPositions);
 
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
-
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(forward_.size(), 1U) << "Only one forward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  out_->value->contextProjectionForward(
-      *(in_->value),
-      state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
-      *startPositions,
-      config_.context_length(),
-      config_.context_start(),
-      beginPad_,
-      state_ ? true : isPadding);
+  bool is_padding = config_.trainable_padding();
+  /// first use state_, otherwise use weight_(padding false === w nullptr)
+  auto w_ptr =
+      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
+  auto start_pos = in_->sequenceStartPositions;
+  forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
+                     Tensor(w_ptr ? w_ptr->getData() : nullptr,
+                            Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+                     Tensor(reinterpret_cast<real*>(
+                                const_cast<int*>(start_pos->getData(useGpu_))),
+                            Dims{start_pos->getSize()})},
+                    {Tensor(out_->value->getData(), Dims{batch_size, dim})},
+                    {});
 
   if (state_ && config_.context_start() < 0) {
     CHECK_EQ(1, in_->getNumSequences());
@@ -118,41 +148,27 @@ void ContextProjection::forward() {
 }
 
 void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value);
-  int64_t inputDim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
+  CHECK(in_->value && out_->value && out_->grad);
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(batch_size, out_->value->getHeight());
+  CHECK_EQ(backward_.size(), 1U) << "Only one backward function here";
 
   REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
-  if (!out_->grad->useGpu()) {
-    out_->grad->contextProjectionBackward(
-        in_->grad.get(),
-        isPadding ? weight_->getWGrad().get() : nullptr,
-        *startPositions,
-        config_.context_length(),
-        config_.context_start(),
-        beginPad_,
-        isPadding);
-  } else {
-    if (in_->grad) {
-      out_->grad->contextProjectionBackwardData(*(in_->grad),
-                                                *startPositions,
-                                                config_.context_length(),
-                                                config_.context_start());
-    }
-
-    if (isPadding && weight_->getWGrad()) {
-      out_->grad->contextProjectionBackwardWeight(
-          *(weight_->getWGrad()),
-          *startPositions,
-          config_.context_length(),
-          config_.context_start(),
-          weight_->getWGrad()->getHeight(),
-          beginPad_);
-    }
-  }
+  bool is_padding = config_.trainable_padding();
+  auto start_pos = in_->sequenceStartPositions;
+  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
+  backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
+                             Dims{batch_size, input_dim}),
+                      Tensor(w_ptr ? w_ptr->getData() : nullptr,
+                             Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
+                      Tensor(reinterpret_cast<real*>(
+                                 const_cast<int*>(start_pos->getData(useGpu_))),
+                             Dims{start_pos->getSize()})},
+                     {Tensor(out_->grad->getData(), Dims{batch_size, dim})},
+                     {});
 
   if (config_.trainable_padding()) {
     weight_->getParameterPtr()->incUpdate(callback);
diff --git a/paddle/gserver/layers/ContextProjection.h b/paddle/gserver/layers/ContextProjection.h
index 2df43bd04fec868924b5d45f9def231a48ee7f04..c87d6ed1d6d46b391ccf8722f6d110614be1fe78 100644
--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -61,6 +61,8 @@ public:
 
   virtual LayerStatePtr getState();
 
+  virtual bool init();
+
 protected:
   std::unique_ptr<Weight> weight_;
   /// number of extra timesteps added at the beginning
diff --git a/paddle/gserver/layers/Projection.h b/paddle/gserver/layers/Projection.h
index 8cd8042479eafdbd6b8dac03b63b344fcf9526b1..778a7fe13d8a2b669831396e69546446b4745e61 100644
--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -88,11 +88,37 @@ public:
    */
   virtual LayerStatePtr getState() { return nullptr; }
 
+  /**
+   * init forward_ and backward_ functions
+   */
+  virtual bool init() { return true; }
+
   /**
    * Get output size of projection.
    */
   size_t getOutputSize() const { return config_.output_size(); }
 
+protected:
+  /**
+   * Create a function by registered name, init it, and append it to a list.
+   * \param function, list to append to (e.g. forward_ or backward_)
+   * \param name, function name; "-GPU" or "-CPU" is appended per useGpu_
+   * \param config, initialization configuration passed to the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
+
 protected:
   /// Config of projection
   ProjectionConfig config_;
@@ -106,5 +132,9 @@ protected:
   const Argument* out_;
   /// Store `passType` passed to forward()
   PassType passType_;
+  /// Forward function(s) of this projection
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Backward function(s) of this projection
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
 };
 }  // namespace paddle
diff --git a/paddle/gserver/tests/test_LinearChainCRF.cpp b/paddle/gserver/tests/test_LinearChainCRF.cpp
index 330adee8f77f495dab6a13190aaca6a3a5f86b2c..f046cb0b289c9ce22b98f3200bf0a3f7d48d77f5 100644
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -65,9 +65,3 @@ TEST(LinearChainCRF, decoding) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_ProtoDataProvider.cpp b/paddle/gserver/tests/test_ProtoDataProvider.cpp
index d421b6e2f2536e266883508ff29cbec731c9d7e3..8fc0aaab69548ae60100696db04d5611570df110 100644
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -730,9 +730,3 @@ TEST(ProtoSequenceDataProvider, test) {
     }        // end for (int numIdSlots : numSlotsArray)
   }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
 }
-
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/gserver/tests/test_WarpCTCLayer.cpp b/paddle/gserver/tests/test_WarpCTCLayer.cpp
index 0a4a814d5247410248f7418e1ef2c79a2da42507..dab6366588b7894a6700c00a5331d436ca2a410c 100644
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -242,9 +242,3 @@ TEST(Layer, WarpCTCLayer) {
     }
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index b281d5eb02f6d5ee46b3f4155b98c738f05d6640..90813a89969c2525f7029f1c2609bed116c910c4 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1304,68 +1304,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
 }
 
-void GpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  CHECK(dynamic_cast<GpuMatrix*>(&input));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input.getWidth() * contextLength);
-
-  hl_context_projection_forward(input.getData(),
-                                sequence.getData(),
-                                isPadding ? weight->getData() : NULL,
-                                getData(),
-                                sequence.getSize() - 1,
-                                input.getWidth(),
-                                contextLength,
-                                contextStart,
-                                beginPad,
-                                isPadding);
-}
-
-void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad,
-                                              const IVector& sequence,
-                                              int contextLength,
-                                              int contextStart) {
-  CHECK(dynamic_cast<GpuMatrix*>(&inputGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength);
-
-  hl_context_projection_backward_data(getData(),
-                                      sequence.getData(),
-                                      inputGrad.getData(),
-                                      sequence.getSize() - 1,
-                                      inputGrad.getWidth(),
-                                      contextLength,
-                                      contextStart);
-}
-
-void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad,
-                                                const IVector& sequence,
-                                                int contextLength,
-                                                int contextStart,
-                                                int totalPad,
-                                                size_t beginPad) {
-  CHECK(dynamic_cast<GpuMatrix*>(&weightGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength);
-
-  hl_context_projection_backward_weight(getData(),
-                                        sequence.getData(),
-                                        weightGrad.getData(),
-                                        sequence.getSize() - 1,
-                                        weightGrad.getWidth(),
-                                        totalPad,
-                                        contextLength,
-                                        contextStart,
-                                        beginPad);
-}
-
 void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
   CHECK(data.useGpu_ == true && W.useGpu_ == true)
       << "Matrix type are not equal";
@@ -2203,113 +2141,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
   }
 }
 
-void CpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  auto input_ptr = dynamic_cast<CpuMatrix*>(&input);
-  auto seq_ptr = dynamic_cast<const CpuIVector*>(&sequence);
-  CHECK(input_ptr && seq_ptr);
-  if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength);
-
-  const int* starts = seq_ptr->getData();
-  size_t numSequences = seq_ptr->getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i], padSize);
-        if (isPadding) {
-          MatrixPtr sub = weight->subMatrix(j, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-        if (isPadding) {
-          MatrixPtr sub =
-              weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src = input_ptr->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      dst->addAtOffset(*src, j * input_ptr->getWidth());
-    }
-  }
-}
-
-void CpuMatrix::contextProjectionBackward(Matrix* inputGrad,
-                                          Matrix* weightGrad,
-                                          const IVector& sequence,
-                                          int contextLength,
-                                          int contextStart,
-                                          size_t beginPad,
-                                          bool isPadding) {
-  if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad));
-  if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-
-  int64_t inputDim = inputGrad ? inputGrad->getWidth()
-                               : weightGrad ? weightGrad->getWidth() : 0;
-  CHECK_EQ(getWidth(), static_cast<size_t>(inputDim * contextLength));
-
-  const int* starts = sequence.getData();
-  size_t numSequences = sequence.getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i], padSize);
-          MatrixPtr sub = weightGrad->subMatrix(j, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-          MatrixPtr sub = weightGrad->subMatrix(
-              beginPad + contextStart + j - padSize, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!inputGrad) continue;
-      MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      src->addAtOffset(*dst, j * inputDim);
-    }
-  }
-}
-
 inline void vecAddTo(real* a, const real* b, size_t len) {
   for (unsigned int i = 0; i < len; ++i) {
     a[i] += b[i];
diff --git a/paddle/math/Matrix.h b/paddle/math/Matrix.h
index bda863de38675fe481544a7e82b69f445df361bd..4865a081a5aaa010d5b3ce0127ffc6f8330d4a68 100644
--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -972,42 +972,6 @@ public:
     LOG(FATAL) << "Not implemeted";
   }
 
-  virtual void contextProjectionForward(Matrix& input,
-                                        Matrix* weight,
-                                        const IVector& sequence,
-                                        int contextLength,
-                                        int contextStart,
-                                        size_t beginPad,
-                                        bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackward(Matrix* inputGrad,
-                                         Matrix* weightGrad,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardData(Matrix& inputGrad,
-                                             const IVector& sequence,
-                                             int contextLength,
-                                             int contextStart) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
-  virtual void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                               const IVector& sequence,
-                                               int contextLength,
-                                               int contextStart,
-                                               int totalPad,
-                                               size_t beginPad) {
-    LOG(FATAL) << "Not implemeted";
-  }
-
   /**
    * @code
    * this.row[i] += table.row[ids[i]]
@@ -1442,26 +1406,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackwardData(Matrix& inputGrad,
-                                     const IVector& sequence,
-                                     int contextLength,
-                                     int contextStart);
-
-  void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                       const IVector& sequence,
-                                       int contextLength,
-                                       int contextStart,
-                                       int totalPad,
-                                       size_t beginPad);
-
   void bilinearForward(const Matrix& in,
                        const size_t inImgH,
                        const size_t inImgW,
@@ -1648,22 +1592,6 @@ public:
                            const IVector& sequence,
                            IVector& index);
 
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-
-  void contextProjectionBackward(Matrix* inputGrad,
-                                 Matrix* weightGrad,
-                                 const IVector& sequence,
-                                 int contextLength,
-                                 int contextStart,
-                                 size_t beginPad,
-                                 bool isPadding);
-
   real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
   virtual real* getRowBuf(size_t row) { return getRow(row); }
 
diff --git a/paddle/math/tests/test_Allocator.cpp b/paddle/math/tests/test_Allocator.cpp
index 33e0952efedddec16acf6153209e14f18fd48134..1ca70ea84c867b83013625eaee141f5b75fad4ae 100644
--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -120,9 +120,3 @@ TEST(MemoryHandle, Gpu) {
   }
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_BaseMatrix.cpp b/paddle/math/tests/test_BaseMatrix.cpp
index cc7c1e7eb2734605cb278a4b97cab22bdba1594e..21918b86e1ad98766ceaf09dea3020d6e8592191 100644
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -242,10 +242,4 @@ TEST(BaseMatrix, Other) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_CpuGpuVector.cpp b/paddle/math/tests/test_CpuGpuVector.cpp
index 624fa20ca58bca3f16fa567487bbaa5d9656e1b1..58bc43a38ba9465a832fcd0652e6309c403577e3 100644
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -77,11 +77,4 @@ TEST(CpuGpuVector, subCreate) {
   checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/math/tests/test_ExecViaCpu.cpp b/paddle/math/tests/test_ExecViaCpu.cpp
index 27216ddb58eccd7fd52e121e795baf463ea69f51..04c856453d2ec4ad764e37ae430e3e30ac0dea0b 100644
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -114,9 +114,3 @@ TEST(ExecViaCpu, test1) {
   testWrapper(functor);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_Matrix.cpp b/paddle/math/tests/test_Matrix.cpp
index adb5fbd9fa30d810a25a2eb11f6d57474c1304c7..6899769144dd89156b2ffdb644c47ef0025d624b 100644
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -291,10 +291,4 @@ TEST(Matrix, multiBinaryCrossEntropy) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_SIMDFunctions.cpp b/paddle/math/tests/test_SIMDFunctions.cpp
index f62843310d886ba7d449e793066b19a7cc7bd5a9..e8f9b26ff240f9c339404a919c14eb3e3704c1de 100644
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -169,9 +169,3 @@ TEST(SIMDFunction, decayL1_WithoutLR) {
     ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
   }
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_SparseMatrix.cpp b/paddle/math/tests/test_SparseMatrix.cpp
index 0949ab7ffba423daedd47876bc055a21c5c3f016..9d3fbaef43d719d07577631d5df3ac4656610cc6 100644
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -561,9 +561,3 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
   checkSMatrixEqual2(matA, matD);
 #endif
 }
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_Tensor.cu b/paddle/math/tests/test_Tensor.cu
index 1859b9fc13576b6f1d0bc13b43f7e7a2ef6030c9..40e38434fa328bba8be6e1b8e509023d615899c1 100644
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -1163,11 +1163,3 @@ TEST(Quaternary, CompareOp) {
   TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_TrainingAlgorithm.cpp b/paddle/math/tests/test_TrainingAlgorithm.cpp
index 2c458cba9ca11e9af8a98b88a6392978c2a9be77..4a88844b43ef40af988d2b391d2bef4568dea9b7 100644
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -459,11 +459,3 @@ void testSparseMomentum(size_t size, bool useGpu) {
 }
 
 TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_batchTranspose.cpp b/paddle/math/tests/test_batchTranspose.cpp
index 9925e24dc14294ec70806ffd9cc496ea01beaa43..4eb9837909ffaaf0f483ab65ece7a0b29fd49319 100644
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -53,9 +53,3 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
   checkMatrixEqual(cBatchTransMat, cMat_d2h);
 }
 #endif
-
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/math/tests/test_lazyAssign.cu b/paddle/math/tests/test_lazyAssign.cu
index 16541edb54b807d4e1690d4ae63fd44459e2d726..786d863a533b58ea9856300aaa0cd8f5a10a4dd9 100644
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -139,11 +139,3 @@ TEST(sgdUpdate, GPU) {
   testMatrixCase(testSgdUpdate<GpuMatrix>);
 }
 #endif
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
-
diff --git a/paddle/math/tests/test_matrixCompare.cpp b/paddle/math/tests/test_matrixCompare.cpp
index c6fc849ba0328dae62c9da0bd721d86fd8b6881e..98d63438a57b48340bc3b05ac7ac3d6c5cd90fb0 100644
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -29,148 +29,6 @@ using namespace std;     // NOLINT
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
 
-void testMatrixProjectionForward(int contextStart,
-                                 int contextLength,
-                                 bool padding,
-                                 int batchSize,
-                                 int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeight = nullptr;
-  MatrixPtr gpuWeight = nullptr;
-  if (padding) {
-    cpuWeight = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeight = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeight->randomizeUniform();
-    gpuWeight->copyFrom(*cpuWeight);
-  }
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuOutput =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutput =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutput->contextProjectionForward(*cpuInput,
-                                      cpuWeight.get(),
-                                      *cpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  gpuOutput->contextProjectionForward(*gpuInput,
-                                      gpuWeight.get(),
-                                      *gpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-}
-
-void testMatrixProjectionBackward(int contextStart,
-                                  int contextLength,
-                                  bool padding,
-                                  int batchSize,
-                                  int inputDim) {
-  MatrixPtr cpuOutputGrad =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutputGrad =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeightGrad = nullptr;
-  MatrixPtr gpuWeightGrad = nullptr;
-  if (padding) {
-    cpuWeightGrad = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeightGrad = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeightGrad->randomizeUniform();
-    gpuWeightGrad->copyFrom(*cpuWeightGrad);
-  }
-
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutputGrad->contextProjectionBackward(cpuInputGrad.get(),
-                                           cpuWeightGrad.get(),
-                                           *cpuSequence,
-                                           contextLength,
-                                           contextStart,
-                                           beginPad,
-                                           padding);
-  gpuOutputGrad->contextProjectionBackwardData(
-      *gpuInputGrad, *gpuSequence, contextLength, contextStart);
-  if (padding) {
-    gpuOutputGrad->contextProjectionBackwardWeight(*gpuWeightGrad,
-                                                   *gpuSequence,
-                                                   contextLength,
-                                                   contextStart,
-                                                   pad,
-                                                   beginPad);
-  }
-
-  TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
-  if (padding) {
-    TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
-  }
-}
-
-TEST(Matrix, projection) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto trainablePadding : {false, true}) {
-        for (auto batchSize : {1, 2, 5, 20, 100}) {
-          for (auto inputDim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " trainablePadding=" << trainablePadding
-                    << " batchSize=" << batchSize << " inputDim=" << inputDim;
-            testMatrixProjectionForward(contextStart,
-                                        contextLength,
-                                        trainablePadding,
-                                        batchSize,
-                                        inputDim);
-            testMatrixProjectionBackward(contextStart,
-                                         contextLength,
-                                         trainablePadding,
-                                         batchSize,
-                                         inputDim);
-          }
-        }
-      }
-    }
-  }
-}
-
 void testMatrixMaxSequence(int batchSize, int inputDim) {
   // forward
   MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
@@ -1262,10 +1120,4 @@ TEST(Matrix, MaxOutFwdBwd) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
-
 #endif
diff --git a/paddle/math/tests/test_sparseMatrixCompare.cpp b/paddle/math/tests/test_sparseMatrixCompare.cpp
index dcdbccffc3a19faa177c9867fe7ab142612f5209..a9185a4b24b13ca0287b0f67375c4599e8b9ac78 100644
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -171,11 +171,4 @@ TEST(SMatrix, sMatrixCollectBias) {
   }
 }
 
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
-
 #endif
diff --git a/paddle/parameter/tests/test_common.cpp b/paddle/parameter/tests/test_common.cpp
index aa57a6346917b259dbb89f6ad2340fb8db28f3e3..8bab5a6289e2bb9f634e8cce4557de55f7704447 100644
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -23,15 +23,6 @@ limitations under the License. */
 
 using namespace paddle;  // NOLINT
 
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-
-  int ret = RUN_ALL_TESTS();
-
-  return ret;
-}
-
 class CommonTest : public ::testing::Test {
 protected:
   CommonTest() : testStat_("test") {}
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..584498c8602ee5faad3e21a8588af7bb802d7377
--- /dev/null
+++ b/paddle/testing/CMakeLists.txt
@@ -0,0 +1,6 @@
+# Builds paddle_test_main, the static library (from TestMain.cpp) that link_paddle_test links into paddle test cases.
+
+if(WITH_TESTING)
+  add_library(paddle_test_main STATIC TestMain.cpp)
+  add_dependencies(paddle_test_main gen_proto_cpp)
+endif()
diff --git a/paddle/function/TestMain.cpp b/paddle/testing/TestMain.cpp
similarity index 100%
rename from paddle/function/TestMain.cpp
rename to paddle/testing/TestMain.cpp
diff --git a/paddle/trainer/RemoteParameterUpdater.h b/paddle/trainer/RemoteParameterUpdater.h
index 7794b209009a3429e810074b61e1d5bffa8b3a4e..5e82c944751629632ea8d16992bd8f4178a2fbd5 100644
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -56,7 +56,7 @@ class RemoteParameterUpdater : public ParameterUpdater {
 public:
   RemoteParameterUpdater(
       const OptimizationConfig& config,
-      int expectedPpassCount,
+      int expectedPassCount,
       std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
   ~RemoteParameterUpdater() {
     if (controllerThread_) {
@@ -146,7 +146,7 @@ protected:
   BatchStatus batchStatus_;
   /// controller thread for sync-sgd
   std::unique_ptr<std::thread> controllerThread_;
-  /// passed alread finished
+  /// number of passes already finished
   int64_t passCount_;
   /// expected passes to finished
   int64_t expectedPassCount_;
diff --git a/paddle/utils/ThreadLocal.cpp b/paddle/utils/ThreadLocal.cpp
index d27dae33fd039bbefdbc65908e5ce7dc58eceab7..58fe51bd40c36088fdc6ee51e22d120b63486bf4 100644
--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -37,7 +37,7 @@ unsigned int* ThreadLocalRand::getSeed() {
       p = new unsigned int(defaultSeed_ - 1);
     } else {
       p = new unsigned int(defaultSeed_ + getTID());
-      LOG(INFO) << "thread use undeterministic rand seed:" << *p;
+      VLOG(3) << "thread use undeterministic rand seed:" << *p;
     }
     seed_.set(p);
   }
diff --git a/paddle/utils/Util.cpp b/paddle/utils/Util.cpp
index 0f778dbebf4e124c7a240d738b8f73cef03fc477..411a64aa8d0737a8d57e62fbd0788ffaacfbc9f7 100644
--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -125,7 +125,7 @@ void registerInitFunction(std::function<void()> func, int priority) {
 
 void runInitFunctions() {
   std::call_once(g_onceFlag, []() {
-    LOG(INFO) << "Calling runInitFunctions";
+    VLOG(3) << "Calling runInitFunctions";
     if (g_initFuncs) {
       std::sort(g_initFuncs->begin(),
                 g_initFuncs->end(),
@@ -139,7 +139,7 @@ void runInitFunctions() {
       g_initFuncs = nullptr;
     }
     g_initialized = true;
-    LOG(INFO) << "Call runInitFunctions done.";
+    VLOG(3) << "Call runInitFunctions done.";
   });
 }
 
@@ -231,7 +231,7 @@ std::string join(const std::string& part1, const std::string& part2) {
 }  // namespace path
 
 void copyFileToPath(const std::string& file, const std::string& dir) {
-  LOG(INFO) << "copy " << file << " to " << dir;
+  VLOG(3) << "copy " << file << " to " << dir;
   std::string fileName = path::basename(file);
   std::string dst = path::join(dir, fileName);
   std::ifstream source(file, std::ios_base::binary);
diff --git a/paddle/utils/tests/test_CustomStackTrace.cpp b/paddle/utils/tests/test_CustomStackTrace.cpp
index 18dd0aac4305006745dcd8e0a0717fb0fb939778..378788bcecd579fff1c762702a8c27f54cee94bf 100644
--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -96,9 +96,3 @@ TEST(CustomStackTrace, normalTest) {
     }
   });
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_SIMDFlags.cpp b/paddle/utils/tests/test_SIMDFlags.cpp
index 42edede209ad957c13c1cec8e6bb20bd0fe9d28b..8200a24ce7b7df75b48a89fbb7af15f304c5957f 100644
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -44,8 +44,3 @@ TEST(SIMDFlags, normalPrint) {
   LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
   LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_SpinLock.cpp b/paddle/utils/tests/test_SpinLock.cpp
index 605bedb6c912b0436f40e3eff93d5cf95d8dc489..cc34eb1f868003d3db9221578c0c20c44be285eb 100644
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -53,9 +53,3 @@ TEST(ThreadSpinLock, normalTest) {
         });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_Thread.cpp b/paddle/utils/tests/test_Thread.cpp
index 2f5c5bbce07f39b799b928fd231bb4db1d2b3e05..6e2580c4913f0adc7ba1e63c9cebce308775aac6 100644
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
@@ -79,8 +79,3 @@ TEST(AsyncThreadPool, addBatchJobWithResults) {
     ASSERT_EQ(res[i], i);
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/utils/tests/test_ThreadBarrier.cpp b/paddle/utils/tests/test_ThreadBarrier.cpp
index 1237f1b731b2fb733d6823619df2c574476b89de..554b1c1d4adce7a0196b304281dcf878a0b6426e 100644
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -64,9 +64,3 @@ TEST(ThreadBarrier, normalTest) {
                    });
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}