Commit 1a82e7da authored by Z zlx

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into improve_pruning

......@@ -126,6 +126,7 @@ endif(WITH_GPU)
add_subdirectory(proto)
add_subdirectory(paddle)
add_subdirectory(go/master/c)
add_subdirectory(python)
add_subdirectory(go/pserver/cclient)
......
......@@ -22,6 +22,7 @@ To compile the source code, your computer must be equipped with the following de
- **CMake**: CMake >= 3.0 (at least CMake 3.4 on Mac OS X)
- **BLAS**: MKL, OpenBlas or ATLAS
- **Python**: only support Python 2.7
- **Go**
**Note:** For CUDA 7.0 and CUDA 7.5, GCC 5.0 and up are not supported!
For CUDA 8.0, GCC versions later than 5.3 are not supported!
......@@ -107,6 +108,18 @@ As a simple example, consider the following:
sudo apt-get install -y python python-pip python-numpy libpython-dev bison
sudo pip install 'protobuf==3.1.0.post1'
# Install Go
# You can follow https://golang.org/doc/install for a detailed explanation.
wget -O go.tgz https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz && \
tar -C $HOME -xzf go.tgz && \
mkdir $HOME/gopath && \
rm go.tgz
# Setup environment variables
export GOROOT=$HOME/go
export GOPATH=$HOME/gopath
export PATH=$PATH:$GOROOT/bin
# install cmake 3.4
curl -sSL https://cmake.org/files/v3.4/cmake-3.4.1.tar.gz | tar -xz && \
cd cmake-3.4.1 && ./bootstrap && make -j4 && sudo make install && \
......
......@@ -4,6 +4,7 @@ RNN-related Models
.. toctree::
:maxdepth: 1
rnn_config_cn.rst
recurrent_group_cn.md
hierarchical_layer_cn.rst
hrnn_rnn_api_compare_cn.rst
RNN Models
==========
.. toctree::
:maxdepth: 1
rnn_config_en.rst
......@@ -5,36 +5,13 @@ RNN Configuration
configure recurrent neural networks (RNNs) in PaddlePaddle. PaddlePaddle
provides highly flexible and efficient support for recurrent neural network configuration. In this tutorial, you will learn how to:
- prepare sequence data for learning recurrent neural networks.
- configure the recurrent neural network architecture.
- generate sequences with a trained recurrent neural network model.
We will use a vanilla recurrent neural network and a sequence-to-sequence
model to guide you through these steps. The code of the sequence-to-sequence
model can be found at ``demo/seqToseq``.
Prepare Sequence Data
---------------------
PaddlePaddle does not require any preprocessing of sequence data, such as padding. The only thing that needs to be done is to set the corresponding input types. For example, the following code snippet defines three inputs.
All of them are sequences, and their sizes are ``src_dict``, ``trg_dict``, and ``trg_dict``:
.. code:: python
settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
In the ``process`` function, each ``yield`` statement returns three integer lists. Each integer list is treated as a sequence of integers:
.. code:: python
yield src_ids, trg_ids, trg_ids_next
For a more detailed description of how to write a data provider, please refer to :ref:`api_pydataprovider2`. The full data provider file is located at
``demo/seqToseq/dataprovider.py``.
The code of the model can be found at `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ .
The data provider for the wmt14 dataset is located at `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ .
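The short sketch below is not part of this change set; it shows one way the wmt14 reader might be iterated to inspect a few samples. The exact reader signature and the dictionary size are assumptions based on the v2 dataset API, not code from this commit.

.. code:: python

    # Hypothetical usage sketch: peek at a few wmt14 training samples.
    # Assumes wmt14.train(dict_size) returns a reader creator whose samples
    # are (source_ids, target_ids, target_next_ids) integer sequences.
    import paddle.v2.dataset.wmt14 as wmt14

    dict_size = 30000  # assumed dictionary size

    for i, sample in enumerate(wmt14.train(dict_size)()):
        src_ids, trg_ids, trg_next_ids = sample
        print(len(src_ids), len(trg_ids), len(trg_next_ids))
        if i >= 2:
            break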
Configure Recurrent Neural Network Architecture
------------------------------------------------
......@@ -85,19 +62,19 @@ vanilla
act=None,
rnn_layer_attr=None):
def __rnn_step__(ipt):
out_mem = memory(name=name, size=size)
rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
full_matrix_projection(out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
out_mem = paddle.layer.memory(name=name, size=size)
rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
paddle.layer.full_matrix_projection(input=out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
return rnn_out
return recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
PaddlePaddle uses memory to implement the step function. **Memory** is the most
important concept when constructing recurrent neural networks in PaddlePaddle.
......@@ -140,43 +117,52 @@ Sequence to Sequence Model with Attention
.. code:: python
# Define the data layer of the source sentence.
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
# Calculate the word embedding of each word.
src_embedding = embedding_layer(
src_embedding = paddle.layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
# Apply a forward recurrent neural network.
src_forward = grumemory(input=src_embedding, size=encoder_size)
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
# Apply a backward recurrent neural network (reverse=True means backward recurrent neural network).
src_backward = grumemory(input=src_embedding,
size=encoder_size,
reverse=True)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
# Mix the forward and backward parts of the recurrent neural network together.
encoded_vector = concat_layer(input=[src_forward, src_backward])
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
# Project the encoding vector to decoder_size.
encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
size = decoder_size)
encoded_proj = paddle.layer.mixed(
size=decoder_size,
input=paddle.layer.full_matrix_projection(encoded_vector))
# Compute the first instance of the backward RNN.
backward_first = first_seq(input=src_backward)
backward_first = paddle.layer.first_seq(input=src_backward)
# Project the first instance of the backward RNN to decoder_size.
decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
decoder_boot = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(backward_first))
The decoder uses ``recurrent_group`` to define the recurrent neural network. The step and output functions are defined in
``gru_decoder_with_attention``:
.. code:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
group_inputs.append(trg_embedding)
# For a decoder equipped with the attention mechanism, in training,
......@@ -185,9 +171,10 @@ Sequence to Sequence Model with Attention
# StaticInput means the same value is used at different time steps.
# Otherwise, it is a sequence input; inputs at different time steps are different.
# All sequence inputs should have the same length.
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines the
attention, the gated recurrent unit step function, and the output function:
......@@ -198,27 +185,32 @@ attention, the gated recurrent unit step function, and the output function:
# Define the memory of the decoder.
# The output of this memory is defined in gru_step.
# Notice that gru_step should have the same name as this memory.
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
# Compute the attention weighted encoder vector.
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
# Mix the current word embedding and the attention weighted encoder vector.
decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
full_matrix_projection(current_word)],
size = decoder_size * 3)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
# Define the gated recurrent unit step function.
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
# Define the output function.
out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation())
out = paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step))
return out
Generate Sequences
......@@ -238,41 +230,32 @@ attention, the gated recurrent unit step function, and the output function:
- ``beam_size``: the beam size used in the beam search algorithm.
- ``max_length``: the maximum length of the generated sequences.
- Use ``seqtext_printer_evaluator`` to print text according to the index
  matrix and the dictionary. This function needs to set:
- ``id_input``: the integer ID of the data, used to identify the corresponding output in the generated files.
- ``dict_file``: the dictionary file for converting word IDs to words.
- ``result_file``: the path of the generation result file.
The code is listed below:
.. code:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
# In generation, the decoder predicts the next target word based on the encoded source sequence and the last generated target word.
# The encoded source sequence (the encoder's output) must be specified by StaticInput, which is a read-only memory.
# Here, GeneratedInput automatically fetches the last generated word and is initialized with a start mark, such as <s>.
trg_embedding = GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
Note that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
The full configuration file is located at ``demo/seqToseq/seqToseq_net.py``.
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
return beam_gen
Note that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ for more details.
The full configuration file is located at `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ .
......@@ -3,34 +3,11 @@ RNN Configuration
This tutorial will guide you how to configure recurrent neural network in PaddlePaddle. PaddlePaddle supports highly flexible and efficient recurrent neural network configuration. In this tutorial, you will learn how to:
- prepare sequence data for learning recurrent neural networks.
- configure recurrent neural network architecture.
- generate sequence with learned recurrent neural network models.
We will use a vanilla recurrent neural network and a sequence-to-sequence model to guide you through these steps. The code of the sequence-to-sequence model can be found at :code:`demo/seqToseq`.
=====================
Prepare Sequence Data
=====================
PaddlePaddle does not require any preprocessing of sequence data, such as padding. The only thing that needs to be done is to set the corresponding input types. For example, the following code snippet defines three inputs. All of them are sequences, and their sizes are :code:`src_dict`, :code:`trg_dict`, and :code:`trg_dict`:
.. code-block:: python
settings.input_types = [
integer_value_sequence(len(settings.src_dict)),
integer_value_sequence(len(settings.trg_dict)),
integer_value_sequence(len(settings.trg_dict))]
Then, in the :code:`process` function, each :code:`yield` statement returns three integer lists. Each integer list is treated as a sequence of integers:
.. code-block:: python
yield src_ids, trg_ids, trg_ids_next
For a more detailed description of how to write a data provider, please refer to :ref:`api_pydataprovider2`. The full data provider file is located at :code:`demo/seqToseq/dataprovider.py`.
We will use a vanilla recurrent neural network and a sequence-to-sequence model to guide you through these steps. The code of the sequence-to-sequence model can be found at `book/08.machine_translation <https://github.com/PaddlePaddle/book/tree/develop/08.machine_translation>`_ .
The data preparation for this model can be found at `python/paddle/v2/dataset/wmt14.py <https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/dataset/wmt14.py>`_ .
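As an illustrative sketch (not taken from this commit), the three fields of each wmt14 sample can be mapped onto data layers through a feeding dictionary; the layer name for the next target word below is a hypothetical choice, and the commented trainer call is only indicative.

.. code-block:: python

    # Hypothetical feeding map, assuming each wmt14 sample is the tuple
    # (source_ids, target_ids, target_next_ids) and that the data layers
    # are named as below.
    feeding = {
        'source_language_word': 0,
        'target_language_word': 1,
        'target_language_next_word': 2
    }

    # A v2 trainer would typically consume it roughly like this (sketch only):
    # trainer.train(
    #     reader=paddle.batch(paddle.dataset.wmt14.train(dict_size), batch_size=64),
    #     feeding=feeding,
    #     num_passes=2)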
===============================================
Configure Recurrent Neural Network Architecture
......@@ -75,19 +52,19 @@ Its **output function** simply takes :math:`x_t` as the output.
act=None,
rnn_layer_attr=None):
def __rnn_step__(ipt):
out_mem = memory(name=name, size=size)
rnn_out = mixed_layer(input = [full_matrix_projection(ipt),
full_matrix_projection(out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
out_mem = paddle.layer.memory(name=name, size=size)
rnn_out = paddle.layer.mixed(input = [paddle.layer.full_matrix_projection(input=ipt),
paddle.layer.full_matrix_projection(input=out_mem)],
name = name,
bias_attr = rnn_bias_attr,
act = act,
layer_attr = rnn_layer_attr,
size = size)
return rnn_out
return recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
return paddle.layer.recurrent_group(name='%s_recurrent_group' % name,
step=__rnn_step__,
reverse=reverse,
input=input)
PaddlePaddle uses memory to construct the step function. **Memory** is the most important concept when constructing recurrent neural networks in PaddlePaddle. A memory is a state that is used recurrently in step functions, such as :math:`x_{t+1} = f_x(x_t)`. One memory contains an **output** and an **input**. The output of the memory at the current time step is utilized as the input of the memory at the next time step. A memory can also have a **boot layer**, whose output is utilized as the initial value of the memory. In our case, the output of the gated recurrent unit is employed as the output memory. Notice that the name of the layer :code:`rnn_out` is the same as the name of :code:`out_mem`. This means the output of the layer :code:`rnn_out` (:math:`x_{t+1}`) is utilized as the **output** of the :code:`out_mem` memory.
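To make the naming convention explicit, here is a minimal sketch (not additional code from this commit) of how a memory and a layer sharing its name interact inside :code:`recurrent_group`; the sizes and names are illustrative only.

.. code-block:: python

    import paddle.v2 as paddle

    def step(x):
        # Read the previous time step's output of the layer named 'rnn_state'.
        prev = paddle.layer.memory(name='rnn_state', size=128)
        # Because this layer is also named 'rnn_state', its output becomes
        # the memory's value at the next time step.
        return paddle.layer.mixed(
            name='rnn_state',
            size=128,
            input=[
                paddle.layer.full_matrix_projection(input=x),
                paddle.layer.full_matrix_projection(input=prev)
            ])

    # rnn = paddle.layer.recurrent_group(name='toy_rnn', step=step, input=seq)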
......@@ -113,43 +90,52 @@ We also project the encoder vector to :code:`decoder_size` dimensional space, ge
.. code-block:: python
# Define the data layer of the source sentence.
src_word_id = data_layer(name='source_language_word', size=source_dict_dim)
src_word_id = paddle.layer.data(
name='source_language_word',
type=paddle.data_type.integer_value_sequence(source_dict_dim))
# Calculate the word embedding of each word.
src_embedding = embedding_layer(
src_embedding = paddle.layer.embedding(
input=src_word_id,
size=word_vector_dim,
param_attr=ParamAttr(name='_source_language_embedding'))
param_attr=paddle.attr.ParamAttr(name='_source_language_embedding'))
# Apply forward recurrent neural network.
src_forward = grumemory(input=src_embedding, size=encoder_size)
src_forward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size)
# Apply backward recurrent neural network. reverse=True means backward recurrent neural network.
src_backward = grumemory(input=src_embedding,
size=encoder_size,
reverse=True)
src_backward = paddle.networks.simple_gru(
input=src_embedding, size=encoder_size, reverse=True)
# Mix the forward and backward parts of the recurrent neural network together.
encoded_vector = concat_layer(input=[src_forward, src_backward])
encoded_vector = paddle.layer.concat(input=[src_forward, src_backward])
# Project encoding vector to decoder_size.
encoder_proj = mixed_layer(input = [full_matrix_projection(encoded_vector)],
size = decoder_size)
encoded_proj = paddle.layer.mixed(
size=decoder_size,
input=paddle.layer.full_matrix_projection(encoded_vector))
# Compute the first instance of the backward RNN.
backward_first = first_seq(input=src_backward)
backward_first = paddle.layer.first_seq(input=src_backward)
# Project the first instance of backward RNN to decoder size.
decoder_boot = mixed_layer(input=[full_matrix_projection(backward_first)], size=decoder_size, act=TanhActivation())
decoder_boot = paddle.layer.mixed(
size=decoder_size,
act=paddle.activation.Tanh(),
input=paddle.layer.full_matrix_projection(backward_first))
The decoder uses :code:`recurrent_group` to define the recurrent neural network. The step and output functions are defined in :code:`gru_decoder_with_attention`:
.. code-block:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
trg_embedding = embedding_layer(
input=data_layer(name='target_language_word',
size=target_dict_dim),
size=word_vector_dim,
param_attr=ParamAttr(name='_target_language_embedding'))
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
trg_embedding = paddle.layer.embedding(
input=paddle.layer.data(
name='target_language_word',
type=paddle.data_type.integer_value_sequence(target_dict_dim)),
size=word_vector_dim,
param_attr=paddle.attr.ParamAttr(name='_target_language_embedding'))
group_inputs.append(trg_embedding)
group_inputs.append(trg_embedding)
# For decoder equipped with attention mechanism, in training,
......@@ -158,9 +144,10 @@ The decoder uses :code:`recurrent_group` to define the recurrent neural network.
# StaticInput means the same value is utilized at different time steps.
# Otherwise, it is a sequence input. Inputs at different time steps are different.
# All sequence inputs should have the same length.
decoder = recurrent_group(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
decoder = paddle.layer.recurrent_group(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs)
The implementation of the step function is listed below. First, it defines the **memory** of the decoder network. Then it defines the attention, the gated recurrent unit step function, and the output function:
......@@ -171,27 +158,32 @@ The implementation of the step function is listed as below. First, it defines th
# Defines the memory of the decoder.
# The output of this memory is defined in gru_step.
# Notice that the name of gru_step should be the same as the name of this memory.
decoder_mem = memory(name='gru_decoder',
size=decoder_size,
boot_layer=decoder_boot)
decoder_mem = paddle.layer.memory(
name='gru_decoder', size=decoder_size, boot_layer=decoder_boot)
# Compute attention weighted encoder vector.
context = simple_attention(encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
context = paddle.networks.simple_attention(
encoded_sequence=enc_vec,
encoded_proj=enc_proj,
decoder_state=decoder_mem)
# Mix the current word embedding and the attention weighted encoder vector.
decoder_inputs = mixed_layer(inputs = [full_matrix_projection(context),
full_matrix_projection(current_word)],
size = decoder_size * 3)
decoder_inputs = paddle.layer.mixed(
size=decoder_size * 3,
input=[
paddle.layer.full_matrix_projection(input=context),
paddle.layer.full_matrix_projection(input=current_word)
])
# Define Gated recurrent unit recurrent neural network step function.
gru_step = gru_step_layer(name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
gru_step = paddle.layer.gru_step(
name='gru_decoder',
input=decoder_inputs,
output_mem=decoder_mem,
size=decoder_size)
# Defines the output function.
out = mixed_layer(input=[full_matrix_projection(input=gru_step)],
size=target_dict_dim,
bias_attr=True,
act=SoftmaxActivation())
out = paddle.layer.mixed(
size=target_dict_dim,
bias_attr=True,
act=paddle.activation.Softmax(),
input=paddle.layer.full_matrix_projection(input=gru_step))
return out
......@@ -207,45 +199,37 @@ After training the model, we can use it to generate sequences. A common practice
- :code:`eos_id`: the end token. Every sentence ends with the end token.
- :code:`beam_size`: the beam size used in beam search.
- :code:`max_length`: the maximum length of the generated sentences.
* use :code:`seqtext_printer_evaluator` to print text according to the index matrix and the dictionary. This function needs to set:
- :code:`id_input`: the integer ID of the data, used to identify the corresponding output in the generated files.
- :code:`dict_file`: the dictionary file for converting word id to word.
- :code:`result_file`: the path of the generation result file.
The code is listed below:
.. code-block:: python
group_inputs=[StaticInput(input=encoded_vector,is_seq=True),
StaticInput(input=encoded_proj,is_seq=True)]
group_input1 = paddle.layer.StaticInput(input=encoded_vector, is_seq=True)
group_input2 = paddle.layer.StaticInput(input=encoded_proj, is_seq=True)
group_inputs = [group_input1, group_input2]
# In generation, decoder predicts a next target word based on
# the encoded source sequence and the last generated target word.
# The encoded source sequence (encoder's output) must be specified by
# StaticInput which is a read-only memory.
# Here, GeneratedInput automatically fetches the last generated word,
# which is initialized by a start mark, such as <s>.
trg_embedding = GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
trg_embedding = paddle.layer.GeneratedInput(
size=target_dict_dim,
embedding_name='_target_language_embedding',
embedding_size=word_vector_dim)
group_inputs.append(trg_embedding)
beam_gen = beam_search(name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
beam_gen = paddle.layer.beam_search(
name=decoder_group_name,
step=gru_decoder_with_attention,
input=group_inputs,
bos_id=0, # Beginning token.
eos_id=1, # End of sentence token.
beam_size=beam_size,
max_length=max_length)
seqtext_printer_evaluator(input=beam_gen,
id_input=data_layer(name="sent_id", size=1),
dict_file=trg_dict_path,
result_file=gen_trans_file)
outputs(beam_gen)
return beam_gen
Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to :ref:`semantic_role_labeling` for more details.
Notice that this generation technique is only useful for decoder-like generation processes. If you are working on sequence tagging tasks, please refer to `book/06.understand_sentiment <https://github.com/PaddlePaddle/book/tree/develop/06.understand_sentiment>`_ for more details.
The full configuration file is located at :code:`demo/seqToseq/seqToseq_net.py`.
The full configuration file is located at `book/08.machine_translation/train.py <https://github.com/PaddlePaddle/book/blob/develop/08.machine_translation/train.py>`_ .
......@@ -26,27 +26,23 @@ function(GO_LIBRARY NAME BUILD_TYPE)
# automatically get all dependencies specified in the source code
# for given target.
add_custom_target(goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
add_custom_target(${NAME}_goGet env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} get -d ${rel}/...)
# make a symlink that references Paddle inside $GOPATH, so go get
# will use the local changes in Paddle rather than checking out Paddle
# from GitHub.
add_custom_target(copyPaddle
add_custom_target(${NAME}_copyPaddle
COMMAND rm -rf ${PADDLE_IN_GOPATH}/Paddle
COMMAND ln -sf ${PADDLE_DIR} ${PADDLE_IN_GOPATH}/Paddle)
add_dependencies(goGet copyPaddle)
add_dependencies(${NAME}_goGet ${NAME}_copyPaddle)
add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
COMMAND env GOPATH=${GOPATH} ${CMAKE_Go_COMPILER} build ${BUILD_MODE}
-gcflags=-shared -asmflags=-shared -installsuffix=_shared -a
-o "${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME}"
${CMAKE_GO_FLAGS} ${GO_SOURCE}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
add_custom_target(${NAME} ALL DEPENDS ${OUTPUT_DIR}/.timestamp ${ARGN})
add_dependencies(${NAME} goGet)
add_dependencies(${NAME} ${NAME}_goGet)
if(NOT BUILD_TYPE STREQUAL "STATIC")
install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${LIB_NAME} DESTINATION bin)
endif()
endfunction(GO_LIBRARY)
......@@ -2,9 +2,10 @@ package connection
import (
"errors"
"log"
"net/rpc"
"sync"
log "github.com/sirupsen/logrus"
)
// TODO(helin): add TCP re-connect logic
......@@ -65,7 +66,7 @@ func (c *Conn) Connect(addr string) error {
} else {
err := client.Close()
if err != nil {
log.Println(err)
log.Errorln(err)
}
return errors.New("client already set from a concurrent goroutine")
......
cmake_minimum_required(VERSION 3.0)
get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
get_filename_component(PARENT_DIR ${PARENT_DIR} DIRECTORY)
set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${PARENT_DIR}/cmake")
project(cxx_go C Go)
include(golang)
include(flags)
set(MASTER_LIB_NAME "paddle_master")
go_library(${MASTER_LIB_NAME} SHARED)
if(PROJ_ROOT)
add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so
COMMAND rm ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.h
COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/lib${MASTER_LIB_NAME}.so ${PROJ_ROOT}/python/paddle/v2/master/
DEPENDS ${MASTER_LIB_NAME})
add_custom_target(paddle_master_shared ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/master/lib${MASTER_LIB_NAME}.so)
endif(PROJ_ROOT)
package main
/*
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#define PADDLE_MASTER_OK 0
#define PADDLE_MASTER_ERROR -1
typedef int paddle_master_client;
*/
import "C"
import (
"sync"
"unsafe"
"github.com/PaddlePaddle/Paddle/go/master"
log "github.com/sirupsen/logrus"
)
var nullPtr = unsafe.Pointer(uintptr(0))
var mu sync.Mutex
var handleMap = make(map[C.paddle_master_client]*master.Client)
var curHandle C.paddle_master_client
func add(c *master.Client) C.paddle_master_client {
mu.Lock()
defer mu.Unlock()
client := curHandle
curHandle++
handleMap[client] = c
return client
}
func get(client C.paddle_master_client) *master.Client {
mu.Lock()
defer mu.Unlock()
return handleMap[client]
}
func remove(client C.paddle_master_client) *master.Client {
mu.Lock()
defer mu.Unlock()
h := handleMap[client]
delete(handleMap, client)
return h
}
type addresser string
func (a addresser) Address() string {
return string(a)
}
//export paddle_new_master_client
func paddle_new_master_client(addr *C.char, bufSize int) C.paddle_master_client {
a := C.GoString(addr)
c := master.NewClient(addresser(a), bufSize)
return add(c)
}
//export paddle_release_master_client
func paddle_release_master_client(client C.paddle_master_client) {
remove(client)
}
//export paddle_set_dataset
func paddle_set_dataset(client C.paddle_master_client, path **C.char, size C.int) C.int {
c := get(client)
var paths []string
for i := 0; i < int(size); i++ {
ptr := (**C.char)(unsafe.Pointer(uintptr(unsafe.Pointer(path)) + uintptr(i)*unsafe.Sizeof(*path)))
str := C.GoString(*ptr)
paths = append(paths, str)
}
err := c.SetDataset(paths)
if err != nil {
log.Errorln(err)
return C.PADDLE_MASTER_ERROR
}
return C.PADDLE_MASTER_OK
}
//export paddle_next_record
func paddle_next_record(client C.paddle_master_client, record **C.uchar) C.int {
c := get(client)
r := c.NextRecord()
if len(r) == 0 {
*record = (*C.uchar)(nullPtr)
return 0
}
size := C.size_t(len(r))
*record = (*C.uchar)(C.malloc(size))
C.memcpy(unsafe.Pointer(*record), unsafe.Pointer(&r[0]), size)
return C.int(size)
}
//export mem_free
func mem_free(p unsafe.Pointer) {
// "free" may be a better name for this function, but doing so
// will cause calling any function of this library from Python
// ctypes hanging.
C.free(p)
}
func main() {}
package master
import (
"log"
"os"
"time"
"github.com/PaddlePaddle/Paddle/go/connection"
"github.com/PaddlePaddle/recordio"
log "github.com/sirupsen/logrus"
)
// Addresser provides the address of the master server.
......@@ -15,16 +17,61 @@ type Addresser interface {
// Client is the client of the master server.
type Client struct {
conn *connection.Conn
ch chan []byte
}
// NewClient creates a new Client.
func NewClient(addr Addresser) *Client {
//
// bufSize is the record buffer size. NextRecord will read from this
// buffer.
func NewClient(addr Addresser, bufSize int) *Client {
c := &Client{}
c.conn = connection.New()
c.ch = make(chan []byte, bufSize)
go c.monitorMaster(addr)
go c.getRecords()
return c
}
func (c *Client) getRecords() {
for {
t, err := c.getTask()
if err != nil {
// TODO(helin): wait before moving on to the next
// getTask call.
log.Errorln(err)
continue
}
for _, chunk := range t.Chunks {
f, err := os.Open(chunk.Path)
if err != nil {
log.Errorln(err)
continue
}
s := recordio.NewRangeScanner(f, &chunk.Index, -1, -1)
for s.Scan() {
c.ch <- s.Record()
}
if s.Err() != nil {
log.Errorln(err, chunk.Path)
}
err = f.Close()
if err != nil {
log.Errorln(err)
}
}
// We treat a task as finished whenever the last data
// instance of the task is read. This is not exactly
// correct, but a reasonable approximation.
c.taskFinished(t.ID)
}
}
func (c *Client) monitorMaster(addr Addresser) {
lastMaster := ""
monitor := func() {
......@@ -35,12 +82,12 @@ func (c *Client) monitorMaster(addr Addresser) {
if curMaster == "" {
err := c.conn.Close()
if err != nil {
log.Println(err)
log.Errorln(err)
}
} else {
err := c.conn.Connect(curMaster)
if err != nil {
log.Println(err)
log.Errorln(err)
// connect to addr failed, set
// to last known addr in order
......@@ -69,14 +116,22 @@ func (c *Client) SetDataset(globPaths []string) error {
return c.conn.Call("Service.SetDataset", globPaths, nil)
}
// GetTask gets a new task from the master server.
func (c *Client) GetTask() (Task, error) {
// getTask gets a new task from the master server.
func (c *Client) getTask() (Task, error) {
var t Task
err := c.conn.Call("Service.GetTask", 0, &t)
return t, err
}
// TaskFinished tells the master server a task is finished.
func (c *Client) TaskFinished(taskID int) error {
func (c *Client) taskFinished(taskID int) error {
return c.conn.Call("Service.TaskFinished", taskID, nil)
}
// NextRecord returns next record in the dataset.
//
// NextRecord will block until the next record is available. It is
// thread-safe.
func (c *Client) NextRecord() []byte {
return <-c.ch
}
package master
import (
"fmt"
"net"
"net/http"
"net/rpc"
"os"
"strconv"
"strings"
"testing"
"time"
log "github.com/sirupsen/logrus"
"github.com/PaddlePaddle/Paddle/go/connection"
"github.com/PaddlePaddle/recordio"
)
const (
totalTask = 20
chunkPerTask = 10
)
func init() {
log.SetLevel(log.ErrorLevel)
}
type TestAddresser string
func (a TestAddresser) Address() string {
return string(a)
}
func TestGetFinishTask(t *testing.T) {
const path = "/tmp/master_client_test_0"
l, err := net.Listen("tcp", ":0")
if err != nil {
panic(err)
}
ss := strings.Split(l.Addr().String(), ":")
p, err := strconv.Atoi(ss[len(ss)-1])
if err != nil {
panic(err)
}
go func(l net.Listener) {
s := NewService(chunkPerTask, time.Second, 1)
server := rpc.NewServer()
err := server.Register(s)
if err != nil {
panic(err)
}
mux := http.NewServeMux()
mux.Handle(rpc.DefaultRPCPath, server)
err = http.Serve(l, mux)
if err != nil {
panic(err)
}
}(l)
f, err := os.Create(path)
if err != nil {
panic(err)
}
for i := 0; i < totalTask*chunkPerTask; i++ {
w := recordio.NewWriter(f, -1, -1)
w.Write(nil)
// Call Close to force RecordIO to write a chunk.
w.Close()
}
f.Close()
// Manually initialize the client to avoid calling c.getRecords().
c := &Client{}
c.conn = connection.New()
go c.monitorMaster(TestAddresser(fmt.Sprintf(":%d", p)))
c.SetDataset([]string{path})
checkOnePass := func(i int) {
var tasks []Task
for idx := 0; idx < totalTask; idx++ {
task, err := c.getTask()
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
tasks = append(tasks, task)
}
_, err = c.getTask()
if err == nil {
t.Fatalf("Should get error, pass: %d\n", i)
}
err = c.taskFinished(tasks[0].ID)
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
tasks = tasks[1:]
task, err := c.getTask()
if err != nil {
t.Fatal(err)
}
tasks = append(tasks, task)
for _, task := range tasks {
err = c.taskFinished(task.ID)
if err != nil {
t.Fatalf("Error: %v, pass: %d\n", err, i)
}
}
}
for i := 0; i < 10; i++ {
checkOnePass(i)
}
}
......@@ -11,21 +11,15 @@ import (
"testing"
"time"
log "github.com/sirupsen/logrus"
"github.com/PaddlePaddle/Paddle/go/master"
"github.com/PaddlePaddle/recordio"
)
const (
totalTask = 20
chunkPerTask = 10
)
var port int
func init() {
log.SetLevel(log.ErrorLevel)
func TestNextRecord(t *testing.T) {
const (
path = "/tmp/master_client_TestFull"
total = 50
)
l, err := net.Listen("tcp", ":0")
if err != nil {
......@@ -37,10 +31,9 @@ func init() {
if err != nil {
panic(err)
}
port = p
go func(l net.Listener) {
s := master.NewService(chunkPerTask, time.Second, 1)
s := master.NewService(10, time.Second, 1)
server := rpc.NewServer()
err := server.Register(s)
if err != nil {
......@@ -54,67 +47,33 @@ func init() {
panic(err)
}
}(l)
}
type addresser string
func (a addresser) Address() string {
return string(a)
}
func TestClientFull(t *testing.T) {
const p = "/tmp/master_client_test_0"
f, err := os.Create(p)
f, err := os.Create(path)
if err != nil {
panic(err)
}
for i := 0; i < totalTask*chunkPerTask; i++ {
w := recordio.NewWriter(f, -1, -1)
w.Write(nil)
// Call Close to force RecordIO to write a chunk.
w.Close()
w := recordio.NewWriter(f, -1, -1)
for i := 0; i < total; i++ {
w.Write([]byte{byte(i)})
}
w.Close()
f.Close()
c := master.NewClient(addresser(fmt.Sprintf(":%d", port)))
c.SetDataset([]string{p})
c := master.NewClient(master.TestAddresser(fmt.Sprintf(":%d", p)), 10)
c.SetDataset([]string{path})
checkOnePass := func(i int) {
var tasks []master.Task
for i := 0; i < totalTask; i++ {
task, err := c.GetTask()
if err != nil {
t.Fatal(i, err)
for pass := 0; pass < 50; pass++ {
received := make(map[byte]bool)
for i := 0; i < total; i++ {
r := c.NextRecord()
if len(r) != 1 {
t.Fatal("Length should be 1.", r)
}
tasks = append(tasks, task)
}
_, err = c.GetTask()
if err == nil {
t.Fatal(i, "should get error.")
}
err = c.TaskFinished(tasks[0].ID)
if err != nil {
t.Fatal(err)
}
tasks = tasks[1:]
task, err := c.GetTask()
if err != nil {
t.Fatal(err)
}
tasks = append(tasks, task)
for _, task := range tasks {
err = c.TaskFinished(task.ID)
if err != nil {
t.Fatal(i, err)
if received[r[0]] {
t.Fatal("Received duplicate.", received, r)
}
received[r[0]] = true
}
}
for i := 0; i < 10; i++ {
checkOnePass(i)
}
}
......@@ -207,16 +207,26 @@ func (s *Service) checkTimeoutFunc(taskID int, epoch int) func() {
t.NumTimeout++
if t.NumTimeout > s.timeoutMax {
log.Warningf("Task %v failed %d times, discard.\n", t.Task, t.NumTimeout)
log.Warningf("Task %v timed out %d times, discard.\n", t.Task, t.NumTimeout)
s.taskQueues.Failed = append(s.taskQueues.Failed, t.Task)
return
}
log.Warningf("Task %v failed %d times, retry.\n", t.Task, t.NumTimeout)
log.Warningf("Task %v timed out %d times, retry.\n", t.Task, t.NumTimeout)
s.taskQueues.Todo = append(s.taskQueues.Todo, t)
}
}
// must be called with lock held.
func (s *Service) logFields() log.Fields {
return log.Fields{
"todoLen": len(s.taskQueues.Todo),
"pendingLen": len(s.taskQueues.Pending),
"doneLen": len(s.taskQueues.Done),
"failedLen": len(s.taskQueues.Failed),
}
}
// GetTask gets a new task from the service.
func (s *Service) GetTask(dummy int, task *Task) error {
select {
......@@ -230,7 +240,7 @@ func (s *Service) GetTask(dummy int, task *Task) error {
if len(s.taskQueues.Done) == 0 {
if len(s.taskQueues.Pending) == 0 {
err := errors.New("all task failed")
log.Warningln(err)
log.WithFields(s.logFields()).Warningln("All tasks failed.")
return err
}
......@@ -243,12 +253,12 @@ func (s *Service) GetTask(dummy int, task *Task) error {
// in package. So we need to figure out a way
// for client to check this error correctly.
err := errors.New("no more available task")
log.Warningln(err)
log.WithFields(s.logFields()).Warningln("No more available task.")
return err
}
s.taskQueues.Todo = s.taskQueues.Done
s.taskQueues.Done = nil
log.Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
log.WithFields(s.logFields()).Infoln("No more todo task, but trainer is requesting task to do. Move all done task to todo.")
}
t := s.taskQueues.Todo[0]
......@@ -261,7 +271,7 @@ func (s *Service) GetTask(dummy int, task *Task) error {
}
*task = t.Task
log.Infof("Task #%d dispatched\n", task.ID)
log.WithFields(s.logFields()).Infof("Task #%d dispatched.", task.ID)
time.AfterFunc(s.timeoutDur, s.checkTimeoutFunc(t.Task.ID, t.Epoch))
return nil
......@@ -276,12 +286,10 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
s.mu.Lock()
defer s.mu.Unlock()
log.Infof("Task %d finished\n", taskID)
t, ok := s.taskQueues.Pending[taskID]
if !ok {
err := errors.New("pending task not found")
log.Warningln(err)
log.WithFields(s.logFields()).Warningf("Pending task #%d not found.", taskID)
return err
}
......@@ -290,8 +298,10 @@ func (s *Service) TaskFinished(taskID int, dummy *int) error {
s.taskQueues.Done = append(s.taskQueues.Done, t)
delete(s.taskQueues.Pending, taskID)
log.WithFields(s.logFields()).Infof("Task #%d finished.", taskID)
if len(s.taskQueues.Pending) == 0 && len(s.taskQueues.Todo) == 0 {
log.Infoln("No more todo and pending task, start a new pass.")
log.WithFields(s.logFields()).Infoln("No more todo and pending task, start a new pass.")
s.taskQueues.Todo = append(s.taskQueues.Todo, s.taskQueues.Done...)
s.taskQueues.Done = nil
}
......
package main
/*
#include <stdlib.h>
#include <string.h>
typedef enum {
PADDLE_ELEMENT_TYPE_INT32 = 0,
......@@ -26,12 +25,12 @@ typedef int paddle_pserver_client;
import "C"
import (
"log"
"strings"
"sync"
"unsafe"
"github.com/PaddlePaddle/Paddle/go/pserver"
log "github.com/sirupsen/logrus"
)
var nullPtr = unsafe.Pointer(uintptr(0))
......@@ -134,10 +133,10 @@ func paddle_init_param(client C.paddle_pserver_client, param C.paddle_parameter,
if err != nil {
if err.Error() == pserver.AlreadyInitialized {
log.Printf("parameter %s already initialized, treat paddle_init_param as sucessful.\n", name)
log.Warningf("parameter %s already initialized, treat paddle_init_param as sucessful.\n", name)
return C.PSERVER_OK
}
log.Println(err)
log.Errorln(err)
return C.PSERVER_ERROR
}
......@@ -150,11 +149,11 @@ func paddle_finish_init_params(client C.paddle_pserver_client) C.int {
err := c.FinishInitParams()
if err != nil {
if err.Error() == pserver.AlreadyInitialized {
log.Println("parameters already initialized, treat paddle_finish_init_params as sucessful.")
log.Warningln("parameters already initialized, treat paddle_finish_init_params as sucessful.")
return C.PSERVER_OK
}
log.Println(err)
log.Errorln(err)
return C.PSERVER_ERROR
}
......@@ -175,7 +174,7 @@ func paddle_send_grads(client C.paddle_pserver_client, grads **C.paddle_gradient
c := get(client)
err := c.SendGrads(gs)
if err != nil {
log.Println(err)
log.Errorln(err)
return C.PSERVER_ERROR
}
......@@ -192,7 +191,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
c := get(client)
ps, err := c.GetParams(ns)
if err != nil {
log.Println(err)
log.Errorln(err)
return C.PSERVER_ERROR
}
......@@ -201,7 +200,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
for i, p := range ps {
pn[i] = p.Name
}
log.Printf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
log.Errorf("pserver returned wrong number of parameters. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
return C.PSERVER_ERROR
}
......@@ -211,7 +210,7 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
for i, p := range ps {
pn[i] = p.Name
}
log.Printf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
log.Errorf("pserver returned wrong parameters, or not in requested order. Requested: %s, returned: %s.\n", strings.Join(pn, ", "), strings.Join(ns, ", "))
return C.PSERVER_ERROR
}
}
......@@ -221,14 +220,14 @@ func paddle_get_params(client C.paddle_pserver_client, dst **C.paddle_parameter,
param := *(**C.paddle_parameter)(unsafe.Pointer((uintptr(unsafe.Pointer(dst)) + uintptr(i)*unsafe.Sizeof(*dst))))
if unsafe.Pointer(param) == nullPtr {
log.Println("must pre-allocate parameter.")
log.Errorln("must pre-allocate parameter.")
return C.PSERVER_ERROR
} else {
if unsafe.Pointer(param.content) != nullPtr {
if int(param.content_len) != len(p.Content) {
log.Printf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
return C.PSERVER_ERROR
}
}
if unsafe.Pointer(param.content) != nullPtr {
if int(param.content_len) != len(p.Content) {
log.Errorf("the pre-allocated content len does not match parameter content len. Pre-allocated len: %d, returned len: %d", param.content_len, len(p.Content))
return C.PSERVER_ERROR
}
}
......@@ -246,7 +245,7 @@ func paddle_save_model(client C.paddle_pserver_client, path *C.char) C.int {
c := get(client)
err := c.Save(p)
if err != nil {
log.Println(err)
log.Errorln(err)
return C.PSERVER_ERROR
}
......
#include <stdio.h>
#include <stdlib.h>
#include "libpaddle_pserver_cclient.h"
......
......@@ -2,11 +2,11 @@ package pserver
import (
"hash/fnv"
"log"
"sort"
"time"
"github.com/PaddlePaddle/Paddle/go/connection"
log "github.com/sirupsen/logrus"
)
// TODO(helin): add RPC call retry logic
......@@ -64,7 +64,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
if curServers[i].Addr == "" {
err := c.pservers[i].Close()
if err != nil {
log.Println(err)
log.Errorln(err)
}
continue
......@@ -72,7 +72,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
err := c.pservers[i].Connect(curServers[i].Addr)
if err != nil {
log.Println(err)
log.Errorln(err)
// connect to addr failed, set
// to last known addr in order
......
......@@ -14,8 +14,8 @@ add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
add_dependencies(paddle_function ${external_project_dependencies})
add_dependencies(paddle_function gen_proto_cpp)
if(WITH_GPU)
if(WITH_TESTING)
if(WITH_GPU)
# TODO:
# file(GLOB test_files . *OpTest.cpp)
# add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
......@@ -30,6 +30,8 @@ if(WITH_TESTING)
add_simple_unittest(CosSimOpTest)
add_simple_unittest(RowConvOpTest)
endif()
add_simple_unittest(ConvOpTest)
endif()
add_style_check_target(paddle_function ${h_files})
......
......@@ -28,7 +28,7 @@ void testMatrixProjectionForward(int context_start,
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare test(
CpuGpuFuncCompare test(
"ContextProjectionForward",
FuncConfig()
.set("context_length", context_length)
......@@ -60,7 +60,7 @@ void testMatrixProjectionBackward(int context_start,
std::max(0, (int)(context_start + context_length - 1));
if (pad == 0) is_padding = false;
FunctionCompare test(
CpuGpuFuncCompare test(
"ContextProjectionBackward",
FuncConfig()
.set("context_length", context_length)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Function.h"
namespace paddle {
/*
* \brief Based on the ConvFunctionBase class, the forward calculation,
* backward input calculation and backward filter calculation
* of convolution operations can be implemented.
*
* Arguments of forward and backward calculation:
* 1. Forward calculation of convolution.
* inputs = {INPUT, FILTER}, outputs = {OUTPUT}
* The first and second input arguments are input image and filter data.
* The output argument is output image.
*
* 2. Backward input calculation of convolution.
* inputs = {OUTPUT_GRAD, FILTER}, outputs = {INPUT_GRAD}
* The first and second input arguments are output grad image
* and filter data.
* The output argument is input grad image.
*
* 3. Backward filter calculation of convolution.
* inputs = {OUTPUT_GRAD, INPUT}, outputs = {FILTER_GRAD}
* The first and second input arguments are output grad image
* and input image.
* The output argument is filter grad.
*
* Arguments format of input, filter and output:
* 1. Input image, output image, input image gradient, output image gradient
* are all in NCHW format, where N is the batch size, C is the number of channels,
* and H and W are the height and width of the image or image gradient.
*
* 2. The format of the filter data is MCHW, where M is the number of output
* image channels, C is the number of input image channels,
* and H and W are the height and width of the filter.
*
* If `groups` is greater than 1, the filter's data format should be GMCHW,
* where G is the `groups`, and G * M is the number of output image
* channels, G * C is the number of input image channels,
* and H and W are the height and width of the filter.
*/
class ConvFunctionBase : public FunctionBase {
public:
void init(const FuncConfig& config) override {
// function arguments
strides_ = config.get<std::vector<size_t>>("strides");
paddings_ = config.get<std::vector<size_t>>("paddings");
groups_ = config.get<size_t>("groups");
// number of inputs and outputs
numInputs_ = 2;
numOutputs_ = 1;
}
virtual void calc(const BufferArgs& inputs, const BufferArgs& outputs) {}
// input can be INPUT and INPUT_GRAD
// filter can be FILTER and FILTER_GRAD
// output can be OUTPUT and OUTPUT_GRAD
void check(const TensorShape& input,
const TensorShape& filter,
const TensorShape& output) {
// inputs and outputs arguments should be 4-dimensional.
CHECK_EQ(input.ndims(), (size_t)4);
CHECK_EQ(output.ndims(), (size_t)4);
// The batchSize of the input needs to be equal to
// the batchSize of the output.
CHECK_EQ(input[0], output[0]);
if (filter.ndims() == (size_t)4) {
// If the filter's dimension is 4, grouped convolution is not supported.
CHECK_EQ(groups_, (size_t)1);
// The input and output channel dimensions are the second and first
// dimensions of the filter shape.
CHECK_EQ(input[1], filter[1]);
CHECK_EQ(output[1], filter[0]);
} else {
// filter argument should be 5-dimensional.
CHECK_EQ(filter.ndims(), (size_t)5);
// The first dimension of the filter is the size of the group
CHECK_EQ(filter[0], groups_);
// The input and output channel dimensions are the third and second
// dimensions of the filter shape.
CHECK_EQ(input[1], filter[2] * groups_);
CHECK_EQ(output[1], filter[1] * groups_);
}
}
protected:
size_t getFilterHeight(const TensorShape& filter) const {
return filter[filter.ndims() - 2];
}
size_t getFilterWidth(const TensorShape& filter) const {
return filter[filter.ndims() - 1];
}
std::vector<size_t> strides_;
std::vector<size_t> paddings_;
/// Group size, refer to grouped convolution in
/// Alex Krizhevsky's paper: when group=2, the first half of the
/// filters are only connected to the first half of the input channels,
/// and the second half only connected to the second half.
size_t groups_;
inline int strideH() const { return strides_[0]; }
inline int strideW() const { return strides_[1]; }
inline int paddingH() const { return paddings_[0]; }
inline int paddingW() const { return paddings_[1]; }
// A temporary memory in convolution calculation.
MemoryHandlePtr memory_;
template <DeviceType Device>
void resizeBuffer(size_t newSize) {
if (!memory_ || newSize * sizeof(real) > memory_->getAllocSize()) {
if (Device == DEVICE_TYPE_CPU) {
memory_ = std::make_shared<CpuMemoryHandle>(newSize * sizeof(real));
} else {
memory_ = std::make_shared<GpuMemoryHandle>(newSize * sizeof(real));
}
}
}
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <memory>
#include "Function.h"
#include "FunctionTest.h"
namespace paddle {
enum TestType {
kForwardTest = 0,
kBackwardInputTest = 1,
kBackwardFilterTest = 2,
};
template <DeviceType DType1, DeviceType DType2>
class ConvolutionTest {
public:
ConvolutionTest(const std::string& conv1,
const std::string& conv2,
TestType type,
std::string algo = "auto") {
for (size_t batchSize : {1, 32}) {
for (size_t inputSize : {7, 14, 54}) {
for (size_t filterSize : {1, 3, 5}) {
for (size_t inputChannels : {3, 64}) {
for (size_t outputChannels : {3, 64, 128}) {
if (inputChannels < outputChannels) break;
for (size_t stride : {1, 2}) {
for (size_t padding : {0, 1}) {
if (padding >= filterSize) break;
size_t outputSize =
(inputSize - filterSize + 2 * padding + stride) / stride;
VLOG(3) << " batchSize=" << batchSize
<< " inputChannels=" << inputChannels
<< " inputHeight=" << inputSize
<< " inputWidth=" << inputSize
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterSize
<< " filterWidth=" << filterSize
<< " outputHeight=" << outputSize
<< " outputWidth=" << outputSize
<< " stride=" << stride << " padding=" << padding;
std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
Compare2Function<DType1, DType2> test(
conv1,
conv2,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("algo", algo));
TensorShape input{
batchSize, inputChannels, inputSize, inputSize};
TensorShape filter{
outputChannels, inputChannels, filterSize, filterSize};
TensorShape output{
batchSize, outputChannels, outputSize, outputSize};
if (type == kForwardTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.run();
} else if (type == kBackwardInputTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
test.run();
} else if (type == kBackwardFilterTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.run();
}
}
}
}
}
}
}
}
}
};
// Mainly used to test cases where the height and width (input, filter)
// are not equal.
template <DeviceType DType1, DeviceType DType2>
class ConvolutionTest2 {
public:
ConvolutionTest2(const std::string& conv1,
const std::string& conv2,
TestType type,
std::string algo = "auto") {
for (size_t batchSize : {16}) {
for (size_t inputHeight : {7, 31}) {
for (size_t inputWidth : {10, 54}) {
for (size_t filterHeight : {1, 5}) {
for (size_t filterWidth : {3, 7}) {
for (size_t inputChannels : {7}) {
for (size_t outputChannels : {32}) {
size_t stride = 1;
size_t padding = 0;
size_t outputHeight =
(inputHeight - filterHeight + 2 * padding + stride) /
stride;
size_t outputWidth =
(inputWidth - filterWidth + 2 * padding + stride) /
stride;
VLOG(3) << " batchSize=" << batchSize
<< " inputChannels=" << inputChannels
<< " inputHeight=" << inputHeight
<< " inputWidth=" << inputWidth
<< " outputChannels=" << outputChannels
<< " filterHeight=" << filterHeight
<< " filterWidth=" << filterWidth
<< " outputHeight=" << outputHeight
<< " outputWidth=" << outputWidth
<< " stride=" << stride << " padding=" << padding;
std::vector<size_t> paddings = {padding, padding};
std::vector<size_t> strides = {stride, stride};
Compare2Function<DType1, DType2> test(
conv1,
conv2,
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)1)
.set("algo", algo));
TensorShape input{
batchSize, inputChannels, inputHeight, inputWidth};
TensorShape filter{
outputChannels, inputChannels, filterHeight, filterWidth};
TensorShape output{
batchSize, outputChannels, outputHeight, outputWidth};
if (type == kForwardTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.run();
} else if (type == kBackwardInputTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
test.run();
} else if (type == kBackwardFilterTest) {
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter));
test.run();
}
}
}
}
}
}
}
}
}
};
TEST(Forward, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
"NaiveConv-CPU", "GemmConv-CPU", kForwardTest);
}
#ifndef PADDLE_ONLY_CPU
TEST(Forward, GEMM2) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConv-CPU", "GemmConv-GPU", kForwardTest);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConv-CPU", "GemmConv-GPU", kForwardTest);
}
TEST(BackwardInput, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradInput-CPU", "GemmConvGradInput-GPU", kBackwardInputTest);
}
TEST(BackwardFilter, GEMM) {
ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
"GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
"GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", kBackwardFilterTest);
}
#endif
} // namespace paddle
......@@ -22,7 +22,7 @@ void testCosSimForward(size_t height_x,
size_t height_y,
size_t width,
real scale) {
FunctionCompare test("CosSimForward", FuncConfig().set("scale", scale));
CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", scale));
// prepare input arguments
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, width}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_y, width}));
......@@ -36,7 +36,7 @@ void testCosSimBackward(size_t height_x,
size_t height_y,
size_t width,
real scale) {
FunctionCompare test("CosSimBackward", FuncConfig().set("scale", scale));
CpuGpuFuncCompare test("CosSimBackward", FuncConfig().set("scale", scale));
// prepare input arguments
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{height_x, 1}));
......
......@@ -28,11 +28,11 @@ TEST(CrossMapNormal, real) {
<< " size=" << size;
// init Test object
FunctionCompare test("CrossMapNormal",
FuncConfig()
.set("size", size)
.set("scale", (real)1.5)
.set("pow", (real)0.5));
CpuGpuFuncCompare test("CrossMapNormal",
FuncConfig()
.set("size", size)
.set("scale", (real)1.5)
.set("pow", (real)0.5));
// prepare input arguments
TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
......@@ -57,11 +57,11 @@ TEST(CrossMapNormalGrad, real) {
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
<< " size=" << size;
FunctionCompare test("CrossMapNormalGrad",
FuncConfig()
.set("size", size)
.set("scale", (real)1.5)
.set("pow", (real)0.5));
CpuGpuFuncCompare test("CrossMapNormalGrad",
FuncConfig()
.set("size", size)
.set("scale", (real)1.5)
.set("pow", (real)0.5));
TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
......
......@@ -22,14 +22,62 @@ namespace paddle {
typedef std::shared_ptr<BufferArg> BufferArgPtr;
namespace test {
template <DeviceType DType>
struct Allocator;
template <>
struct Allocator<DEVICE_TYPE_CPU> {
using type = CpuMemoryHandle;
};
template <>
struct Allocator<DEVICE_TYPE_GPU> {
using type = GpuMemoryHandle;
};
// Copy argument1 to argument2
template <DeviceType DType1, DeviceType DType2>
class CopyArgument {
public:
void operator()(const BufferArg& arg1, BufferArg& arg2) {
CHECK_EQ(arg1.valueType(), arg2.valueType());
CHECK_LE(arg1.shape().getElements(), arg2.shape().getElements());
if (arg1.valueType() == VALUE_TYPE_INT32) {
IVectorPtr vector1 =
IVector::create((int*)arg1.data(),
arg1.shape().getElements(),
DType1 == DEVICE_TYPE_CPU ? false : true);
IVectorPtr vector2 =
IVector::create((int*)arg2.data(),
arg2.shape().getElements(),
DType2 == DEVICE_TYPE_CPU ? false : true);
vector2->copyFrom(*vector1);
} else {
VectorPtr vector1 =
Vector::create((real*)arg1.data(),
arg1.shape().getElements(),
DType1 == DEVICE_TYPE_CPU ? false : true);
VectorPtr vector2 =
Vector::create((real*)arg2.data(),
arg2.shape().getElements(),
DType2 == DEVICE_TYPE_CPU ? false : true);
vector2->copyFrom(*vector1);
}
}
};
} // namespace test
/**
* \brief A class for comparing CPU and GPU implementations of Function.
*
* \brief A class for comparing two Functions of different implementations.
* For example, it can be used to check that the CPU and GPU implementations
* of a function produce consistent results.
*
* Use case:
* // Initializes a test object, the corresponding cpu and gpu Function
* // are constructed according to FunctionName and FuncConfig.
* FunctionCompare test(FunctionName, FuncConfig);
* CpuGpuFuncCompare test(FunctionName, FuncConfig);
* // Prepare the input and output arguments.
* // Here the inputs and outputs do not contain real data;
* // they only specify the argument type and shape.
......@@ -45,28 +93,38 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
* // Compares CPU and GPU calculation results for consistency.
* test.run();
*/
class FunctionCompare {
template <DeviceType DType1, DeviceType DType2>
class Compare2Function {
public:
FunctionCompare(const std::string& name, const FuncConfig& config)
: cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
cpuFunc_->init(config);
gpuFunc_->init(config);
typedef typename test::Allocator<DType1>::type Allocator1;
typedef typename test::Allocator<DType2>::type Allocator2;
typedef typename Tensor<real, DType1>::Vector Vector1;
typedef typename Tensor<real, DType2>::Vector Vector2;
typedef typename Tensor<real, DType1>::SparseMatrix SparseMatrix1;
typedef typename Tensor<real, DType2>::SparseMatrix SparseMatrix2;
Compare2Function(const std::string& name1,
const std::string& name2,
const FuncConfig& config)
: function1_(FunctionBase::funcRegistrar_.createByType(name1)),
function2_(FunctionBase::funcRegistrar_.createByType(name2)) {
function1_->init(config);
function2_->init(config);
}
~FunctionCompare() {}
~Compare2Function() {}
// Inputs need only contain the shape, not the data.
void addInputs(const BufferArg& input) {
size_t size =
input.shape().getElements() * sizeOfValuType(input.valueType());
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
cpuInputs_.emplace_back(std::make_shared<BufferArg>(
cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
gpuInputs_.emplace_back(std::make_shared<BufferArg>(
gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
func1Inputs_.emplace_back(std::make_shared<BufferArg>(
func1Memory_.back()->getBuf(), input.valueType(), input.shape()));
func2Inputs_.emplace_back(std::make_shared<BufferArg>(
func2Memory_.back()->getBuf(), input.valueType(), input.shape()));
}
// assume one copy of sequence is shared by different SequenceArgs
......@@ -75,62 +133,57 @@ public:
size_t batchSize = input.shape()[0];
size_t numSeqs = batchSize / 10 + 1;
size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
cpuSeq_ = std::make_shared<SequenceIdArg>(cpuMemory_.back()->getBuf(),
TensorShape{numSeqs + 1});
gpuSeq_ = std::make_shared<SequenceIdArg>(gpuMemory_.back()->getBuf(),
TensorShape{numSeqs + 1});
func1Memory_.emplace_back(std::make_shared<Allocator1>(sizeId));
func2Memory_.emplace_back(std::make_shared<Allocator2>(sizeId));
seq1_ = std::make_shared<SequenceIdArg>(func1Memory_.back()->getBuf(),
TensorShape{numSeqs + 1});
seq2_ = std::make_shared<SequenceIdArg>(func2Memory_.back()->getBuf(),
TensorShape{numSeqs + 1});
/// init sequence Id
initArg(*cpuSeq_, batchSize);
initArg(*seq1_, batchSize);
// todo(tianbing), delete it
CHECK_EQ(cpuSeq_->shape().getElements(), cpuSeq_->numSeqs() + 1);
CpuIVector cpuSeq(cpuSeq_->shape().getElements(), (int*)cpuSeq_->data());
GpuIVector gpuSeq(gpuSeq_->shape().getElements(), (int*)gpuSeq_->data());
gpuSeq.copyFrom(cpuSeq);
copyArg_(*seq1_, *seq2_);
}
void addInputs(const SequenceArg& input) {
CHECK_EQ(input.shape().ndims(), 2UL);
size_t batchSize = input.shape()[0];
if (!cpuSeq_ || !gpuSeq_) { // sequence not exist
if (!seq1_ || !seq2_) { // sequence not exist
addSequence(SequenceIdArg(TensorShape{batchSize}));
}
size_t size =
input.shape().getElements() * sizeOfValuType(input.valueType());
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
/// SequenceArg
cpuInputs_.emplace_back(
std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
func1Inputs_.emplace_back(
std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
input.valueType(),
input.shape(),
*cpuSeq_));
gpuInputs_.emplace_back(
std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
*seq1_));
func2Inputs_.emplace_back(
std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
input.valueType(),
input.shape(),
*gpuSeq_));
*seq2_));
}
// Outputs need only contain the shape, not the data.
void addOutputs(const BufferArg& output, ArgType argType = ASSIGN_TO) {
size_t size =
output.shape().getElements() * sizeOfValuType(output.valueType());
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
cpuOutputs_.emplace_back(
std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
func1Outputs_.emplace_back(
std::make_shared<BufferArg>(func1Memory_.back()->getBuf(),
output.valueType(),
output.shape(),
argType));
gpuOutputs_.emplace_back(
std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
func2Outputs_.emplace_back(
std::make_shared<BufferArg>(func2Memory_.back()->getBuf(),
output.valueType(),
output.shape(),
argType));
......@@ -138,14 +191,14 @@ public:
/// add and init output sparse matrix
void addOutputs(const SparseMatrixArg& output, ArgType argType = ASSIGN_TO) {
cpuSparse_ = std::make_shared<CpuSparseMatrix>(
sparse1_ = std::make_shared<SparseMatrix1>(
output.shape()[0],
output.shape()[1],
output.nnz(),
static_cast<SparseValueType>(output.dataType()),
static_cast<SparseFormat>(output.dataFormat()));
gpuSparse_ = std::make_shared<GpuSparseMatrix>(
sparse2_ = std::make_shared<SparseMatrix2>(
output.shape()[0],
output.shape()[1],
output.nnz(),
......@@ -154,52 +207,52 @@ public:
/// init sparse matrix
hl_stream_t stream(HPPL_STREAM_1);
cpuSparse_->randomizeUniform();
gpuSparse_->copyFrom(*cpuSparse_, stream);
sparse1_->randomizeUniform();
sparse2_->copyFrom(*sparse1_, stream);
hl_stream_synchronize(stream);
cpuOutputs_.emplace_back(
std::make_shared<SparseMatrixArg>(*cpuSparse_, argType));
gpuOutputs_.emplace_back(
std::make_shared<SparseMatrixArg>(*gpuSparse_, argType));
func1Outputs_.emplace_back(
std::make_shared<SparseMatrixArg>(*sparse1_, argType));
func2Outputs_.emplace_back(
std::make_shared<SparseMatrixArg>(*sparse2_, argType));
}
void addOutputs(const SequenceArg& output, ArgType argType = ASSIGN_TO) {
CHECK_EQ(output.shape().ndims(), 2UL);
size_t batchSize = output.shape()[0];
if (!cpuSeq_ || !gpuSeq_) { // sequence not exist
if (!seq1_ || !seq2_) { // sequence not exist
addSequence(SequenceIdArg(TensorShape{batchSize}));
}
size_t size =
output.shape().getElements() * sizeOfValuType(output.valueType());
cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
func1Memory_.emplace_back(std::make_shared<Allocator1>(size));
func2Memory_.emplace_back(std::make_shared<Allocator2>(size));
/// SequenceArg
cpuOutputs_.emplace_back(
std::make_shared<SequenceArg>(cpuMemory_.back()->getBuf(),
func1Outputs_.emplace_back(
std::make_shared<SequenceArg>(func1Memory_.back()->getBuf(),
output.valueType(),
output.shape(),
*cpuSeq_,
*seq1_,
argType));
gpuOutputs_.emplace_back(
std::make_shared<SequenceArg>(gpuMemory_.back()->getBuf(),
func2Outputs_.emplace_back(
std::make_shared<SequenceArg>(func2Memory_.back()->getBuf(),
output.valueType(),
output.shape(),
*gpuSeq_,
*seq2_,
argType));
}
void addInputs(const SparseMatrixArg& input) {
cpuSparse_ = std::make_shared<CpuSparseMatrix>(
sparse1_ = std::make_shared<SparseMatrix1>(
input.shape()[0],
input.shape()[1],
input.nnz(),
static_cast<SparseValueType>(input.dataType()),
static_cast<SparseFormat>(input.dataFormat()));
gpuSparse_ = std::make_shared<GpuSparseMatrix>(
sparse2_ = std::make_shared<SparseMatrix2>(
input.shape()[0],
input.shape()[1],
input.nnz(),
......@@ -208,12 +261,12 @@ public:
/// init sparse matrix
hl_stream_t stream(HPPL_STREAM_1);
cpuSparse_->randomizeUniform();
gpuSparse_->copyFrom(*cpuSparse_, stream);
sparse1_->randomizeUniform();
sparse2_->copyFrom(*sparse1_, stream);
hl_stream_synchronize(stream);
cpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*cpuSparse_));
gpuInputs_.emplace_back(std::make_shared<SparseMatrixArg>(*gpuSparse_));
func1Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse1_));
func2Inputs_.emplace_back(std::make_shared<SparseMatrixArg>(*sparse2_));
}
void run() {
......@@ -236,27 +289,27 @@ public:
function->calc(inArgs, outArgs);
};
callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
callFunction(function1_.get(), func1Inputs_, func1Outputs_);
callFunction(function2_.get(), func2Inputs_, func2Outputs_);
// check outputs
compareOutputs();
}
std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
std::shared_ptr<FunctionBase> getFunction1() const { return function1_; }
std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
std::shared_ptr<FunctionBase> getFunction2() const { return function2_; }
protected:
// Only initialize the arguments of function1; function2's arguments are copied from them.
void initArg(BufferArg& arg) {
CpuVector vector(arg.shape().getElements(), (real*)arg.data());
Vector1 vector(arg.shape().getElements(), (real*)arg.data());
vector.uniform(0.001, 1);
}
void initArg(SequenceArg& arg) {
/// init only matrix
CpuVector vector(arg.shape().getElements(), (real*)arg.data());
Vector1 vector(arg.shape().getElements(), (real*)arg.data());
vector.uniform(0.001, 1);
}
......@@ -276,73 +329,72 @@ protected:
}
void initInputs() {
for (size_t i = 0; i < cpuInputs_.size(); i++) {
if (cpuInputs_[i]->isSparseArg()) {
for (size_t i = 0; i < func1Inputs_.size(); i++) {
if (func1Inputs_[i]->isSparseArg()) {
continue; /// sparse matrix already init
}
if (cpuInputs_[i]->isSequenceArg()) {
initArg(dynamic_cast<SequenceArg&>(*cpuInputs_[i]));
if (func1Inputs_[i]->isSequenceArg()) {
initArg(dynamic_cast<SequenceArg&>(*func1Inputs_[i]));
} else {
initArg(*cpuInputs_[i]);
initArg(*func1Inputs_[i]);
}
// TODO: Need a BufferCopy used to copy from one BufferArg to another.
CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
(real*)cpuInputs_[i]->data());
GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
(real*)gpuInputs_[i]->data());
gpuVector.copyFrom(cpuVector);
copyArg_(*func1Inputs_[i], *func2Inputs_[i]);
}
}
void initOutputs() {
for (size_t i = 0; i < cpuOutputs_.size(); i++) {
if (cpuOutputs_[i]->isSparseArg()) {
for (size_t i = 0; i < func1Outputs_.size(); i++) {
if (func1Outputs_[i]->isSparseArg()) {
continue; /// sparse matrix already init
}
if (cpuOutputs_[i]->isSequenceArg()) {
initArg(dynamic_cast<SequenceArg&>(*cpuOutputs_[i]));
if (func1Outputs_[i]->isSequenceArg()) {
initArg(dynamic_cast<SequenceArg&>(*func1Outputs_[i]));
} else {
initArg(*cpuOutputs_[i]);
initArg(*func1Outputs_[i]);
}
// TODO: Need a BufferCopy used to copy from one BufferArg to another.
CpuVector cpuVector(cpuOutputs_[i]->shape().getElements(),
(real*)cpuOutputs_[i]->data());
GpuVector gpuVector(gpuOutputs_[i]->shape().getElements(),
(real*)gpuOutputs_[i]->data());
gpuVector.copyFrom(cpuVector);
copyArg_(*func1Outputs_[i], *func2Outputs_[i]);
}
}
void compareOutputs() {
for (size_t i = 0; i < cpuOutputs_.size(); i++) {
for (size_t i = 0; i < func1Outputs_.size(); i++) {
// TODO: Need a BufferCheck to compare the two buffers.
const auto cpu = cpuOutputs_[i];
const auto gpu = gpuOutputs_[i];
const auto cpu = func1Outputs_[i];
const auto gpu = func2Outputs_[i];
CHECK_EQ(cpu->numElements(), gpu->numElements());
CpuVector cpuVector(cpu->numElements(), (real*)cpu->data());
GpuVector gpuVector(gpu->numElements(), (real*)gpu->data());
Vector1 cpuVector(cpu->numElements(), (real*)cpu->data());
Vector2 gpuVector(gpu->numElements(), (real*)gpu->data());
autotest::TensorCheckErr(cpuVector, gpuVector);
}
}
protected:
std::shared_ptr<FunctionBase> cpuFunc_;
std::shared_ptr<FunctionBase> gpuFunc_;
std::vector<CpuMemHandlePtr> cpuMemory_;
std::vector<GpuMemHandlePtr> gpuMemory_;
std::vector<BufferArgPtr> cpuInputs_;
std::vector<BufferArgPtr> cpuOutputs_;
std::vector<BufferArgPtr> gpuInputs_;
std::vector<BufferArgPtr> gpuOutputs_;
std::shared_ptr<CpuSparseMatrix> cpuSparse_;
std::shared_ptr<GpuSparseMatrix> gpuSparse_;
std::shared_ptr<SequenceIdArg> cpuSeq_;
std::shared_ptr<SequenceIdArg> gpuSeq_;
std::shared_ptr<FunctionBase> function1_;
std::shared_ptr<FunctionBase> function2_;
std::vector<std::shared_ptr<Allocator1>> func1Memory_;
std::vector<std::shared_ptr<Allocator2>> func2Memory_;
std::vector<BufferArgPtr> func1Inputs_;
std::vector<BufferArgPtr> func1Outputs_;
std::vector<BufferArgPtr> func2Inputs_;
std::vector<BufferArgPtr> func2Outputs_;
std::shared_ptr<SparseMatrix1> sparse1_;
std::shared_ptr<SparseMatrix2> sparse2_;
std::shared_ptr<SequenceIdArg> seq1_;
std::shared_ptr<SequenceIdArg> seq2_;
test::CopyArgument<DType1, DType2> copyArg_;
};
class CpuGpuFuncCompare
: public Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> {
public:
CpuGpuFuncCompare(const std::string& name, const FuncConfig& config)
: Compare2Function(name + "-CPU", name + "-GPU", config) {}
~CpuGpuFuncCompare() {}
};
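// A concrete usage sketch in the spirit of the doc comment above, mirroring
// the CosSimForward test in this commit (the sizes below are illustrative,
// not taken from the tests):
//
//   CpuGpuFuncCompare test("CosSimForward", FuncConfig().set("scale", (real)1.0));
//   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{10, 200}));
//   test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{10, 200}));
//   test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{10, 1}));
//   // run() fills the inputs with random data, invokes both the CPU and GPU
//   // registrations of the Function, and checks the outputs for consistency.
//   test.run();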
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "GemmConvOp.h"
#include "GemmFunctor.h"
#include "paddle/math/MemoryHandle.h"
namespace paddle {
/*
* imData = [input_channels, input_height, input_width]
* colData = [input_channels, filter_height, filter_width,
* output_height, output_width]
*/
template <class T>
class Im2ColFunctor<DEVICE_TYPE_CPU, T> {
public:
void operator()(const T* imData,
int inputChannels,
int inputHeight,
int inputWidth,
int filterHeight,
int filterWidth,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth,
T* colData) {
int channelsCol = inputChannels * filterHeight * filterWidth;
for (int c = 0; c < channelsCol; ++c) {
int wOffset = c % filterWidth;
int hOffset = (c / filterWidth) % filterHeight;
int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset;
int imColIdx = w * strideWidth + wOffset;
if ((imRowIdx - paddingHeight) < 0 ||
(imRowIdx - paddingHeight) >= inputHeight ||
(imColIdx - paddingWidth) < 0 ||
(imColIdx - paddingWidth) >= inputWidth) {
colData[(c * outputHeight + h) * outputWidth + w] = T(0);
} else {
imRowIdx += c_im * inputHeight - paddingHeight;
imColIdx -= paddingWidth;
colData[(c * outputHeight + h) * outputWidth + w] =
imData[imRowIdx * inputWidth + imColIdx];
}
}
}
}
}
};
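// A small worked example of the layout above (the numbers are illustrative):
// with inputChannels = 2, a 3x3 filter and a 4x5 output, colData is a
// (2 * 3 * 3) x (4 * 5) = 18 x 20 row-major matrix. The column for output
// position (h, w) holds the flattened input patch that contributes to that
// position, with padded (out-of-image) locations written as zero, exactly as
// in the loop above.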
template <class T>
class Col2ImFunctor<DEVICE_TYPE_CPU, T> {
public:
void operator()(const T* colData,
int inputChannels,
int inputHeight,
int inputWidth,
int filterHeight,
int filterWidth,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth,
T* imData) {
int channelsCol = inputChannels * filterHeight * filterWidth;
for (int c = 0; c < channelsCol; ++c) {
int wOffset = c % filterWidth;
int hOffset = (c / filterWidth) % filterHeight;
int c_im = c / filterWidth / filterHeight;
for (int h = 0; h < outputHeight; ++h) {
for (int w = 0; w < outputWidth; ++w) {
int imRowIdx = h * strideHeight + hOffset;
int imColIdx = w * strideWidth + wOffset;
if ((imRowIdx - paddingHeight) >= 0 &&
(imRowIdx - paddingHeight) < inputHeight &&
(imColIdx - paddingWidth) >= 0 &&
(imColIdx - paddingWidth) < inputWidth) {
imRowIdx += c_im * inputHeight - paddingHeight;
imColIdx -= paddingWidth;
imData[imRowIdx * inputWidth + imColIdx] +=
colData[(c * outputHeight + h) * outputWidth + w];
}
}
}
}
}
};
/*
* \brief Forward calculation of convolution.
*/
template <DeviceType Device>
class GemmConvFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
// TODO(hedaoyuan): Need to define some index macros,
// to avoid using 0 and 1.
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
check(input, filter, output);
real beta;
if (outputs[0].getArgType() == ADD_TO) {
beta = 1.0;
} else {
beta = 0.0;
}
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
size_t size = inputChannels / groups_ * filterHeight * filterWidth *
outputHeight * outputWidth;
resizeBuffer<Device>(size);
real* colData = reinterpret_cast<real*>(memory_->getBuf());
Im2ColFunctor<Device, real> im2col;
GemmFunctor<Device, real> gemm;
size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) {
im2col(inputData + g * inputOffset,
inputChannels / groups_,
inputHeight,
inputWidth,
filterHeight,
filterWidth,
strideH(),
strideW(),
paddingH(),
paddingW(),
outputHeight,
outputWidth,
colData);
int M = outputChannels / groups_;
int N = outputHeight * outputWidth;
int K = inputChannels / groups_ * filterHeight * filterWidth;
gemm(CblasNoTrans,
CblasNoTrans,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
K,
colData,
N,
beta,
outputData + g * outputOffset,
N);
}
inputData += inputChannels * inputHeight * inputWidth;
outputData += outputChannels * outputHeight * outputWidth;
}
}
};
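// A worked example of the per-group GEMM sizes used above (illustrative
// numbers): with inputChannels = 64, outputChannels = 128, groups_ = 1, a
// 3x3 filter and a 32x32 output, the forward pass multiplies the
// [M x K] = [128 x 576] filter matrix by the [K x N] = [576 x 1024] colData
// produced by im2col, giving the [M x N] = [128 x 1024] output feature map
// of one sample.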
/*
* \brief Backward input calculation of convolution.
*/
template <DeviceType Device>
class GemmConvGradInputFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
// Since the implementation of Col2ImFunctor is ADD_TO,
// this function only supports ADD_TO mode.
CHECK_EQ(outputs[0].getArgType(), ADD_TO);
const TensorShape& output = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& input = outputs[0].shape();
check(input, filter, output);
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
real* outputGrad = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* inputGrad = outputs[0].data<real>();
size_t size = inputChannels / groups_ * filterHeight * filterWidth *
outputHeight * outputWidth;
resizeBuffer<Device>(size);
real* colData = reinterpret_cast<real*>(memory_->getBuf());
Col2ImFunctor<Device, real> col2im;
GemmFunctor<Device, real> gemm;
size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) {
int K = outputChannels / groups_;
int N = outputHeight * outputWidth;
int M = inputChannels / groups_ * filterHeight * filterWidth;
gemm(CblasTrans,
CblasNoTrans,
M,
N,
K,
1.0f,
filterData + g * filterOffset,
M,
outputGrad + g * outputOffset,
N,
0.0f,
colData,
N);
col2im(colData,
inputChannels / groups_,
inputHeight,
inputWidth,
filterHeight,
filterWidth,
strideH(),
strideW(),
paddingH(),
paddingW(),
outputHeight,
outputWidth,
inputGrad + g * inputOffset);
}
inputGrad += inputChannels * inputHeight * inputWidth;
outputGrad += outputChannels * outputHeight * outputWidth;
}
}
};
/*
* \brief Backward filter calculation of convolution.
*/
template <DeviceType Device>
class GemmConvGradFilterFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
const TensorShape& output = inputs[0].shape();
const TensorShape& input = inputs[1].shape();
const TensorShape& filter = outputs[0].shape();
check(input, filter, output);
real beta;
if (outputs[0].getArgType() == ADD_TO) {
beta = 1.0;
} else {
beta = 0.0;
}
size_t batchSize = input[0];
size_t inputChannels = input[1];
size_t inputHeight = input[2];
size_t inputWidth = input[3];
size_t filterHeight = getFilterHeight(filter);
size_t filterWidth = getFilterWidth(filter);
size_t outputChannels = output[1];
size_t outputHeight = output[2];
size_t outputWidth = output[3];
real* outputGrad = inputs[0].data<real>();
real* inputData = inputs[1].data<real>();
real* filterGrad = outputs[0].data<real>();
size_t size = inputChannels / groups_ * filterHeight * filterWidth *
outputHeight * outputWidth;
resizeBuffer<Device>(size);
real* colData = reinterpret_cast<real*>(memory_->getBuf());
Im2ColFunctor<Device, real> im2col;
GemmFunctor<Device, real> gemm;
size_t inputOffset = (inputChannels / groups_) * inputHeight * inputWidth;
size_t outputOffset =
(outputChannels / groups_) * outputHeight * outputWidth;
size_t filterOffset = filter.getElements() / groups_;
for (size_t i = 0; i < batchSize; i++) {
for (size_t g = 0; g < groups_; g++) {
im2col(inputData + g * inputOffset,
inputChannels / groups_,
inputHeight,
inputWidth,
filterHeight,
filterWidth,
strideH(),
strideW(),
paddingH(),
paddingW(),
outputHeight,
outputWidth,
colData);
int M = outputChannels / groups_;
int K = outputHeight * outputWidth;
int N = inputChannels / groups_ * filterHeight * filterWidth;
gemm(CblasNoTrans,
CblasTrans,
M,
N,
K,
1.0f,
outputGrad + g * outputOffset,
K,
colData,
K,
i == 0 ? beta : 1.0f,
filterGrad + g * filterOffset,
N);
}
inputData += inputChannels * inputHeight * inputWidth;
outputGrad += outputChannels * outputHeight * outputWidth;
}
}
};
REGISTER_TYPED_FUNC(GemmConv, CPU, GemmConvFunction);
REGISTER_TYPED_FUNC(GemmConvGradInput, CPU, GemmConvGradInputFunction);
REGISTER_TYPED_FUNC(GemmConvGradFilter, CPU, GemmConvGradFilterFunction);
#ifndef PADDLE_ONLY_CPU
REGISTER_TYPED_FUNC(GemmConv, GPU, GemmConvFunction);
REGISTER_TYPED_FUNC(GemmConvGradInput, GPU, GemmConvGradInputFunction);
REGISTER_TYPED_FUNC(GemmConvGradFilter, GPU, GemmConvGradFilterFunction);
#endif
} // namespace paddle
......@@ -14,31 +14,49 @@ limitations under the License. */
#pragma once
#include <vector>
#include "ExpandConvBaseLayer.h"
#include "paddle/math/Matrix.h"
#include "ConvOp.h"
namespace paddle {
/**
* @brief A subclass of convolution layer.
* This layer expands input and use matrix multiplication to
* calculate convolution transpose (deconv) operation.
*
* The config file api is img_conv_layer with flag trans=True.
/*
* imData = [input_channels, input_height, input_width]
* colData = [input_channels, filter_height, filter_width,
* output_height, output_width]
*/
class ExpandConvTransLayer : public ExpandConvBaseLayer {
template <DeviceType Device, class T>
class Im2ColFunctor {
public:
explicit ExpandConvTransLayer(const LayerConfig& config)
: ExpandConvBaseLayer(config) {}
~ExpandConvTransLayer() {}
bool init(const LayerMap& layerMap,
const ParameterMap& parameterMap) override;
void operator()(const T* imData,
int inputChannels,
int inputHeight,
int inputWidth,
int filterHeight,
int filterWidth,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth,
T* colData);
};
void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override;
template <DeviceType Device, class T>
class Col2ImFunctor {
public:
void operator()(const T* colData,
int inputChannels,
int inputHeight,
int inputWidth,
int filterHeight,
int filterWidth,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth,
T* imData);
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ConvOp.h"
#include "GemmConvOp.h"
namespace paddle {
template<class T>
__global__
void im2col(const T* data_im, int numOuts, int height, int width,
int blockH, int blockW,
int strideH, int strideW,
int paddingH, int paddingW,
int height_col, int width_col,
T* data_col) {
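  // Each thread handles one (channel_in, h_out, w_out) output position and
  // copies the corresponding blockH x blockW input patch into data_col,
  // writing zero for locations that fall into the padding area.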
int index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < numOuts) {
int w_out = index % width_col;
index /= width_col;
int h_out = index % height_col;
int channel_in = index / height_col;
int channel_out = channel_in * blockH * blockW;
int h_in = h_out * strideH;
int w_in = w_out * strideW;
data_col += (channel_out * height_col + h_out) * width_col + w_out;
for (int i = 0; i < blockH; ++i) {
for (int j = 0; j < blockW; ++j) {
int rIdx = int(h_in+i);
int cIdx = int(w_in+j);
if ((rIdx-(int)paddingH) >= (int)height ||
(rIdx-(int)paddingH) < 0 ||
(cIdx-(int)paddingW) >= (int)width ||
(cIdx-(int)paddingW) < 0) {
*data_col = 0;
} else {
rIdx = rIdx + channel_in*height - paddingH;
cIdx = cIdx - paddingW;
*data_col = data_im[rIdx* width + cIdx];
}
data_col += height_col * width_col;
}
}
}
}
template <class T>
class Im2ColFunctor<DEVICE_TYPE_GPU, T> {
public:
void operator()(const T* imData,
int inputChannels,
int inputHeight,
int inputWidth,
int filterHeight,
int filterWidth,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth,
T* colData) {
int numKernels = inputChannels * outputHeight * outputWidth;
int blocks = (numKernels + 1024 -1) / 1024;
int blockX = 512;
int blockY = (blocks + 512 - 1) / 512;
dim3 threads(1024, 1);
dim3 grid(blockX, blockY);
im2col<T><<< grid, threads, 0, STREAM_DEFAULT >>>
(imData, numKernels, inputHeight, inputWidth, filterHeight, filterWidth,
strideHeight, strideWidth, paddingHeight, paddingWidth,
outputHeight, outputWidth, colData);
CHECK_SYNC("Im2ColFunctor GPU failed");
}
};
template<class T>
__global__
void col2im(size_t n, const T* data_col, size_t height,
size_t width, size_t channels,
size_t blockH, size_t blockW,
size_t strideH, size_t strideW,
size_t paddingH, size_t paddingW,
size_t height_col, size_t width_col,
T* data_im) {
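  // Each thread owns one position (c, h, w) of the zero-padded input; if the
  // position lies inside the actual image, the thread sums the contributions
  // of every output column whose receptive field covers it and adds the sum
  // into data_im. One thread per input element avoids atomic additions.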
size_t index =
(blockIdx.x * gridDim.y + blockIdx.y) * blockDim.x + threadIdx.x;
if (index < n) {
T val = 0;
int w = int(index % width);
int h = int((index / width) % height);
int c = int(index / (width * height));
if ((w - (int)paddingW) >= 0 &&
(w - (int)paddingW) < (width-2 * paddingW) &&
(h - (int)paddingH) >= 0 &&
(h - paddingH) < (height - 2 * paddingH)) {
// compute the start and end of the output
int w_col_start =
(w < (int)blockW) ? 0 : (w - int(blockW)) / (int)strideW + 1;
int w_col_end =
min((int)(w / (int)strideW + 1), (int)(width_col));
int h_col_start =
(h < (int)blockH) ? 0 : (h - (int)blockH) / (int)strideH + 1;
int h_col_end = min(int(h / strideH + 1), int(height_col));
for (int h_col = h_col_start; h_col < h_col_end; ++h_col) {
for (int w_col = w_col_start; w_col < w_col_end; ++w_col) {
// the col location: [c * width * height + h_out, w_out]
int c_col = int(c * blockH* blockW) + \
(h - h_col * (int)strideH) * (int)blockW +
(w - w_col * (int)strideW);
val += data_col[(c_col * height_col + h_col) * width_col + w_col];
}
}
h -= paddingH;
w -= paddingW;
data_im[c*((width-2*paddingW) * (height-2*paddingH)) +
h*(width-2*paddingW) + w] += val;
}
}
}
template <class T>
class Col2ImFunctor<DEVICE_TYPE_GPU, T> {
public:
void operator()(const T* colData,
int inputChannels,
int inputHeight,
int inputWidth,
int filterHeight,
int filterWidth,
int strideHeight,
int strideWidth,
int paddingHeight,
int paddingWidth,
int outputHeight,
int outputWidth,
T* imData) {
size_t numKernels = inputChannels * (inputHeight + 2*paddingHeight)
* (inputWidth + 2*paddingWidth);
size_t blocks = (numKernels + 1024 -1) / 1024;
size_t blockX = 512;
size_t blockY = (blocks+512-1)/512;
dim3 threads(1024, 1);
dim3 grid(blockX, blockY);
// To avoid involving atomic operations, we will launch one kernel per
// bottom dimension, and then in the kernel add up the top dimensions.
col2im<T><<< grid, threads, 0, STREAM_DEFAULT >>>
(numKernels,
colData,
inputHeight + 2*paddingHeight,
inputWidth + 2*paddingWidth,
inputChannels,
filterHeight,
filterWidth,
strideHeight,
strideWidth,
paddingHeight,
paddingWidth,
outputHeight,
outputWidth,
imData);
CHECK_SYNC("Col2ImFunctor GPU failed");
}
};
template class Im2ColFunctor<DEVICE_TYPE_GPU, float>;
template class Im2ColFunctor<DEVICE_TYPE_GPU, double>;
template class Col2ImFunctor<DEVICE_TYPE_GPU, float>;
template class Col2ImFunctor<DEVICE_TYPE_GPU, double>;
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/math/MathFunctions.h"
namespace paddle {
// TODO(hedaoyuan): Since the hl_matrix_mul interface does not conform to the
// cblas_dgemm interface's parameter format, it is necessary to introduce
// GemmFunctor as a new interface. Later, when considering the implementation
// of MatMulFunction, we need to consider the reconstruction of hl_matrix_mul
// interface.
template <DeviceType Device, class T>
class GemmFunctor {
public:
void operator()(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc);
};
template <class T>
class GemmFunctor<DEVICE_TYPE_CPU, T> {
public:
void operator()(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
gemm<T>(transA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, ldc);
}
};
template <class T>
class GemmFunctor<DEVICE_TYPE_GPU, T> {
public:
void operator()(const CBLAS_TRANSPOSE transA,
const CBLAS_TRANSPOSE TransB,
const int M,
const int N,
const int K,
const T alpha,
const T* A,
const int lda,
const T* B,
const int ldb,
const T beta,
T* C,
const int ldc) {
hl_matrix_mul((T*)A,
transA == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
(T*)B,
TransB == CblasNoTrans ? HPPL_OP_N : HPPL_OP_T,
C,
M,
N,
K,
alpha,
beta,
lda,
ldb,
ldc);
}
};
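// A hedged usage sketch, mirroring the call in GemmConvFunction above:
// GemmFunctor computes C = alpha * op(A) * op(B) + beta * C on row-major
// buffers, where op() is selected by the CBLAS_TRANSPOSE flags and
// lda/ldb/ldc are the row strides of A, B and C.
//
//   GemmFunctor<DEVICE_TYPE_CPU, real> gemm;
//   // A is M x K, B is K x N, C is M x N, all stored row-major.
//   gemm(CblasNoTrans, CblasNoTrans, M, N, K,
//        (real)1.0, A, K, B, N, (real)0.0, C, N);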
} // namespace paddle
......@@ -35,7 +35,7 @@ void testFuncDDDMatrix(
size_t heightC = dimM;
size_t widthC = dimN;
// init Test object
FunctionCompare test(
CpuGpuFuncCompare test(
"MulOp", FuncConfig().set("aTrans", transa).set("bTrans", transb));
// prepare input arguments
/// matrix A : HA * WA
......@@ -81,8 +81,8 @@ void testFuncDSparseDMatrix(
size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
real scaleT = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("aTrans", false).set("bTrans", false));
CpuGpuFuncCompare test(
"MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
// prepare input arguments
/// sparse matrix A : M * K
test.addInputs(SparseMatrixArg(
......@@ -126,8 +126,8 @@ void testFuncDDSparseMatrix(
size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
real scaleT = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("aTrans", false).set("bTrans", false));
CpuGpuFuncCompare test(
"MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
// prepare input arguments
/// matrix A : M * K
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
......@@ -172,8 +172,8 @@ void testFuncSparseDDMatrix(
size_t dimM, size_t dimN, size_t dimK, size_t nnz, SparseFormat FORMAT) {
real scaleT = 1.0;
// init Test object
FunctionCompare test("MulOp",
FuncConfig().set("aTrans", false).set("bTrans", false));
CpuGpuFuncCompare test(
"MulOp", FuncConfig().set("aTrans", false).set("bTrans", false));
// prepare input arguments
/// matrix A : M * K
test.addInputs(BufferArg(VALUE_TYPE_FLOAT, TensorShape{dimM, dimK}));
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ConvOp.h"
namespace paddle {
/*
* The three arguments are stored in memory in row major order.
* inputData = [batchSize, inputChannels, inputHeight, inputWidth]
* filterData = [outputChannels, inputChannels, filterHeight, filterWidth]
* outputData = [batchSize, outputChannels, outputHeight, outputWidth]
*/
template <class T>
class NaiveConvFunctor {
public:
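  // A direct seven-loop convolution, kept as an easy-to-verify reference
  // implementation; the NaiveConv-CPU vs GemmConv-CPU unit test above
  // compares it against the im2col + GEMM implementation.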
void operator()(const T* inputData,
size_t batchSize,
size_t inputChannels,
size_t inputHeight,
size_t inputWidth,
const T* filterData,
size_t filterHeight,
size_t filterWidth,
T* outputData,
size_t outputChannels,
size_t outputHeight,
size_t outputWidth,
size_t paddingH,
size_t paddingW,
size_t strideH,
size_t strideW) {
for (size_t batch = 0; batch < batchSize; batch++) {
for (size_t outC = 0; outC < outputChannels; outC++) {
for (size_t outH = 0; outH < outputHeight; outH++) {
for (size_t outW = 0; outW < outputWidth; outW++) {
const int inStartH = (outH * strideH) - paddingH;
const int inStartW = (outW * strideW) - paddingW;
T outValue = (T)0;
for (size_t inC = 0; inC < inputChannels; inC++) {
for (size_t fH = 0; fH < filterHeight; fH++) {
for (size_t fW = 0; fW < filterWidth; fW++) {
T inValue;
const int inH = inStartH + fH;
const int inW = inStartW + fW;
if ((inH >= 0 && inH < inputHeight) &&
(inW >= 0 && inW < inputWidth)) {
size_t offsetInput =
batch * inputChannels * inputHeight * inputWidth +
inC * inputHeight * inputWidth + inH * inputWidth + inW;
inValue = inputData[offsetInput];
} else {
inValue = (T)0;
}
size_t offsetFilter =
outC * inputChannels * filterHeight * filterWidth +
inC * filterHeight * filterWidth + fH * filterWidth + fW;
T filterValue = filterData[offsetFilter];
outValue += (inValue * filterValue);
}
}
}
size_t offset =
batch * outputChannels * outputHeight * outputWidth +
outC * outputHeight * outputWidth + outH * outputWidth + outW;
outputData[offset] = outValue;
}
}
}
}
}
};
template <DeviceType Device>
class NaiveConvFunction : public ConvFunctionBase {
public:
void init(const FuncConfig& config) override {
ConvFunctionBase::init(config);
}
void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
CHECK_EQ(numInputs_, inputs.size());
CHECK_EQ(numOutputs_, outputs.size());
const TensorShape& input = inputs[0].shape();
const TensorShape& filter = inputs[1].shape();
const TensorShape& output = outputs[0].shape();
check(input, filter, output);
CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
size_t batchSize = inputs[0].shape()[0];
size_t inputChannels = inputs[0].shape()[1];
size_t inputHeight = inputs[0].shape()[2];
size_t inputWidth = inputs[0].shape()[3];
size_t filterHeight = inputs[1].shape()[2];
size_t filterWidth = inputs[1].shape()[3];
size_t outputChannels = outputs[0].shape()[1];
size_t outputHeight = outputs[0].shape()[2];
size_t outputWidth = outputs[0].shape()[3];
real* inputData = inputs[0].data<real>();
real* filterData = inputs[1].data<real>();
real* outputData = outputs[0].data<real>();
NaiveConvFunctor<real> conv;
conv(inputData,
batchSize,
inputChannels,
inputHeight,
inputWidth,
filterData,
filterHeight,
filterWidth,
outputData,
outputChannels,
outputHeight,
outputWidth,
paddingH(),
paddingW(),
strideH(),
strideW());
}
};
REGISTER_TYPED_FUNC(NaiveConv, CPU, NaiveConvFunction);
} // namespace paddle
......@@ -25,7 +25,7 @@ TEST(Pad, real) {
VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
<< " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
for (bool test_grad : {false, true}) {
FunctionCompare compare(
CpuGpuFuncCompare compare(
test_grad ? "PadGrad" : "Pad",
FuncConfig()
.set<std::vector<uint32_t>>("channel", {2, 3})
......
......@@ -18,7 +18,7 @@ limitations under the License. */
namespace paddle {
void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
FunctionCompare test("RowConv", FuncConfig());
CpuGpuFuncCompare test("RowConv", FuncConfig());
test.addSequence(SequenceIdArg(TensorShape{batchSize}));
test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
......@@ -31,7 +31,7 @@ void testRowConvFw(size_t batchSize, size_t dim, size_t contextLength) {
}
void testRowConvBw(size_t batchSize, size_t dim, size_t contextLength) {
FunctionCompare test("RowConvGrad", FuncConfig());
CpuGpuFuncCompare test("RowConvGrad", FuncConfig());
test.addSequence(SequenceIdArg(TensorShape{batchSize}));
test.addInputs(SequenceArg(VALUE_TYPE_FLOAT, TensorShape{batchSize, dim}));
......
......@@ -118,11 +118,7 @@ size_t ConvBaseLayer::calOutputSize() {
layerSize = outH[0] * outW[0] * size_t(numFilters_);
};
if (isDeconv_) {
setLayerSize(outputH_, outputW_, imgSizeH_, imgSizeW_);
} else {
setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
}
setLayerSize(imgSizeH_, imgSizeW_, outputH_, outputW_);
return layerSize;
}
......
......@@ -70,14 +70,8 @@ void CudnnConvBaseLayer::forward(PassType passType) {
if (biases_) {
REGISTER_TIMER_INFO("CudnnConvBiasTimer", getName().c_str());
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
int outH, outW;
if (isDeconv_) {
outH = imgSizeH_[0];
outW = imgSizeW_[0];
} else {
outH = outputH_[0];
outW = outputW_[0];
}
int outH = outputH_[0];
int outW = outputW_[0];
hl_tensor_reshape(outputDesc_,
batchSize,
......
......@@ -22,26 +22,8 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
/* Initialize the basic convolutional parent class */
ConvBaseLayer::init(layerMap, parameterMap);
/* The class fields channels_ and numFilters_ are the same as in the config
* i.e., channels_ is for the input and numFilters_ is for the output
*
* But in order for the variables in convTrans to have the same semantic
* meaning as in conv, we need to swap channels_ and numFilters_ here for
* convTrans, and in other functions too.
* */
/* Initialize the projection */
for (auto &inputConfig : config_.inputs()) {
const ConvConfig &conf = inputConfig.conv_conf();
int numFilters = isDeconv_ ? conf.channels() : numFilters_;
subM_.push_back(numFilters / conf.groups());
subN_.push_back(conf.output_x() *
(conf.has_output_y() ? conf.output_y() : conf.output_x()));
int channel = isDeconv_ ? numFilters_ : conf.channels();
subK_.push_back(
channel * conf.filter_size() *
(conf.has_filter_size_y() ? conf.filter_size_y() : conf.filter_size()) /
conf.groups());
/* Consistent caffe mode for multiple input */
caffeMode_ = conf.caffe_mode();
}
......@@ -54,17 +36,9 @@ bool ExpandConvBaseLayer::init(const LayerMap &layerMap,
size_t ExpandConvBaseLayer::getOutputSize() {
CHECK_NE(inputLayers_.size(), 0UL);
size_t layerSize = ConvBaseLayer::calOutputSize();
subN_.clear();
for (size_t i = 0; i < inputLayers_.size(); i++) {
subN_.push_back(outputH_[i] * outputW_[i]);
}
return layerSize;
}
void ExpandConvBaseLayer::resetExpandInput(size_t height, size_t width) {
Matrix::resizeOrCreate(expandInput_, height, width, false, useGpu_);
}
void ExpandConvBaseLayer::addSharedBias() {
size_t mapW = getOutputSize() / numFilters_;
size_t mapH = getOutputValue()->getElementCnt() / mapW;
......@@ -101,173 +75,6 @@ void ExpandConvBaseLayer::addUnsharedBias() {
outValue->addBias(*bias, 1.0f);
}
void ExpandConvBaseLayer::expandOneFrame(MatrixPtr image,
size_t startIdx,
int inIdx) {
int channel = isDeconv_ ? numFilters_ : channels_[inIdx];
resetExpandInput(subK_[inIdx] * groups_[inIdx], subN_[inIdx]);
CHECK_EQ(image->getWidth(),
static_cast<size_t>(imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel));
real *imgData = image->getData() + startIdx * image->getWidth();
MatrixPtr imageTmp =
Matrix::create(imgData,
1,
imgSizeH_[inIdx] * imgSizeW_[inIdx] * channel,
false,
useGpu_);
expandInput_->convExpand(*imageTmp,
imgSizeH_[inIdx],
imgSizeW_[inIdx],
channel,
filterSizeY_[inIdx],
filterSize_[inIdx],
strideY_[inIdx],
stride_[inIdx],
paddingY_[inIdx],
padding_[inIdx],
outputH_[inIdx],
outputW_[inIdx]);
imageTmp->clear();
}
void ExpandConvBaseLayer::expandFwdOnce(MatrixPtr image,
MatrixPtr out,
int inIdx,
int startIdx) {
int subM = subM_[inIdx];
int subN = subN_[inIdx];
int subK = subK_[inIdx];
expandOneFrame(image, startIdx, inIdx);
int numFilters = isDeconv_ ? channels_[inIdx] : numFilters_;
real *outData = out->getData() + startIdx * subN * numFilters;
real *wgtData = weights_[inIdx]->getW()->getData();
real *expInData = expandInput_->getData();
for (int g = 0; g < groups_[inIdx]; ++g) {
MatrixPtr A =
Matrix::create(wgtData, subM, subK, false, useGpu_); // mark transpose
MatrixPtr B = Matrix::create(expInData, subK, subN, false, useGpu_);
MatrixPtr C = Matrix::create(outData, subM, subN, false, useGpu_);
C->mul(*A, *B, 1, 1);
A->clear();
B->clear();
C->clear();
wgtData += subK * subM;
expInData += subK * subN;
outData += subM * subN;
}
}
void ExpandConvBaseLayer::bpropActs(MatrixPtr out,
MatrixPtr image,
int inpIdx) {
int channel = isDeconv_ ? numFilters_ : channels_[inpIdx];
int subM = subM_[inpIdx];
int subN = subN_[inpIdx];
int subK = subK_[inpIdx];
size_t batchSize = image->getHeight();
/* reset the expand-grad memory */
resetExpandInput(subK * groups_[inpIdx], subN);
real *localGradData = out->getData();
real *tgtGradData = image->getData();
for (size_t n = 0; n < batchSize; n++) {
real *wgtData = weights_[inpIdx]->getW()->getData();
real *expandInData = expandInput_->getData();
for (int g = 0; g < groups_[inpIdx]; g++) {
// create temporary matrix
MatrixPtr C = Matrix::create(expandInData, subK, subN, false, useGpu_);
MatrixPtr B = Matrix::create(localGradData, subM, subN, false, useGpu_);
MatrixPtr A = Matrix::create(wgtData, subM, subK, true, useGpu_);
C->mul(*A, *B); // mul
// clear the temporary matrix
A->clear();
B->clear();
C->clear();
expandInData += subK * subN;
localGradData += subM * subN;
wgtData += subK * subM;
}
// shrink one frame outGrad
MatrixPtr oneGradTmp = Matrix::create(
expandInput_->getData(), subK * groups_[inpIdx], subN, false, useGpu_);
MatrixPtr vTmp =
Matrix::create(tgtGradData,
1,
imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel,
false,
useGpu_);
vTmp->convShrink(*oneGradTmp,
imgSizeH_[inpIdx],
imgSizeW_[inpIdx],
channel,
filterSizeY_[inpIdx],
filterSize_[inpIdx],
strideY_[inpIdx],
stride_[inpIdx],
paddingY_[inpIdx],
padding_[inpIdx],
outputH_[inpIdx],
outputW_[inpIdx],
1.0f,
1.0f);
vTmp->clear();
oneGradTmp->clear();
// move the data-pointer
tgtGradData += imgSizeH_[inpIdx] * imgSizeW_[inpIdx] * channel;
}
}
void ExpandConvBaseLayer::bpropWeights(MatrixPtr image,
MatrixPtr out,
int inpIdx) {
MatrixPtr weightGrad = weights_[inpIdx]->getWGrad();
int subM = subM_[inpIdx];
int subN = subN_[inpIdx];
int subK = subK_[inpIdx];
size_t batchSize = image->getHeight();
resetExpandInput(subK * groups_[inpIdx], subN);
real *gradData = out->getData();
for (size_t n = 0; n < batchSize; n++) { // frame by frame
// expand
expandOneFrame(image, n, inpIdx);
real *wGradData = weightGrad->getData();
real *expandInData = expandInput_->getData();
// expand-mul one-group by one
for (int g = 0; g < groups_[inpIdx]; g++) {
MatrixPtr A = Matrix::create(expandInData, subK, subN, true, useGpu_);
MatrixPtr B = Matrix::create(gradData, subM, subN, false, useGpu_);
MatrixPtr C = Matrix::create(wGradData, subM, subK, false, useGpu_);
C->mul(*B, *A, 1, 1);
A->clear();
B->clear();
C->clear();
gradData += subM * subN;
wGradData += subK * subM;
expandInData += subK * subN;
}
}
}
void ExpandConvBaseLayer::bpropSharedBias(MatrixPtr biases, MatrixPtr v) {
size_t mapW = getOutputSize() / numFilters_;
size_t mapH = v->getElementCnt() / mapW;
......
......@@ -26,19 +26,6 @@ namespace paddle {
*/
class ExpandConvBaseLayer : public ConvBaseLayer {
protected:
/// For expand convolution.
/// subM_ = numFilters_ / groups_.
IntV subM_;
/// subN_ = outputH_ * outputW_.
IntV subN_;
/// subK_ = channels_ * filterPixels_ * groups_.
IntV subK_;
/*The expandInput_ and transOutValue_ are used for CPU expand conv calc
* Expand one sample at a time. shape:
* (numChannels * filterPixels_, outputSizeH * outputSizeW)
* */
MatrixPtr expandInput_;
/// The transpose of output, which is an auxiliary matrix.
MatrixPtr transOutValue_;
......@@ -52,10 +39,6 @@ public:
const ParameterMap& parameterMap) override;
size_t getOutputSize();
/**
* Create or resize expandInput_.
*/
void resetExpandInput(size_t height, size_t width);
/**
* Add shared bias.
......@@ -66,20 +49,9 @@ public:
* Add unshared bias.
*/
void addUnsharedBias();
/**
* Expand one input sample.
*/
void expandOneFrame(MatrixPtr image, size_t startIdx, int inIdx);
/**
* Expand one input sample and perform matrix multiplication.
*/
void expandFwdOnce(MatrixPtr image, MatrixPtr out, int inIdx, int startIdx);
void bpropSharedBias(MatrixPtr biases, MatrixPtr v);
void bpropBiases(MatrixPtr v);
void bpropWeights(MatrixPtr image, MatrixPtr out, int inpIdx);
void bpropActs(MatrixPtr image, MatrixPtr out, int inpIdx);
};
} // namespace paddle
......@@ -18,32 +18,94 @@ limitations under the License. */
namespace paddle {
/*
* The calculation of exconvt (the convolution transpose, i.e. deconv, operation)
* is a swap of the forward and backward passes of the exconv calculation.
* */
REGISTER_LAYER(exconv, ExpandConvLayer);
REGISTER_LAYER(exconvt, ExpandConvLayer);
bool ExpandConvLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) {
/* Initialize the basic convolutional parent class */
ExpandConvBaseLayer::init(layerMap, parameterMap);
size_t numInputs = config_.inputs_size();
inputShape_.resize(numInputs);
filterShape_.resize(numInputs);
outputShape_.resize(numInputs);
for (int i = 0; i < config_.inputs_size(); i++) {
std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
createFunction(forward_,
!isDeconv_ ? "GemmConv" : "GemmConvGradInput",
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
!isDeconv_ ? "GemmConvGradInput" : "GemmConv",
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)groups_[i]));
createFunction(backward_,
"GemmConvGradFilter",
FuncConfig()
.set("paddings", paddings)
.set("strides", strides)
.set("groups", (size_t)groups_[i]));
}
return true;
}
// i is the index of input layers
#define BACKWARD_INPUT(i, inputs, outputs) \
backward_[2 * i]->calc(inputs, outputs)
#define BACKWARD_FILTER(i, inputs, outputs) \
backward_[2 * i + 1]->calc(inputs, outputs)
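// For example, following the two createFunction(backward_, ...) calls per
// input in init() above: for input i = 1, backward_[2] is the
// gradient-of-input function (GemmConvGradInput, or GemmConv for exconvt)
// and backward_[3] is GemmConvGradFilter.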
void ExpandConvLayer::forward(PassType passType) {
Layer::forward(passType);
/* malloc memory for the output_ if necessary */
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
size_t batchSize = inputLayers_[0]->getOutputValue()->getHeight();
resetOutput(batchSize, getOutputSize());
MatrixPtr image = nullptr;
MatrixPtr outV = getOutputValue();
// Calculate the shape of the input, output, and filter.
for (size_t i = 0; i < inputLayers_.size(); ++i) {
LayerPtr prevLayer = getPrev(i);
image = prevLayer->getOutputValue();
for (size_t off = 0; off < image->getHeight(); off++) {
REGISTER_TIMER_INFO("expandFwdOnce", getName().c_str());
expandFwdOnce(image, outV, i, off);
}
inputShape_[i] = TensorShape({(size_t)batchSize,
(size_t)channels_[i],
(size_t)imgSizeH_[i],
(size_t)imgSizeW_[i]});
filterShape_[i] =
TensorShape({(size_t)groups_[i],
!isDeconv_ ? (size_t)numFilters_ / groups_[i]
: (size_t)channels_[i] / groups_[i],
!isDeconv_ ? (size_t)channels_[i] / groups_[i]
: (size_t)numFilters_ / groups_[i],
(size_t)filterSizeY_[i],
(size_t)filterSize_[i]});
outputShape_[i] = TensorShape({(size_t)batchSize,
(size_t)numFilters_,
(size_t)outputH_[i],
(size_t)outputW_[i]});
}
// Calculate the output value.
for (size_t i = 0; i < inputLayers_.size(); ++i) {
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*getInputValue(i), inputShape_[i]);
inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
outputs.addArg(*getOutputValue(),
outputShape_[i],
!isDeconv_ && i == 0 ? ASSIGN_TO : ADD_TO);
forward_[i]->calc(inputs, outputs);
}
/* add the bias-vector */
if (biases_.get()) {
if (sharedBiases_) {
......@@ -67,14 +129,30 @@ void ExpandConvLayer::backward(const UpdateCallback &callback) {
biases_->getParameterPtr()->incUpdate(callback);
}
// Calculate the input grad and filter grad.
for (size_t i = 0; i < inputLayers_.size(); ++i) {
/* First, calculate the input layers error */
if (getPrev(i)->getOutputGrad()) {
bpropActs(outGrad, getPrev(i)->getOutputGrad(), i);
if (getInputGrad(i)) {
BufferArgs inputs;
BufferArgs outputs;
inputs.addArg(*getOutputGrad(), outputShape_[i]);
inputs.addArg(*weights_[i]->getW(), filterShape_[i]);
outputs.addArg(*getInputGrad(i), inputShape_[i], ADD_TO);
BACKWARD_INPUT(i, inputs, outputs);
}
if (weights_[i]->getWGrad()) {
/* Then, calculate the W-gradient for the current layer */
bpropWeights(getPrev(i)->getOutputValue(), outGrad, i);
BufferArgs inputs;
BufferArgs outputs;
if (!isDeconv_) {
inputs.addArg(*getOutputGrad(), outputShape_[i]);
inputs.addArg(*getInputValue(i), inputShape_[i]);
} else {
inputs.addArg(*getInputValue(i), inputShape_[i]);
inputs.addArg(*getOutputGrad(), outputShape_[i]);
}
outputs.addArg(*weights_[i]->getWGrad(), filterShape_[i], ADD_TO);
BACKWARD_FILTER(i, inputs, outputs);
/* Increasing the number of gradient */
weights_[i]->getParameterPtr()->incUpdate(callback);
}
......
......@@ -40,6 +40,11 @@ public:
void forward(PassType passType) override;
void backward(const UpdateCallback& callback) override;
protected:
std::vector<TensorShape> inputShape_;
std::vector<TensorShape> filterShape_;
std::vector<TensorShape> outputShape_;
};
} // namespace paddle
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "ExpandConvTransLayer.h"
#include "paddle/utils/Logging.h"
#include "paddle/utils/Stat.h"
/* The implementation of the convTransLayer is basically a swap of forward and
* backward of the original convLayer.
* The variable naming follows the convention of the convLayer.
* */
namespace paddle {
REGISTER_LAYER(exconvt, ExpandConvTransLayer);
bool ExpandConvTransLayer::init(const LayerMap &layerMap,
const ParameterMap &parameterMap) {
/* Initialize the basic convolutional parent class */
ExpandConvBaseLayer::init(layerMap, parameterMap);
return true;
}
void ExpandConvTransLayer::forward(PassType passType) {
Layer::forward(passType);
/* malloc memory for the output_ if necessary */
int batchSize = inputLayers_[0]->getOutputValue()->getHeight();
resetOutput(batchSize, getOutputSize());
MatrixPtr output = nullptr;
for (size_t i = 0; i < inputLayers_.size(); ++i) {
LayerPtr prevLayer = getPrev(i);
output = prevLayer->getOutputValue();
REGISTER_TIMER_INFO("shrinkFwd", getName().c_str());
bpropActs(output, getOutputValue(), i);
}
/* add the bias-vector */
if (biases_.get()) {
if (sharedBiases_) {
addSharedBias();
} else {
addUnsharedBias();
}
}
/* activation */
forwardActivation();
}
void ExpandConvTransLayer::backward(const UpdateCallback &callback) {
backwardActivation();
MatrixPtr imageGrad = getOutputGrad();
if (biases_ && biases_->getWGrad()) {
bpropBiases(imageGrad);
/* Increasing the number of gradient */
biases_->getParameterPtr()->incUpdate(callback);
}
for (size_t i = 0; i < inputLayers_.size(); ++i) {
/* First, calculate the input layers error */
for (size_t off = 0; off < imageGrad->getHeight(); off++) {
if (getPrev(i)->getOutputGrad()) {
expandFwdOnce(imageGrad, getPrev(i)->getOutputGrad(), i, off);
}
}
if (weights_[i]->getWGrad()) {
/* Then, calculate the W-gradient for the current layer */
bpropWeights(imageGrad, getPrev(i)->getOutputValue(), i);
/* Increasing the number of gradient */
weights_[i]->getParameterPtr()->incUpdate(callback);
}
}
}
} // namespace paddle
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <vector>
#include "ModelConfig.pb.h"
#include "paddle/gserver/layers/DataLayer.h"
#include "paddle/gserver/layers/ExpandConvTransLayer.h"
#include "paddle/trainer/Trainer.h"
#include "paddle/utils/GlobalConstants.h"
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <vector>
#include "ModelConfig.pb.h"
#include "paddle/gserver/layers/DataLayer.h"
#include "paddle/gserver/layers/ExpandConvTransLayer.h"
#include "paddle/math/MathUtils.h"
#include "paddle/trainer/Trainer.h"
#include "paddle/utils/GlobalConstants.h"
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include <vector>
#include "ModelConfig.pb.h"
#include "paddle/gserver/layers/DataLayer.h"
#include "paddle/gserver/layers/ExpandConvTransLayer.h"
#include "paddle/math/MathUtils.h"
#include "paddle/trainer/Trainer.h"
#include "paddle/utils/GlobalConstants.h"
......
......@@ -18,7 +18,7 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies})
DEPENDS gen_proto_py ${PY_FILES} ${external_project_dependencies} paddle_master_shared)
add_custom_target(paddle_python ALL DEPENDS
${OUTPUT_DIR}/.timestamp)
......
......@@ -26,6 +26,7 @@ import evaluator
from . import dataset
from . import reader
from . import plot
from . import master
import attr
import op
import pooling
......@@ -37,9 +38,26 @@ import plot
import image
__all__ = [
'optimizer', 'layer', 'activation', 'parameters', 'init', 'trainer',
'event', 'data_type', 'attr', 'pooling', 'data_feeder', 'dataset', 'reader',
'topology', 'networks', 'infer', 'plot', 'evaluator', 'image'
'optimizer',
'layer',
'activation',
'parameters',
'init',
'trainer',
'event',
'data_type',
'attr',
'pooling',
'data_feeder',
'dataset',
'reader',
'topology',
'networks',
'infer',
'plot',
'evaluator',
'image',
'master',
]
......
from client import *
__all__ = ['client']
import ctypes
import os
path = os.path.join(os.path.dirname(__file__), "libpaddle_master.so")
lib = ctypes.cdll.LoadLibrary(path)
class client(object):
"""
client is a Python client of the master server, implemented as a ctypes wrapper around libpaddle_master.so.
"""
def __init__(self, addr, buf_size):
self.c = lib.paddle_new_master_client(addr, buf_size)
def close(self):
lib.paddle_release_master_client(self.c)
self.c = None
def set_dataset(self, paths):
holder_type = ctypes.c_char_p * len(paths)
holder = holder_type()
print paths
for idx, path in enumerate(paths):
c_ptr = ctypes.c_char_p(path)
holder[idx] = c_ptr
lib.paddle_set_dataset(self.c, holder, len(paths))
def next_record(self):
p = ctypes.c_char_p()
ret = ctypes.pointer(p)
size = lib.paddle_next_record(self.c, ret)
if size == 0:
# Empty record
return ""
record = ret.contents.value[:size]
# Memory created from C should be freed.
lib.mem_free(ret.contents)
return record
from setuptools import setup
packages=['paddle',
'paddle.proto',
'paddle.trainer',
......@@ -9,7 +8,8 @@ packages=['paddle',
'paddle.v2',
'paddle.v2.dataset',
'paddle.v2.reader',
'paddle.v2.plot']
'paddle.v2.plot',
'paddle.v2.master']
setup_requires=["requests",
"numpy",
......@@ -25,7 +25,8 @@ setup(name='paddle',
description='Parallel Distributed Deep Learning',
install_requires=setup_requires,
packages=packages,
package_data={'paddle.v2.master': ['libpaddle_master.so'], },
package_dir={
'': '${CMAKE_CURRENT_SOURCE_DIR}'
}
},
)