diff --git a/.travis.yml b/.travis.yml
index 0705baa1aca8b480b2a774076bd91fb9df401a53..162bebba091d84b295f929527de9804e65df5a65 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -25,9 +25,9 @@ addons:
     packages:
       - gcc-4.8
       - g++-4.8
+      - gfortran-4.8
       - git
       - build-essential
-      - libatlas-base-dev
       - python
       - python-pip
       - python2.7-dev
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 4e1ae7dc81231943c4bf3db4d4ac6f073f4fd1c4..26306f9849100d4463dde267acae5392cc81d7ac 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -16,7 +16,7 @@
 set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
-set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL")
+set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL")
 
 find_path(MKL_INCLUDE_DIR mkl.h PATHS
   ${MKL_ROOT}/include)
diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake
index 0e8c29c831c823f701d8eecd954d3b120085e495..29d17691db9f4575bae4372c61a0e1964e163fc9 100644
--- a/cmake/external/openblas.cmake
+++ b/cmake/external/openblas.cmake
@@ -15,7 +15,6 @@
 INCLUDE(cblas)
 
 IF(NOT ${CBLAS_FOUND})
-    MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.")
     INCLUDE(ExternalProject)
 
     SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas)
@@ -28,20 +27,40 @@ IF(NOT ${CBLAS_FOUND})
         SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE)
     ENDIF(WIN32)
 
+    IF(CMAKE_COMPILER_IS_GNUCC)
+        ENABLE_LANGUAGE(Fortran)
+        LIST(APPEND CBLAS_LIBRARIES gfortran pthread)
+    ENDIF(CMAKE_COMPILER_IS_GNUCC)
+
+    IF(NOT CMAKE_Fortran_COMPILER)
+        MESSAGE(FATAL_ERROR "To build lapack in libopenblas, "
+                "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...")
+    ENDIF(NOT CMAKE_Fortran_COMPILER)
+
     ExternalProject_Add(
         openblas
         ${EXTERNAL_PROJECT_LOG_ARGS}
-        URL                 "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz"
+        GIT_REPOSITORY      https://github.com/xianyi/OpenBLAS.git
+        GIT_TAG             v0.2.19
         PREFIX              ${CBLAS_SOURCES_DIR}
         INSTALL_DIR         ${CBLAS_INSTALL_DIR}
         BUILD_IN_SOURCE     1
-        CONFIGURE_COMMAND   ""
-        BUILD_COMMAND       make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER}
-        INSTALL_COMMAND     make install PREFIX=<INSTALL_DIR>
+        BUILD_COMMAND       ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib
+        INSTALL_COMMAND     ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX=<INSTALL_DIR>
         UPDATE_COMMAND      ""
+        CONFIGURE_COMMAND   ""
+    )
+
+    ExternalProject_Add_Step(
+        openblas lapacke_install
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h"
+        COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h"
+        DEPENDEES install
     )
 
     LIST(APPEND external_project_dependencies openblas)
-ENDIF()
+ENDIF(NOT ${CBLAS_FOUND})
 
 INCLUDE_DIRECTORIES(${CBLAS_INC_DIR})
diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake
index c0cf2719f9a7b3ae6be5cefffa3dbd2c3f712e82..613614c0e3d42fac4147f78edbc1bd6d62847419 100644
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -54,6 +54,7 @@ ExternalProject_Add(
   CONFIGURE_COMMAND
     ${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake
     -Dprotobuf_BUILD_TESTS=OFF
+    -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT}
     -DCMAKE_POSITION_INDEPENDENT_CODE=ON
     -DCMAKE_BUILD_TYPE=Release
     -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake
index 29247d5c3d474acaa5c65e450780f00b8885ee78..209e679f2cb2178423bf20dec73a0bccef199fcb 100644
--- a/cmake/external/python.cmake
+++ b/cmake/external/python.cmake
@@ -31,6 +31,7 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
         "please use pip to upgrade protobuf.")
     ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0")
 ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND)
+    MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.")
     ##################################### PYTHON ########################################
     SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python)
     SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python)
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index 0d1ef5cd8449bd31b4cfa4619f27bce7c1f55ebb..b76852fc6c50e80633c8294fb2724b83f15293a7 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -96,6 +96,7 @@ set(COMMON_FLAGS
     -Wno-unused-parameter
     -Wno-unused-function
     -Wno-error=literal-suffix
+    -Wno-error=sign-compare
     -Wno-error=unused-local-typedefs)
 
 set(GPU_COMMON_FLAGS
@@ -105,6 +106,7 @@ set(GPU_COMMON_FLAGS
     -Wdelete-non-virtual-dtor
     -Wno-unused-parameter
     -Wno-unused-function
+    -Wno-error=sign-compare
     -Wno-error=literal-suffix
     -Wno-error=unused-local-typedefs
     -Wno-error=unused-function  # Warnings in Numpy Header.
diff --git a/cmake/system.cmake b/cmake/system.cmake
index 788db404ebfb6facbaedf2910186f3b1afe775c1..ab124a89dcdc1cc5b865f6a15c1693b2f72ea39a 100644
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@@ -21,6 +21,7 @@ ELSE(WIN32)
         SET(MACOS_VERSION ${VERSION})
         SET(HOST_SYSTEM "macosx")
     ELSE(APPLE)
+
         IF(EXISTS "/etc/issue")
             FILE(READ "/etc/issue" LINUX_ISSUE)
             IF(LINUX_ISSUE MATCHES "CentOS")
@@ -31,6 +32,14 @@ ELSE(WIN32)
                 SET(HOST_SYSTEM "ubuntu")
             ENDIF()
         ENDIF(EXISTS "/etc/issue")
+
+        IF(EXISTS "/etc/redhat-release")
+            FILE(READ "/etc/redhat-release" LINUX_ISSUE)
+            IF(LINUX_ISSUE MATCHES "CentOS")
+                SET(HOST_SYSTEM "centos")
+            ENDIF()
+        ENDIF(EXISTS "/etc/redhat-release")
+
     ENDIF(APPLE)
 ENDIF(WIN32)
 
@@ -47,7 +56,7 @@ SET(EXTERNAL_PROJECT_LOG_ARGS
     LOG_DOWNLOAD    0     # Wrap download in script to log output
     LOG_UPDATE      1     # Wrap update in script to log output
     LOG_CONFIGURE   1     # Wrap configure in script to log output
-    LOG_BUILD       1     # Wrap build in script to log output
+    LOG_BUILD       0     # Wrap build in script to log output
     LOG_TEST        1     # Wrap test in script to log output
-    LOG_INSTALL     1     # Wrap install in script to log output
+    LOG_INSTALL     0     # Wrap install in script to log output
 )
diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md
index 6954be3b2bb956755c7820bf285addfd15226874..924ccf01169234bdb29e4e3ea76adbedd73bf00a 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.md
+++ b/doc/getstarted/build_and_install/build_from_source_en.md
@@ -4,6 +4,8 @@ Installing from Sources
 * [1. Download and Setup](#download)
 * [2. Requirements](#requirements)
 * [3. Build on Ubuntu](#ubuntu)
+* [4. Build on CentOS](#centos)
+
 
 ## <span id="download">Download and Setup</span> 
 You can download PaddlePaddle from the [github source](https://github.com/PaddlePaddle/Paddle).
@@ -64,7 +66,8 @@ As a simple example, consider the following:
 
 1. **BLAS Dependencies(optional)**
   
-    Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
+    CMake will search BLAS libraries from system. If not found, OpenBLAS will be downloaded, built and installed automatically.
+    To utilize a preinstalled BLAS, you can simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
 
     ```bash
     # specify MKL
@@ -99,7 +102,7 @@ As a simple example, consider the following:
     ```bash
     # necessary
     sudo apt-get update
-    sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git
+    sudo apt-get install -y g++ make cmake build-essential python python-pip libpython-dev git
     sudo pip install wheel numpy
     sudo pip install 'protobuf>=3.0.0'
     ```
@@ -150,3 +153,64 @@ export PATH=<path to install>/bin:$PATH
 # install PaddlePaddle Python modules.
 sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
 ```
+## <span id="centos">Build on CentOS 7</span>
+
+### Install Dependencies
+
+- **CPU Dependencies**
+
+    ```bash
+    # necessary
+    sudo yum update
+    sudo yum install -y epel-release
+    sudo yum install -y make cmake3 python-devel python-pip gcc-gfortran swig git
+    sudo pip install wheel numpy
+    sudo pip install 'protobuf>=3.0.0'
+    ```
+  
+- **GPU Dependencies (optional)**
+
+    To build GPU version, you will need the following installed:
+
+        1. a CUDA-capable GPU
+        2. A supported version of Linux with a gcc compiler and toolchain
+        3. NVIDIA CUDA Toolkit (available at http://developer.nvidia.com/cuda-downloads)
+        4. NVIDIA cuDNN Library (available at https://developer.nvidia.com/cudnn)
+
+    The CUDA development environment relies on tight integration with the host development environment,
+    including the host compiler and C runtime libraries, and is therefore only supported on
+    distribution versions that have been qualified for this CUDA Toolkit release.
+        
+    After downloading cuDNN library, issue the following commands:
+
+    ```bash
+    sudo tar -xzf cudnn-7.5-linux-x64-v5.1.tgz -C /usr/local
+    sudo chmod a+r /usr/local/cuda/include/cudnn.h /usr/local/cuda/lib64/libcudnn*
+    ```
+    Then you need to set LD\_LIBRARY\_PATH, PATH environment variables in ~/.bashrc.
+
+    ```bash
+    export LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+    export PATH=/usr/local/cuda/bin:$PATH
+    ```
+
+### Build and Install
+
+As usual, the best option is to create a build folder under the paddle project directory.
+
+```bash
+mkdir build && cd build
+``` 
+
+Finally, you can build and install PaddlePaddle:
+
+```bash
+# you can add build option here, such as:    
+cmake3 .. -DCMAKE_INSTALL_PREFIX=<path to install>
+# please use sudo make install, if you want to install PaddlePaddle into the system
+make -j `nproc` && make install
+# set PaddlePaddle installation path in ~/.bashrc
+export PATH=<path to install>/bin:$PATH
+# install PaddlePaddle Python modules.
+sudo pip install <path to install>/opt/paddle/share/wheels/*.whl
+```
diff --git a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
index 943b1d4bb84646d9f60de7790be166a83d10b1e0..4b328fc9d38bc5dfec35d5e0f0d46136aeeb41bc 100644
--- a/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
+++ b/doc/howto/deep_model/rnn/hierarchical_layer_cn.rst
@@ -32,7 +32,7 @@ pooling_layer 的使用示例如下，详细见 :ref:`api_trainer_config_helpers
         
 - `pooling_type` 目前支持两种，分别是：MaxPooling()和AvgPooling()。
 
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：双层序列经过运算变成一个0层序列，或单层序列经过运算变成一个0层序列
   - 输入：一个双层序列，或一个单层序列
@@ -54,7 +54,7 @@ last_seq 的使用示例如下（ :ref:`api_trainer_config_helpers_layers_first_
         last = last_seq(input=layer,
                         agg_level=AggregateLevel.EACH_SEQUENCE)
         
-- `agg_level=AggregateLevel.TIMESTEP` 时（默认值）：
+- `agg_level=AggregateLevel.EACH_TIMESTEP` 时（默认值）：
 
   - 作用：一个双层序列经过运算变成一个0层序列，或一个单层序列经过运算变成一个0层序列
   - 输入：一个双层序列或一个单层序列
diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md
index b04bfba590de42956dfe99256cde325b24adbfab..a6422b9be00e210a6a305260585520acd72fb2f1 100644
--- a/doc/howto/usage/k8s/k8s_aws_en.md
+++ b/doc/howto/usage/k8s/k8s_aws_en.md
@@ -2,25 +2,16 @@
 
 ## Create AWS Account and IAM Account
 
-To use AWS, we need to sign up an AWS account on Amazon's Web site.
-An AWS account allows us to login to the AWS Console Web interface to
-create IAM users and user groups. Usually, we create a user group with
-privileges required to run PaddlePaddle, and we create users for
-those who are going to run PaddlePaddle and add these users into the
-group. IAM users can identify themselves using password and tokens,
-where passwords allows users to log in to the AWS Console, and tokens
-make it easy for users to submit and inspect jobs from the command
-line.
+Under each AWS account, we can create multiple [IAM](http://docs.aws.amazon.com/IAM/latest/UserGuide/introduction.html) users. This allows us to grant some privileges to each IAM user and to create/operate AWS clusters as an IAM user.
 
 To sign up an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
-To create users and user groups under an AWS account, please
+To create IAM users and user groups under an AWS account, please
 follow
 [this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
 
-Please be aware that this tutorial needs the following privileges in
-the user group:
+Please be aware that this tutorial needs the following privileges for the user in IAM:
 
 - AmazonEC2FullAccess
 - AmazonS3FullAccess
@@ -31,14 +22,7 @@ the user group:
 - IAMUserSSHKeys
 - IAMFullAccess
 - NetworkAdministrator
-
-
-By the time we write this tutorial, we noticed that Chinese AWS users
-might suffer from authentication problems when running this tutorial.
-Our solution is that we create a VM instance with the default Amazon
-AMI and in the same zone as our cluster runs, so we can SSH to this VM
-instance as a tunneling server and control our cluster and jobs from
-it.
+- AWSKeyManagementServicePowerUser
 
 
 ## PaddlePaddle on AWS
@@ -46,9 +30,11 @@ it.
 Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
 
 
-###Download kube-aws and kubectl
+### Download kube-aws and kubectl
+
+#### kube-aws
 
-####kube-aws
+[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS.
 
 Import the CoreOS Application Signing Public Key:
 
@@ -63,7 +49,7 @@ gpg2 --fingerprint FC8A365E
 ```
 The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
 
-Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+We can download `kube-aws` from its [release page](https://github.com/coreos/kube-aws/releases). In this tutorial, we use version 0.9.1.
 
 Validate the tarball's GPG signature:
 
@@ -88,24 +74,30 @@ mv ${PLATFORM}/kube-aws /usr/local/bin
 ```
 
 
-####kubectl
+#### kubectl
 
-Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
+[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters.
 
-Extract the tarball and then concate the kubernetes binaries directory into PATH:
+Download `kubectl` from the Kubernetes release artifact site with the `curl` tool.
 
 ```
-export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
+# OS X
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/darwin/amd64/kubectl
 
+# Linux
+curl -O https://storage.googleapis.com/kubernetes-release/release/"$(curl -s https://storage.googleapis.com/kubernetes-release/release/stable.txt)"/bin/linux/amd64/kubectl
 ```
 
-User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth.
+Make the kubectl binary executable and move it to your PATH (e.g. `/usr/local/bin`):
 
+```
+chmod +x ./kubectl
+sudo mv ./kubectl /usr/local/bin/kubectl
+```
 
-###Configure AWS Credentials
-
-First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+### Configure AWS Credentials
 
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface.
 
 And then configure your AWS account information:
 
@@ -115,44 +107,49 @@ aws configure
 ```
 
 
-Fill in the required fields (You can get your AWS aceess key id and AWS secrete access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+Fill in the required fields:
 
 
 ```
 AWS Access Key ID: YOUR_ACCESS_KEY_ID
 AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
-Default region name: us-west-2
+Default region name: us-west-1
 Default output format: json
-
 ```
 
-Test that your credentials work by describing any instances you may already have running on your account:
+`YOUR_ACCESS_KEY_ID` and `YOUR_SECRETE_ACCESS_KEY` are the IAM access key ID and secret access key from [Create AWS Account and IAM Account](#create-aws-account-and-iam-account).
+
+Verify that your credentials work by describing any instances you may already have running on your account:
 
 ```
 aws ec2 describe-instances
 ```
 
-###Define Cluster Parameters
+### Define Cluster Parameters
 
-####EC2 key pair
+#### EC2 key pair
 
 The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
 
-After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+Follow the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create an EC2 key pair.
+
+After creating a key pair, you will use the key pair name to configure the cluster.
 
-####KMS key
+Key pairs are only available to EC2 instances in the same region. We are using us-west-1 in our tutorial, so make sure to create key pairs in that region (N. California).
+
+#### KMS key
 
 Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
 
 You can create a KMS key in the AWS console, or with the aws command line tool:
 
 ```
-$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+aws kms --region=us-west-1 create-key --description="kube-aws assets"
 {
     "KeyMetadata": {
         "CreationDate": 1458235139.724,
         "KeyState": "Enabled",
-        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "Arn": "arn:aws:kms:us-west-1:aaaaaaaaaaaaa:key/xxxxxxxxxxxxxxxxxxx",
         "AWSAccountId": "xxxxxxxxxxxxx",
         "Enabled": true,
         "KeyUsage": "ENCRYPT_DECRYPT",
@@ -162,11 +159,13 @@ $ aws kms --region=us-west-2 create-key --description="kube-aws assets"
 }
 ```
 
-You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+We will need to use the value of `Arn` later.
 
 And then you need to add several inline policies in your user permission.
 
-kms inline policy:
+Go to IAM user page, click on `Add inline policy` button, and then select `Custom Policy`
+
+Paste in the following inline policies:
 
 ```
 {
@@ -180,18 +179,10 @@ kms inline policy:
                 "kms:Encrypt"
             ],
             "Resource": [
-                "arn:aws:kms:*:xxxxxxxxx:key/*"
+                "arn:aws:kms:*:AWS_ACCOUNT_ID:key/*"
             ]
-        }
-    ]
-}
-```
-cloudformation inline policy:
-
-```
-"Version": "2012-10-17",
-    "Statement": [
-        {
+        },
+        {
             "Sid": "Stmt1482205746000",
             "Effect": "Allow",
             "Action": [
@@ -200,26 +191,43 @@ cloudformation inline policy:
                 "cloudformation:DeleteStack",
                 "cloudformation:DescribeStacks",
                 "cloudformation:DescribeStackResource",
-                "cloudformation:GetTemplate"
+                "cloudformation:GetTemplate",
+                "cloudformation:DescribeStackEvents"
             ],
             "Resource": [
-                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+                "arn:aws:cloudformation:us-west-1:AWS_ACCOUNT_ID:stack/MY_CLUSTER_NAME/*"
             ]
         }
     ]
 }
 ```
 
+`AWS_ACCOUNT_ID`: You can get it with the following command:
+
+```
+aws sts get-caller-identity --output text --query Account
+```
 
-####External DNS name
+`MY_CLUSTER_NAME`: Pick any cluster name you like; you will use it again later.
 
-When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+#### External DNS name
 
-####S3 bucket
+When the cluster is created, the controller will expose the TLS-secured API on a DNS name.
+
+The A record of that DNS name needs to point to the cluster IP address.
+
+We will need to use the DNS name later in this tutorial. If you don't already own one, you can choose any DNS name (e.g., `paddle`) and modify `/etc/hosts` to associate the cluster IP address with that DNS name.
+
+#### S3 bucket
 
 You need to create an S3 bucket before startup the Kubernetes cluster.
 
-####Initialize an asset directory
+There are some bugs in aws cli in creating S3 bucket, so let's use the [Web console](https://console.aws.amazon.com/s3/home?region=us-west-1).
+
+Click on `Create Bucket`, fill in a unique BUCKET_NAME, and make sure region is us-west-1 (Northern California).
+
+
+#### Initialize an asset directory
 
 Create a directory on your local machine to hold the generated assets:
 
@@ -231,29 +239,44 @@ $ cd my-cluster
 Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
 
 ```
-$ kube-aws init \
---cluster-name=my-cluster-name \
---external-dns-name=my-cluster-endpoint \
+kube-aws init \
+--cluster-name=MY_CLUSTER_NAME \
+--external-dns-name=MY_EXTERNAL_DNS_NAME \
 --region=us-west-1 \
---availability-zone=us-west-1c \
---key-name=key-pair-name \
---kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+--availability-zone=us-west-1a \
+--key-name=KEY_PAIR_NAME \
+--kms-key-arn="arn:aws:kms:us-west-1:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
 ```
 
+`MY_CLUSTER_NAME`: the one you picked in [KMS key](#kms-key)
+
+`MY_EXTERNAL_DNS_NAME`: see [External DNS name](#external-dns-name)
+
+`KEY_PAIR_NAME`: see [EC2 key pair](#ec2-key-pair)
+
+`--kms-key-arn`: the "Arn" in [KMS key](#kms-key)
+
+Here `us-west-1a` is used for parameter `--availability-zone`, but supported availability zone varies among AWS accounts.
+
+Please check whether `us-west-1a` is supported by running `aws ec2 --region us-west-1 describe-availability-zones`; if it is not, switch to another supported availability zone (e.g., `us-west-1b`).
+
+Note: please don't use `us-west-1c`. Subnets can currently only be created in the following availability zones: us-west-1b, us-west-1a.
+
 There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
 
-####Render contents of the asset directory
+
+#### Render contents of the asset directory
 
 In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
 
 ```
-$ kube-aws render credentials --generate-ca
+kube-aws render credentials --generate-ca
 ```
 
 The next command generates the default set of cluster assets in your asset directory.
 
 ```
-sh $ kube-aws render stack
+kube-aws render stack
 ```
 
 Here's what the directory structure looks like:
@@ -285,47 +308,62 @@ $ tree
 These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
 
 
-###Kubernetes Cluster Start Up
+### Kubernetes Cluster Start Up
 
-####Create the instances defined in the CloudFormation template
+#### Create the instances defined in the CloudFormation template
 
-Now for the exciting part, creating your cluster:
+Now let's create your cluster (choose any PREFIX for the command below):
 
 ```
-$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+kube-aws up --s3-uri s3://BUCKET_NAME/PREFIX
 ```
 
-####Configure DNS
+`BUCKET_NAME`: the bucket name that you used in [S3 bucket](#s3-bucket)
 
-You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
 
-####Access the cluster
+#### Configure DNS
 
-Once the API server is running, you should see:
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation.
 
 ```
-$ kubectl --kubeconfig=kubeconfig get nodes
-NAME                                       STATUS                     AGE
-ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
-ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
-ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+$ kube-aws status
+Cluster Name:		paddle-cluster
+Controller DNS Name:	paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com
 ```
 
+Use command `dig` to check the load balancer hostname to get the ip address.
 
-###Setup PaddlePaddle Environment on AWS
+```
+$ dig paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com
 
-Now, we've created a cluster with following network capability:
+;; QUESTION SECTION:
+;paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com. IN A
 
-1. All Kubernetes nodes can communicate with each other.
+;; ANSWER SECTION:
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com. 59 IN A 54.241.164.52
+paddle-cl-ElbAPISe-EEOI3EZPR86C-531251350.us-west-1.elb.amazonaws.com. 59 IN A 54.67.102.112
+```
 
-1. All Docker containers on Kubernetes nodes can communicate with each other.
+In the above output, either IP address, `54.241.164.52` or `54.67.102.112`, will work.
 
-1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
+If you own a DNS name, set its A record to either of the above IP addresses. Otherwise, you can edit `/etc/hosts` to associate the IP address with the DNS name.
 
-1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
+#### Access the cluster
 
+Once the API server is running, you should see:
 
-For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS                     AGE
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+```
+
+
+### Setup Elastic File System for Cluster
+
+Training data is usually served from a distributed filesystem; on AWS we use Elastic File System (EFS). Ceph might be a better solution, but it requires a recent Linux kernel version that might not be stable enough at this moment. We haven't automated the EFS setup yet, so please do the following steps:
 
 
 1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
@@ -342,57 +380,71 @@ For sharing the training data across all the Kubernetes nodes, we use EFS (Elast
 <center>![](src/efs_mount.png)</center>
 
 
-Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+We will place user config and divided training data onto EFS. Training task will cache related files by copying them from EFS into container. It will also write the training results back onto EFS. We will show you how to place the data later in this article.
+
+
+
+### Core Concepts of PaddlePaddle Training on AWS
+
+Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume. In this training demo, we will create three Kubernetes pods and schedule them on three nodes. Each pod contains a PaddlePaddle container. When container gets created, it will start parameter server (pserver) and trainer process, load the training data from EFS volume and start the distributed training task.
 
+#### Distributed Training Job
 
+A distributed training job is represented by a [kubernetes job](https://kubernetes.io/docs/user-guide/jobs/#what-is-a-job).
 
-###Core Concept of PaddlePaddle Training on AWS
+Each Kubernetes job is described by a job config file, which specifies information such as the number of pods in the job and environment variables.
 
-Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume, in this training demo, we will create three Kubernetes pod and scheduling them on 3 node. Each pod contains a PaddlePaddle container. When container gets created, it will start pserver and trainer process, load the training data from EFS volume and start the distributed training task.
+In a distributed training job, we would:
 
-####Use Kubernetes Job
+1. upload the partitioned training data and configuration file onto EFS volume, and
+1. create and submit the Kubernetes job config to the Kubernetes cluster to start the training job.
 
-We use Kubernetes job to represent one time of distributed training. After the job get finished, Kubernetes will destroy job container and release all related resources.
+#### Parameter Servers and Trainers
 
-We can write a yaml file to describe the Kubernetes job. The file contains lots of configuration information, for example PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc., these information are passed into container for processes to use as environment variables.
+There are two roles in a PaddlePaddle cluster: `parameter server` and `trainer`. Each parameter server process maintains a shard of the global model. Each trainer has its local copy of the model, and uses its local data to update the model. During the training process, trainers send model updates to parameter servers, parameter servers are responsible for aggregating these updates, so that trainers can synchronize their local copy with the global model.
 
-In one time of distributed training, user will confirm the PaddlePaddle node number first. And then upload the pre-divided training data and configuration file onth EFS volume. And then create the Kubernetes job yaml file; submit to the Kubernetes cluster to start the training job.
+<center>![Model is partitioned into two shards. Managed by two parameter servers respectively.](src/pserver_and_trainer.png)</center>
 
-####Create PaddlePaddle Node
+In order to communicate with pserver, trainer needs to know the ip address of each pserver. In kubernetes it's better to use a service discovery mechanism (e.g., DNS hostname) rather than static ip address, since any pserver's pod may be killed and a new pod could be scheduled onto another node with a different ip address. We will improve paddlepaddle's service discovery ability. For now we will use static ip.
 
-After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number), Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+Parameter server and trainer are packaged into a same docker image. They will run once pod is scheduled by kubernetes job.
 
+#### Trainer ID
 
-####Start up Training
+Each trainer process requires a trainer ID, a zero-based index value, passed in as a command-line parameter. The trainer process thus reads the data partition indexed by this ID.
 
-After container gets started, it starts up the distributed training by using scripts. We know `paddle train` process need to know other node's ip address and it's own trainer_id, since PaddlePaddle currently don't have the ability to do the service discovery, so in the start up script, each node will use job pod's name to query all to pod info from Kubernetes apiserver (apiserver's endpoint is an environment variable in container by default).
+#### Training
 
-With pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod's ip, and assign the index to each PaddlePaddle node as it's trainer_id. The workflow of starting up the script is as follows:
+The entry-point of a container is a Python script. As it runs in a pod, it can see some environment variables pre-defined by Kubernetes. This includes one that gives the job's identity, which can be used in a remote call to the Kubernetes apiserver that lists all pods in the job.
 
-1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
+We rank each pod by sorting them by their ips. The rank of each pod could be the "pod ID". Because we run one trainer and one parameter server in each pod, we can use this "pod ID" as the trainer ID. A detailed workflow of the entry-point script is as follows:
+
+1. Query the api server to get pod information, and assign the `trainer_id` by sorting the ip.
 1. Copy the training data from EFS sharing volume into container.
-1. Parse the `paddle pserver` and 'paddle trainer' startup parameters from environment variables, and then start up the processes.
-1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+1. Parse the `paddle pserver` and `paddle trainer` startup parameters from environment variables, and then start up the processes.
+1. Trainer with `trainer_id` 0 will automatically write results onto EFS volume.
 
 
-###Start PaddlePaddle Training Demo on AWS
+### Start PaddlePaddle Training Demo on AWS
 
 Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
 
 1. Build PaddlePaddle Docker image.
 1. Divide the training data file and upload it onto the EFS sharing volume.
-1. Create the training job yaml file, and start up the job.
+1. Create the training job config file, and start up the job.
 1. Check the result after training.
 
-####Build PaddlePaddle Docker Image
+#### Build PaddlePaddle Docker Image
 
-PaddlePaddle docker image need to provide the runtime environment for `paddle pserver` and `paddle train`, so the container use this image should have two main function:
+PaddlePaddle docker image needs to provide the runtime environment for `pserver` and `trainer`, so the container using this image should have two main functions:
 
 1. Copy the training data into container.
-1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+1. Generate the startup parameter for `pserver` and `trainer` process, and startup the training.
+
 
+We need to create a new image since the official `paddledev/paddle:cpu-latest` only has the PaddlePaddle binary, but lacks the above functionalities.
 
-Since official `paddledev/paddle:cpu-latest` have already included the PaddlePaddle binary, but lack of the above functionalities, so we will create the startup script based on this image, to achieve the work above. the detailed Dockerfile is as follows:
+Dockerfile for creating the new image is as follows:
 
 ```
 FROM paddledev/paddle:cpu-latest
@@ -481,7 +533,7 @@ And then push the built image onto docker registry.
 docker push  your_repo/paddle:mypaddle
 ```
 
-####Upload Training Data File
+#### Upload Training Data File
 
 Here we will use PaddlePaddle's official recommendation demo as the content for this training, we put the training data file into a directory named by job name, which located in EFS sharing volume, the tree structure for the directory looks like:
 
@@ -498,10 +550,10 @@ efs
     └── recommendation
 ```
 
-The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle node, we store the pre-divided data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. the training data in in recommendation directory, the training results and logs will be in the output directory.
+The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle nodes, we store the partitioned data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. The training data is in the recommendation directory, the training results and logs will be in the output directory.
 
 
-####Create Kubernetes Job
+#### Create Kubernetes Job
 
 Kubernetes use yaml file to describe job details, and then use command line tool to create the job in Kubernetes cluster.
 
@@ -583,7 +635,7 @@ After we execute the above command, Kubernetes will create 3 pods and then pull
 
 
 
-####Check Training Results
+#### Check Training Results
 
 During the training, we can see the logs and models on EFS sharing volume, the output directory contains the training results. (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node)
 
@@ -640,7 +692,7 @@ I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7
 It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2 core 8 GB EC2 machine (m3.large).
 
 
-###Kubernetes Cluster Tear Down
+### Kubernetes Cluster Tear Down
 
 
 If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stucked on following steps), and then use the following command:
@@ -651,16 +703,3 @@ kube-aws destroy
 It's an async call, it might take 5 min to tear down the whole cluster.
 
 If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
-
-
-
-## For Experts with Kubernetes and AWS
-
-Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
-
-### Some Presumptions
-
-* Instances run on CoreOS, the official IAM.
-* Kubernetes node use instance storage, no EBS get mounted. Etcd is running on additional node.
-* For networking, we use Flannel network at this moment, we will use Calico solution later on.
-* When you create a service with Type=LoadBalancer, Kubernetes will create and ELB, and create a security group for the ELB.
diff --git a/doc/howto/usage/k8s/src/pserver_and_trainer.png b/doc/howto/usage/k8s/src/pserver_and_trainer.png
new file mode 100644
index 0000000000000000000000000000000000000000..f41fe48920590333ad332bb51eb18e03dc251541
Binary files /dev/null and b/doc/howto/usage/k8s/src/pserver_and_trainer.png differ
diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp
index fde48a73b61c31d06225cc1763efbc6971c86f57..5d595deb12c6c8ea419dd1f31b3c131a2f6a587a 100644
--- a/paddle/function/BufferArg.cpp
+++ b/paddle/function/BufferArg.cpp
@@ -20,23 +20,27 @@ limitations under the License. */
 namespace paddle {
 
 const SequenceArg& BufferArg::sequence() const {
-  // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
+  CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA);
   return dynamic_cast<const SequenceArg&>(*this);
 }
 
 const SparseMatrixArg& BufferArg::sparse() const {
-  // CHECK_EQ(bufferType_, TENSOR_SPARSE);
+  CHECK_EQ(bufferType_, TENSOR_SPARSE);
   return dynamic_cast<const SparseMatrixArg&>(*this);
 }
 
 SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+  bufferType_ = TENSOR_SPARSE;
+}
 
 SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType)
     : BufferArg(sparse, argType),
       row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32),
-      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {}
+      col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {
+  bufferType_ = TENSOR_SPARSE;
+}
 
 }  // namespace paddle
diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h
index 12352ba29e33920ba65bd66088b6f7cc53517b52..84209265ce7634121e3e4dde609cd787093c45ec 100644
--- a/paddle/function/BufferArg.h
+++ b/paddle/function/BufferArg.h
@@ -23,10 +23,11 @@ limitations under the License. */
 namespace paddle {
 
 enum BufferType {
-  TENSOR_NORMAL = 0,
-  TENSOR_SEQUENCE_ID = 1,
-  TENSOR_SEQUENCE_DATA = 2,
-  TENSOR_SPARSE = 3
+  TENSOR_UNKNOWN = 0,
+  TENSOR_NORMAL = 1,
+  TENSOR_SEQUENCE_ID = 2,
+  TENSOR_SEQUENCE_DATA = 3,
+  TENSOR_SPARSE = 4
 };
 
 enum SparseDataType {
@@ -39,7 +40,6 @@ enum SparseDataFormat { SPARSE_CSR_FORMAT = 0, SPARSE_CSC_FORMAT = 1 };
 class BufferArg;
 class SequenceArg;
 class SparseMatrixArg;
-typedef std::shared_ptr<BufferArg> BufferArgPtr;
 
 /**
  * \brief BufferArg used as the argument type of Function.
@@ -50,6 +50,11 @@ typedef std::shared_ptr<BufferArg> BufferArgPtr;
  * 3. SequenceArg for a Buffer of sequence data.
  * 4. SparseMatrixArg for a Buffer of sparse matrix.
  *
+ * Buffer shape
+ * For most buffers, the first dimension `shape()[0]` represents
+ * the size of the mini-batch.
+ *
+ * Buffer argType
  * There is an ArgType property for the BufferArg used as Function Output.
  * Whether the result of the Function calculation is assigned to the
  * output Buffer or added to the output Buffer is determined by the
@@ -71,6 +76,14 @@ public:
   ArgType getArgType() const { return argType_; }
 
 public:
+  BufferArg(ValueType valueType,
+            const TensorShape& shape,
+            ArgType argType = UNSPECIFIED)
+      : buf_(nullptr),
+        valueType_(valueType),
+        shape_(shape),
+        argType_(argType) {}
+
   BufferArg(void* buf,
             ValueType valueType,
             const TensorShape& shape,
@@ -86,6 +99,7 @@ public:
         valueType_(DataType<real>::value),
         shape_(2),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, matrix.getHeight());
     shape_.setDim(1, matrix.getWidth());
   }
@@ -98,6 +112,7 @@ public:
         valueType_(DataType<real>::value),
         shape_(shape),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     CHECK_EQ(matrix.getElementCnt(), shape.getElements());
   }
 
@@ -107,6 +122,7 @@ public:
         valueType_(DataType<real>::value),
         shape_(1),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, vector.getSize());
   }
 
@@ -116,6 +132,7 @@ public:
         valueType_(VALUE_TYPE_INT32),
         shape_(1),
         argType_(argType) {
+    bufferType_ = TENSOR_NORMAL;
     shape_.setDim(0, vector.getSize());
   }
 
@@ -150,6 +167,8 @@ public:
   ValueType valueType() const { return valueType_; }
   BufferType bufferType() const { return bufferType_; }
   const TensorShape& shape() const { return shape_; }
+  bool isSparse() const { return (TENSOR_SPARSE == bufferType_); }
+  bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; }
 
   const SequenceArg& sequence() const;
   const SparseMatrixArg& sparse() const;
@@ -158,8 +177,8 @@ protected:
   void* buf_;
   ValueType valueType_;
   TensorShape shape_;
-  BufferType bufferType_;
-  ArgType argType_ = UNSPECIFIED;
+  BufferType bufferType_{TENSOR_UNKNOWN};
+  ArgType argType_{UNSPECIFIED};
   // leading dimensions. The size is dims_.size()
   // Dims lds_;
 };
@@ -170,15 +189,24 @@ protected:
 // if a < b then value_.buf_[a] < value_.buf_[b]
 class SequenceIdArg : public BufferArg {
 public:
+  SequenceIdArg(const TensorShape& shape, ArgType argType = UNSPECIFIED)
+      : BufferArg(VALUE_TYPE_INT32, shape, argType) {
+    CHECK_EQ(shape_.ndims(), (size_t)1);
+    CHECK_GT(shape_[0], 1);
+    numSeqs_ = shape_[0] - 1;
+  }
+
   SequenceIdArg(void* buf,
                 const TensorShape& shape,
                 ArgType argType = UNSPECIFIED)
       : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
     CHECK_EQ(shape_.ndims(), (size_t)1);
     numSeqs_ = shape_[0] - 1;
   }
 
   SequenceIdArg(const IVector& vector) : BufferArg(vector) {
+    bufferType_ = TENSOR_SEQUENCE_ID;
     numSeqs_ = shape_[0] - 1;
   }
 
@@ -190,26 +218,41 @@ private:
   size_t numSeqs_;
 };
 
-// sequence data
+// sequences data
+// For mini-batch calculation,
+// one batch can contain more than one sequence of data.
+// SequenceArg can be used to represent multiple sequences
+// of unequal lengths.
 class SequenceArg : public BufferArg {
 public:
+  SequenceArg(ValueType valueType,
+              const TensorShape& shape,
+              ArgType argType = UNSPECIFIED)
+      : BufferArg(valueType, shape, argType), startPositions_(TensorShape()) {}
+
   SequenceArg(void* buf,
               ValueType valueType,
               const TensorShape& shape,
               const SequenceIdArg& startPositions,
               ArgType argType = UNSPECIFIED)
       : BufferArg(buf, valueType, shape, argType),
-        startPositions_(startPositions) {}
+        startPositions_(startPositions) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
 
   SequenceArg(const Matrix& matrix,
               const IVector& vector,
               ArgType argType = UNSPECIFIED)
-      : BufferArg(matrix, argType), startPositions_(vector) {}
+      : BufferArg(matrix, argType), startPositions_(vector) {
+    bufferType_ = TENSOR_SEQUENCE_DATA;
+  }
 
   ~SequenceArg() {}
 
   void* getIdBuf() const { return startPositions_.data(); }
   size_t numSeqs() const { return startPositions_.numSeqs(); }
+  SequenceIdArg& getSequenceId() { return startPositions_; }
+  const SequenceIdArg& getSequenceId() const { return startPositions_; }
 
 private:
   SequenceIdArg startPositions_;
@@ -235,6 +278,7 @@ public:
         nnz_(nnz),
         format_(format),
         type_(type) {
+    bufferType_ = TENSOR_SPARSE;
     CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE));
     CHECK_EQ(shape_.ndims(), (size_t)2);
     CHECK_EQ(row_.shape().ndims(), (size_t)1);
diff --git a/paddle/function/BufferArgTest.cpp b/paddle/function/BufferArgTest.cpp
index b345597435c9911ce95b596f5f7f2add47f4cd03..1744f377808f137dcda4a28acce336dc22be3d01 100644
--- a/paddle/function/BufferArgTest.cpp
+++ b/paddle/function/BufferArgTest.cpp
@@ -14,9 +14,7 @@ limitations under the License. */
 
 #include "BufferArg.h"
 #include <gtest/gtest.h>
-#include "Function.h"
 #include "paddle/math/MemoryHandle.h"
-#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -37,55 +35,4 @@ TEST(BufferTest, SequenceIdArg) {
   EXPECT_EQ(buffer.numSeqs(), 9);
 }
 
-TEST(BufferTest, asArgument) {
-  MatrixPtr matrix = Matrix::create(100, 200);
-  VectorPtr vector = Vector::create(100, false);
-  CpuSparseMatrix sparse(200, 300, 50);
-
-  // prepare arguments
-  BufferArgs argments;
-  argments.addArg(*matrix);
-  argments.addArg(*vector);
-  argments.addArg(sparse);
-
-  // function
-  auto function = [=](const BufferArgs& inputs) {
-    EXPECT_EQ(inputs.size(), 3);
-
-    // check inputs[0]
-    EXPECT_EQ(inputs[0].shape().ndims(), 2);
-    EXPECT_EQ(inputs[0].shape()[0], 100);
-    EXPECT_EQ(inputs[0].shape()[1], 200);
-    EXPECT_EQ(inputs[0].data(), matrix->getData());
-
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getHeight(),
-              matrix->getHeight());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getWidth(),
-              matrix->getWidth());
-    EXPECT_EQ(inputs[0].matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
-
-    // check inputs[1]
-    EXPECT_EQ(inputs[1].shape().ndims(), 1);
-    EXPECT_EQ(inputs[1].shape()[0], 100);
-    EXPECT_EQ(inputs[1].data(), vector->getData());
-    CpuVector inVector = inputs[1].vector<real, DEVICE_TYPE_CPU>();
-    EXPECT_EQ(inVector.getSize(), vector->getSize());
-    EXPECT_EQ(inVector.getData(), vector->getData());
-
-    // check inputs[2]
-    EXPECT_EQ(inputs[2].shape().ndims(), 2);
-    EXPECT_EQ(inputs[2].shape()[0], 200);
-    EXPECT_EQ(inputs[2].shape()[1], 300);
-    EXPECT_EQ(inputs[2].data(), sparse.getData());
-    // CHECK_EQ(inputs[2].sparse().nnz(), 50);
-    // CHECK_EQ(inputs[2].sparse().dataFormat(), SPARSE_CSR_FORMAT);
-    // CHECK_EQ(inputs[2].sparse().dataType(), SPARSE_FLOAT_VALUE);
-    EXPECT_EQ(inputs[2].sparse().getRowBuf(), sparse.getRows());
-    EXPECT_EQ(inputs[2].sparse().getColBuf(), sparse.getCols());
-  };
-
-  // call function
-  function(argments);
-}
-
 }  // namespace paddle
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 24e7074075f50e13c274b8bdfc0275b00e4abe50..6d20868072c3acaab2c5f9381bad5ea99d841d26 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -19,13 +19,13 @@ if(WITH_TESTING)
     # TODO:
     # file(GLOB test_files . *OpTest.cpp)
     # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
-    # add_simple_unittest(CrossMapNormalOpTest)
+    add_simple_unittest(CrossMapNormalOpTest)
     add_simple_unittest(TensorShapeTest)
     add_simple_unittest(TensorTypeTest)
     add_simple_unittest(BufferArgTest)
     add_simple_unittest(FunctionTest)
+    add_simple_unittest(ContextProjectionOpTest)
     add_simple_unittest(PadOpTest)
-    # add_simple_unittest(ContextProjectionOpTest)
 endif()
 endif()
 
diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp
index cb448562ebb37022f727ee65024f06f69d63e9cb..6cd4e4abee8fccf3a4745b0bfc6701df4ddfa5c0 100644
--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
@@ -17,7 +17,10 @@ limitations under the License. */
 #include "paddle/math/Vector.h"
 
 namespace paddle {
-
+/**
+ * Context Projection Forward with CPU Matrix Device.
+ *
+ */
 template <>
 void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
                                                const CpuMatrix& input_mat,
@@ -70,10 +73,30 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat,
 }
 
 /**
- * \param inputs[0] input value.
- * \param inputs[1] input weight.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * Paddle Function for Context Projection Forward.
+ * Calculate the output layer value sequence after context projection.
+ *
+ * What is Context Projection for a sequence?
+ * For example, assumed input (x) has 4 words and the dimension of each word
+ * representation is 2. If we use zero to pad instead of learned weight to pad,
+ * and the context_length is 3, the output (y) is:
+ *
+ * @code
+ *  x = [a1, a2;
+ *       b1, b2;
+ *       c1, c2;
+ *       d1, d2]
+ *  y = [0,  0,  a1, a2, b1, b2;
+ *       a1, a2, b1, b2, c1, c2;
+ *       b1, b2, c1, c2, d1, d2;
+ *       c1, c2, d1, d2, 0,  0]
+ * @endcode
+ *
+ * \param outputs[0].matrix   output layer value, n * (d * l)
+ * \param outputs[0].vector   start position sequence, n * 1
+ * \param inputs[0].matrix    input layer value, n * d
+ * \param inputs[0].vector    start position sequence, n * 1
+ * \param inputs[1].matrix    input layer weight, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionForwardFunc : public FunctionBase {
@@ -85,28 +108,37 @@ public:
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
+    CHECK(1 == inputs.size() || 2 == inputs.size());
     CHECK_EQ((size_t)1, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
 
-    CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data());
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data());
+    CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
+    CHECK_EQ(val_seqs.shape().ndims(), (size_t)2);
+    CHECK_EQ(val_seqs.getSequenceId().shape().ndims(), (size_t)1);
+    if (2 == inputs.size()) {
+      CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
+    }
     /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
+    CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
+    CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]);
+    /// dim of input == dim of weight
+    if (2 == inputs.size()) {
+      CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]);
+    }
 
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-    auto out_mat = outputs[0].matrix<Device>();
-    auto in_mat = inputs[0].matrix<Device>();
-    auto w_mat = !inputs[1].data()
-                     ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                     : inputs[1].matrix<Device>();
-    auto seq_vec = inputs[2].vector<int, Device>();
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+    auto out_mat = out_seq.matrix<Device>();
+    const auto in_mat = val_seqs.matrix<Device>();
+    const auto w_mat =
+        (2 == inputs.size())
+            ? inputs[1].matrix<Device>()
+            : typename Tensor<real, Device>::Matrix(nullptr, 0, 0);
+    const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>();
     ContextProjectionForward<Device>(out_mat,
                                      in_mat,
                                      w_mat,
@@ -122,8 +154,12 @@ private:
   size_t begin_pad_;
 };
 
+/**
+ * Context Projection Backward with CPU Matrix Device.
+ *
+ */
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat,
                                                 CpuMatrix& in_grad_mat,
                                                 CpuMatrix& w_grad_mat,
                                                 const CpuIVector& seq_vec,
@@ -146,7 +182,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
         int64_t pad_size =
             std::min(starts[i] - begin, starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i], pad_size);
           MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
         }
@@ -157,8 +194,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
         int64_t pad_size =
             std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
         if (is_padding && w_grad_mat) {
-          MatrixPtr mat =
-              out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat)
+                              .subMatrix(starts[i + 1] - pad_size, pad_size);
           MatrixPtr sub = w_grad_mat.subMatrix(
               begin_pad + context_start + j - pad_size, pad_size);
           sub->addAtOffset(*mat, j * input_dim);
@@ -169,17 +206,22 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat,
       if (end <= begin) continue;
       if (!in_grad_mat) continue;
       MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin);
-      MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin);
+      MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat)
+                          .subMatrix(dst_begin, dst_end - dst_begin);
       src->addAtOffset(*dst, j * input_dim);
     }
   }
 }
 
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] weight grad.
- * \param inputs[2] input sequence.
- * \param outputs[0] output value.
+ * Context Projection Backward Function.
+ * Update the weight gradient and input layer gradient with backprop
+ *
+ * \param inputs[0].matrix          output layer grad, n * (d * l)
+ * \param inputs[0].vector          start position sequence, n * 1
+ * \param outputs[0].matrix         input layer grad, n * d
+ * \param outputs[0].vector         start position sequence, n * 1
+ * \param outputs[1]                weight grad, pad * d
  */
 template <DeviceType Device>
 class ContextProjectionBackwardFunc : public FunctionBase {
@@ -193,32 +235,36 @@ public:
   }
 
   void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
-    CHECK_EQ((size_t)3, inputs.size());
-    CHECK_EQ((size_t)1, outputs.size());
+    CHECK_EQ((size_t)1, inputs.size());
+    CHECK_EQ((size_t)2, outputs.size());
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(in_seq.shape().ndims(), (size_t)2);
+    CHECK_EQ(in_seq.getSequenceId().shape().ndims(), (size_t)1);
+    CHECK_EQ(out_seq.shape().ndims(), (size_t)2);
+    CHECK_EQ(out_seq.getSequenceId().shape().ndims(), (size_t)1);
+    CHECK_EQ(outputs[1].shape().ndims(), (size_t)2);
 
-    CHECK(outputs[0].data() && inputs[2].data());
-    CHECK_EQ(outputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[0].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[1].shape().ndims(), (size_t)2);
-    CHECK_EQ(inputs[2].shape().ndims(), (size_t)1);
+    /// dim of input grad == dim of weight
+    CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]);
+    /// input and output grad has the same batch_size
+    CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]);
+    /// dim of output grad = dim of input grad * context_length
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
+    CHECK_EQ(out_seq.getArgType(), ADD_TO);
+    CHECK_EQ(outputs[1].getArgType(), ADD_TO);
 
-    /// dim of input == dim of weight
-    CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]);
-    /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]);
-    /// dim of output = dim of input * context_length
-    CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_);
-
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
-
-    auto out_grad_mat = outputs[0].matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
     auto in_grad_mat =
-        !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[0].matrix<Device>();
-    auto w_grad_mat = !inputs[1].data()
+        !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
+                        : out_seq.matrix<Device>();
+    auto w_grad_mat = !outputs[1].data()
                           ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0)
-                          : inputs[1].matrix<Device>();
-    auto seq_vec = inputs[2].vector<int, Device>();
+                          : outputs[1].matrix<Device>();
     ContextProjectionBackward<Device>(out_grad_mat,
                                       in_grad_mat,
                                       w_grad_mat,
@@ -238,11 +284,16 @@ private:
   size_t total_pad_;
 };
 
-#if 0
 /**
- * \param inputs[0] input grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * Context Projection Backward Data Function
+ * Update input layer grad
+ * input:  sequence of output layer grad
+ * output: sequence of input layer grad
+ *
+ * \param outputs[0].matrix              input layer grad, n * d
+ * \param outputs[0].vector              start position sequence, n * 1
+ * \param inputs[0].matrix               output layer grad, n * (d * l)
+ * \param inputs[0].vector               start position sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardDataFunc : public FunctionBase {
@@ -252,32 +303,30 @@ public:
     context_start_ = config.get<int>("context_start");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
     CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg())
+        << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]);
+
+    CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data());
+    CHECK_EQ(static_cast<int>(out_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1);
+    /// width (shape()[1]) of output grad == width of input grad * context_length_
+    CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_);
     /// input and output has the same batch_size
-    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]);
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
 
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    auto in_grad_mat = out_seq.matrix<Device>();
 
-    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
-                                          in_grad_mat.get(),
-                                          seq_vec,
-                                          context_length_,
-                                          context_start_);
+    ContextProjectionBackwardData<Device>(
+        out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_);
   }
 
 private:
@@ -286,9 +335,14 @@ private:
 };
 
 /**
- * \param inputs[0] weight grad.
- * \param inputs[1] input sequence.
- * \param outputs[0] output grad.
+ * Context Projection Backward Weight Function
+ * Update weight grad by backprop
+ * input:  sequence of output layer grad
+ * output: weight grad
+ *
+ * \param outputs[0]                   weight grad, pad * d
+ * \param inputs[0].matrix             output layer grad, n * (d * l)
+ * \param inputs[0].vector             start position sequence, n * 1
  */
 template <DeviceType Device>
 class ContextProjectionBackwardWeightFunc : public FunctionBase {
@@ -300,28 +354,25 @@ public:
     total_pad_ = config.get<size_t>("total_pad");
   }
 
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
-    CHECK_EQ(2, static_cast<int>(inputs.size()));
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(1, static_cast<int>(inputs.size()));
     CHECK_EQ(1, static_cast<int>(outputs.size()));
-    CHECK_EQ(0, static_cast<int>(inouts.size()));
-
-    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
-    CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2);
-    CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1);
-    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
-
-    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
-    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
-        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
-    typename SequenceT<Device>::type seq_vec(
-        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+    CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here";
+    const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]);
+    CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data());
+    CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2);
+    CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1);
+    CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]);
+    /// output layer grad dim == weight dim * context_length_
+    CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_);
+    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
 
-    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
-                                            w_grad_mat.get(),
+    const auto seq_vec = in_seq.getSequenceId().vector<int, Device>();
+    const auto out_grad_mat = in_seq.matrix<Device>();
+    auto w_grad_mat = outputs[0].matrix<Device>();
+    ContextProjectionBackwardWeight<Device>(out_grad_mat,
+                                            w_grad_mat,
                                             seq_vec,
                                             context_length_,
                                             context_start_,
@@ -335,7 +386,6 @@ private:
   size_t begin_pad_;
   size_t total_pad_;
 };
-#endif
 
 REGISTER_TYPED_FUNC(ContextProjectionForward,
                     CPU,
@@ -350,7 +400,6 @@ REGISTER_TYPED_FUNC(ContextProjectionForward,
 REGISTER_TYPED_FUNC(ContextProjectionBackward,
                     GPU,
                     ContextProjectionBackwardFunc);
-#if 0
 REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
                     GPU,
                     ContextProjectionBackwardDataFunc);
@@ -358,5 +407,4 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
                     GPU,
                     ContextProjectionBackwardWeightFunc);
 #endif
-#endif
 }  // namespace paddle
diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h
index a558df5e072f2f4dcc5c45afa385b3cf88872d26..2bdd47e4e9b02483c2c5af82bf00c4e55d68f93e 100644
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
@@ -21,14 +21,14 @@ namespace paddle {
 /**
  * \brief   Context Projection Forward.
  *
- * \param[out]  outputs           output data.
- * \param[in]   input             input data.
- * \param[in]   weight            input weight.
- * \param[in]   sequence          input data.
- * \param[in]   context_length    consecutive rows for concatenation.
- * \param[in]   context_start     context start position.
- * \param[in]   begin_pad         begining pad position.
- * \param[in]   is_padding        whether padding 0 or not.
+ * \param[in/out]  outputs           output data.
+ * \param[in]      input             input data.
+ * \param[in]      weight            input weight.
+ * \param[in]      sequence          input data.
+ * \param[in]      context_length    consecutive rows for concatenation.
+ * \param[in]      context_start     context start position.
+ * \param[in]      begin_pad         beginning pad position.
+ * \param[in]      is_padding        whether padding 0 or not.
  *
  */
 template <DeviceType DType>
@@ -56,7 +56,7 @@ void ContextProjectionForward(
  */
 template <DeviceType DType>
 void ContextProjectionBackward(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& in_grad,
     typename Tensor<real, DType>::Matrix& w_grad,
     const typename Tensor<int, DType>::Vector& seq_vec,
@@ -68,7 +68,7 @@ void ContextProjectionBackward(
 
 template <DeviceType DType>
 void ContextProjectionBackwardData(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& in_grad,
     const typename Tensor<int, DType>::Vector& sequence,
     size_t context_length,
@@ -76,7 +76,7 @@ void ContextProjectionBackwardData(
 
 template <DeviceType DType>
 void ContextProjectionBackwardWeight(
-    typename Tensor<real, DType>::Matrix& out_grad,
+    const typename Tensor<real, DType>::Matrix& out_grad,
     typename Tensor<real, DType>::Matrix& w_grad,
     const typename Tensor<int, DType>::Vector& seq_vec,
     size_t context_length,
diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu
index 6a4a01a6510416fc1f945305203f55ece7a28f11..1a5b4042402df3081a493962a5e080d72b7f40b2 100644
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
@@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output,
                                 begin_pad);
 }
 
-__global__ void KeContextProjectionBackwardData(real* out_grad,
+__global__ void KeContextProjectionBackwardData(const real* out_grad,
                                                 const int* sequence,
                                                 real* in_grad,
-                                                int input_dim,
+                                                size_t input_dim,
                                                 int context_length,
                                                 int context_start) {
   int idx = threadIdx.x;
@@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
   real value = 0;
 
   int instances = seq_end - seq_start + context_length - 1;
-  out_grad += seq_start * input_dim * context_length;
+  auto out = const_cast<real*>(out_grad);
+  out += seq_start * input_dim * context_length;
   in_grad += seq_start * input_dim;
   for (int k = 0; k <= input_dim / block_size; k++) {
     if (idx < input_dim) {
@@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
         int outx = (i - context_length) < 0 ? i : (context_length - 1);
         int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
         real* output_r =
-          out_grad + outy * input_dim * context_length + outx * input_dim;
+          out + outy * input_dim * context_length + outx * input_dim;
         for (int j = outy; j < seq_end - seq_start; j++) {
           value += output_r[idx];
           if (j - outy == outx) break;
@@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad,
  * @param[in]   context_start    context start.
  *
  */
-void hl_context_projection_backward_data(real* out_grad,
+void hl_context_projection_backward_data(const real* out_grad,
                                          const int* sequence,
                                          real* input_grad,
                                          size_t num_sequences,
@@ -216,7 +217,7 @@ void hl_context_projection_backward_data(real* out_grad,
 }
 
 template <>
-void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                     GpuMatrix& in_grad,
                                                     const GpuIVector& sequence,
                                                     size_t context_length,
@@ -231,7 +232,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
 }
 
 template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+__global__ void KeContextProjectionBackwardWeight(const real* out_grad,
                                                   const int* sequence,
                                                   real* w_grad,
                                                   int num_sequences,
@@ -254,7 +255,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
     for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
       int seq_start = sequence[seqId];
       int seq_end = sequence[seqId+1];
-      output_r = out_grad + seq_start * w_dim * context_length;
+      output_r = const_cast<real*>(out_grad)
+                    + seq_start * w_dim * context_length;
 
       if (context_start < 0) {
         if (padId + context_start < 0) {
@@ -318,7 +320,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad,
  * beginning.
  *
  */
-void hl_context_projection_backward_weight(real* out_grad,
+void hl_context_projection_backward_weight(const real* out_grad,
                                            const int* sequence,
                                            real* w_grad,
                                            size_t num_sequences,
@@ -346,7 +348,7 @@ void hl_context_projection_backward_weight(real* out_grad,
 
 template <>
 void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
-        GpuMatrix& out_grad,
+        const GpuMatrix& out_grad,
         GpuMatrix& w_grad,
         const GpuIVector& seq_vec,
         size_t context_length,
@@ -365,7 +367,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
 }
 
 template <>
-void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad,
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad,
                                                 GpuMatrix& in_grad,
                                                 GpuMatrix& w_grad,
                                                 const GpuIVector& sequence,
diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp
index 6223d2fd23ac3bbb4fbcf51d37d22feaf3b1330b..c9db2ff8008e0bb0fa04370fb7b3ecd7641d2062 100644
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
@@ -56,22 +56,25 @@ void testMatrixProjectionForward(int context_start,
   cpu_out.randomizeUniform();
   gpu_out.copyFrom(cpu_out);
 
-  compare.getCpuFunction()->calc(
-      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
-  compare.getGpuFunction()->calc(
-      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
-      {});
+  BufferArgs cpu_inputs;
+  BufferArgs cpu_outputs;
+  cpu_inputs.addArg(cpu_in, *cpu_seq);
+  if (cpu_weight) {
+    cpu_inputs.addArg(*cpu_weight, *cpu_seq);
+  }
+  cpu_outputs.addArg(cpu_out, *cpu_seq, ADD_TO);
+
+  compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs);
+
+  BufferArgs gpu_inputs;
+  BufferArgs gpu_outputs;
+  gpu_inputs.addArg(gpu_in, *gpu_seq);
+  if (gpu_weight) {
+    gpu_inputs.addArg(*gpu_weight, *gpu_seq);
+  }
+  gpu_outputs.addArg(gpu_out, *gpu_seq, ADD_TO);
+
+  compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs);
 
   autotest::TensorCheckEqual(cpu_out, gpu_out);
 }
@@ -117,25 +120,23 @@ void testMatrixProjectionBackward(int context_start,
     gpu_w_grad->copyFrom(*cpu_w_grad);
   }
 
-  compare.getCpuFunction()->calc(
-      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
-              Dims{cpu_seq->getSize()})},
-      {Tensor(cpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
-
-  compare.getGpuFunction()->calc(
-      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
-       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
-              Dims{pad, input_dim}),
-       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
-              Dims{gpu_seq->getSize()})},
-      {Tensor(gpu_out_grad.getData(),
-              Dims{batch_size, input_dim * context_length})},
-      {});
+  BufferArgs cpu_inputs;
+  BufferArgs cpu_outputs;
+  cpu_inputs.addArg(cpu_out_grad, *cpu_seq);
+  cpu_outputs.addArg(cpu_in_grad, *cpu_seq, ADD_TO);
+  cpu_outputs.addArg(
+      cpu_w_grad ? *cpu_w_grad : CpuMatrix(nullptr, 0, input_dim), ADD_TO);
+
+  compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs);
+
+  BufferArgs gpu_inputs;
+  BufferArgs gpu_outputs;
+  gpu_inputs.addArg(gpu_out_grad, *gpu_seq);
+  gpu_outputs.addArg(gpu_in_grad, *gpu_seq, ADD_TO);
+  gpu_outputs.addArg(
+      gpu_w_grad ? *gpu_w_grad : GpuMatrix(nullptr, 0, input_dim), ADD_TO);
+
+  compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs);
 
   autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
   if (is_padding) {
diff --git a/paddle/function/CrossMapNormalOp.cpp b/paddle/function/CrossMapNormalOp.cpp
index 92980c503fdaaaa9ac600070197dba6ba4bfb7a4..8e7dc72524a7680a03ea6eb4770a3e25c09ad913 100644
--- a/paddle/function/CrossMapNormalOp.cpp
+++ b/paddle/function/CrossMapNormalOp.cpp
@@ -188,8 +188,13 @@ public:
     CHECK(inputs[0].shape() == inputs[3].shape());
     CHECK(inputs[0].shape() == outputs[0].shape());
 
-    // TODO(hedaoyuan): need support ASSIGN_TO mode.
-    CHECK_EQ(outputs[0].getArgType(), ADD_TO);
+    if (outputs[0].getArgType() != ADD_TO) {
+      // Currently, some algorithm implementations are ASSIGN_TO mode,
+      // if need to support the ADD_TO calculation, need to clear the output.
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
 
     size_t samples = inputs[0].shape()[0];
     size_t channels = inputs[0].shape()[1];
diff --git a/paddle/function/CrossMapNormalOpTest.cpp b/paddle/function/CrossMapNormalOpTest.cpp
index d65d9310affd7c9b7fee3118c79449870849c243..51f5da81bfc9ae870ac9949ba74da01a9449a04d 100644
--- a/paddle/function/CrossMapNormalOpTest.cpp
+++ b/paddle/function/CrossMapNormalOpTest.cpp
@@ -27,15 +27,19 @@ TEST(CrossMapNormal, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormal",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims), Tensor(nullptr, dims)},
-                               {});
+            // init Test object
+            FunctionCompare test("CrossMapNormal",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            // prepare input arguments
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
@@ -53,18 +57,19 @@ TEST(CrossMapNormalGrad, real) {
                     << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW
                     << " size=" << size;
 
-            FunctionCompare compare("CrossMapNormalGrad",
-                                    FuncConfig()
-                                        .set("size", size)
-                                        .set("scale", (real)1.5)
-                                        .set("pow", (real)0.5));
-            Dims dims{numSamples, channels, imgSizeH, imgSizeW};
-            compare.cmpWithArg({Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims),
-                                Tensor(nullptr, dims)},
-                               {Tensor(nullptr, dims)},
-                               {});
+            FunctionCompare test("CrossMapNormalGrad",
+                                 FuncConfig()
+                                     .set("size", size)
+                                     .set("scale", (real)1.5)
+                                     .set("pow", (real)0.5));
+            TensorShape shape{numSamples, channels, imgSizeH, imgSizeW};
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape));
+            // run Function
+            test.run();
           }
         }
       }
diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp
index dbe3a4e9f608df6333a5637f2d962a555b04d7c3..f47d55a4ade97d76e0f1940a2234e34e20efade6 100644
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -79,15 +79,25 @@ FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
 void BufferArgs::addArg(const Matrix& arg,
                         const TensorShape& shape,
                         ArgType argType) {
-  args_.push_back(std::make_shared<BufferArg>(arg, shape, argType));
+  _args_.push_back(new BufferArg(arg, shape, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const CpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
 }
 
 void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) {
-  args_.push_back(std::make_shared<SparseMatrixArg>(arg, argType));
+  _args_.push_back(new SparseMatrixArg(arg, argType));
+  addArg(*_args_.back());
+}
+
+void BufferArgs::addArg(const Matrix& matrix,
+                        const IVector& vector,
+                        ArgType argType) {
+  _args_.push_back(new SequenceArg(matrix, vector, argType));
+  addArg(*_args_.back());
 }
 
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
diff --git a/paddle/function/Function.h b/paddle/function/Function.h
index 249f8f9cfad58bf596e8cdce9188409b5690f969..9215c137eb8e85a9a03575104d7f89bbce441eba 100644
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -50,19 +50,44 @@ protected:
  * Argument type for Function::calc().
  * A BufferArgs contains a set of BufferArg,
  * because Function can have multiple inputs and outputs.
+ *
+ * addArg() with a Matrix object is used to adapt a Layer Argument.
+ * It creates a BufferArg object in addArg(),
+ * and frees it in the destructor of BufferArgs.
+ *
+ * addArg() with a BufferArg object just saves the BufferArg object's address,
+ * and the caller needs to guarantee the validity of the BufferArg object
+ * during the lifetime of the BufferArgs.
  */
 class BufferArgs {
 public:
   BufferArgs() {}
+
+  ~BufferArgs() {
+    for (auto arg : _args_) {
+      delete arg;
+    }
+  }
+
   size_t size() const { return args_.size(); }
 
   // add argument into BufferArgs
   // Tensor can be Matrix, Vector, IVector.
   // For inputs, do not need argType.
   // For outputs, the argType needs to be specified as ASSIGN_TO or ADD_TO.
-  template <typename Tensor>
-  void addArg(const Tensor& arg, ArgType argType = UNSPECIFIED) {
-    args_.push_back(std::make_shared<BufferArg>(arg, argType));
+  void addArg(const Matrix& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const Vector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
+  }
+
+  void addArg(const IVector& arg, ArgType argType = UNSPECIFIED) {
+    _args_.push_back(new BufferArg(arg, argType));
+    addArg(*_args_.back());
   }
 
   // Add arg into BufferArgs and reshape the arg.
@@ -77,20 +102,37 @@ public:
   void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
   void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED);
 
+  void addArg(const Matrix& matrix,
+              const IVector& vector,
+              ArgType argType = UNSPECIFIED);
+
   // get argument
   const BufferArg& operator[](size_t num) const {
     CHECK_LT(num, args_.size());
     return *args_[num];
   }
 
+  void addArg(BufferArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceIdArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SequenceArg& arg) { args_.push_back(&arg); }
+
+  void addArg(SparseMatrixArg& arg) { args_.push_back(&arg); }
+
 private:
-  std::vector<BufferArgPtr> args_;
+  std::vector<BufferArg*> args_;
+  // The BufferArg object is constructed and freed by BufferArgs.
+  std::vector<BufferArg*> _args_;
 };
 
 /**
  * \brief Base class for Function.
  * The basic Function implementation requires override init and calc interfaces.
  *
+ * The caller needs to ensure the validity of the arguments
+ * during Function execution.
+ *
  * Function inputs are readonly, Function outputs have two modes: ASSIGN_TO
  * and ADD_TO.
  * If output.getArgType() == ASSIGN_TO, this is assign mode, and the calculation
diff --git a/paddle/function/FunctionTest.cpp b/paddle/function/FunctionTest.cpp
index 7ce908320a6f6f764e8fdacc96432aca78d7b2df..fdf7e631e5ab8c67eb5cf906bd0af49740d60112 100644
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "Function.h"
 #include <gtest/gtest.h>
+#include "paddle/math/SparseMatrix.h"
 
 namespace paddle {
 
@@ -56,4 +57,110 @@ TEST(Function, BufferArgs) {
   Function<DEVICE_TYPE_GPU>(gpuArgments);
 }
 
+/**
+ * These test cases check the consistency between the BufferArg type
+ * argument received by Function and the original type argument.
+ *
+ * Use Case:
+ *  TEST() {
+ *    Matrix matrix(...);
+ *    CheckBufferArg lambda = [=](const BufferArg& arg) {
+ *      // check matrix and arg are equivalent
+ *      EXPECT_EQ(matrix, arg);
+ *    }
+ *
+ *   BufferArgs argments{matrix...};
+ *   std::vector<CheckBufferArg> checkFunc{lambda...};
+ *   testBufferArgs(argments, checkFunc);
+ *  }
+ */
+typedef std::function<void(const BufferArg&)> CheckBufferArg;
+
+void testBufferArgs(const BufferArgs& inputs,
+                    const std::vector<CheckBufferArg>& check) {
+  EXPECT_EQ(inputs.size(), check.size());
+  for (size_t i = 0; i < inputs.size(); i++) {
+    check[i](inputs[i]);
+  }
+}
+
+void testBufferArgs(const BufferArgs& inputs, const CheckBufferArg& check) {
+  EXPECT_EQ(inputs.size(), 1U);
+  check(inputs[0]);
+}
+
+TEST(Arguments, Matrix) {
+  MatrixPtr matrix = Matrix::create(100, 200);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2U);
+    EXPECT_EQ(arg.shape()[0], 100U);
+    EXPECT_EQ(arg.shape()[1], 200U);
+    EXPECT_EQ(arg.data(), matrix->getData());
+
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getWidth(), matrix->getWidth());
+    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getData(), matrix->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*matrix);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, Vector) {
+  VectorPtr vector = Vector::create(100, false);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 1U);
+    EXPECT_EQ(arg.shape()[0], 100U);
+    EXPECT_EQ(arg.data(), vector->getData());
+
+    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
+    EXPECT_EQ(inVector.getSize(), vector->getSize());
+    EXPECT_EQ(inVector.getData(), vector->getData());
+  };
+
+  BufferArgs argments;
+  argments.addArg(*vector);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, CpuSparseMatrix) {
+  CpuSparseMatrix sparse(200, 300, 50);
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 2U);
+    EXPECT_EQ(arg.shape()[0], 200U);
+    EXPECT_EQ(arg.shape()[1], 300U);
+    EXPECT_EQ(arg.data(), sparse.getData());
+    // CHECK_EQ(arg.sparse().nnz(), 50);
+    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
+    // CHECK_EQ(arg.sparse().dataType(), SPARSE_FLOAT_VALUE);
+    EXPECT_EQ(arg.sparse().getRowBuf(), sparse.getRows());
+    EXPECT_EQ(arg.sparse().getColBuf(), sparse.getCols());
+  };
+
+  BufferArgs argments;
+  argments.addArg(sparse);
+  std::vector<CheckBufferArg> checkFunc;
+  checkFunc.push_back(check);
+  testBufferArgs(argments, checkFunc);
+}
+
+TEST(Arguments, BufferArg) {
+  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
+  CheckBufferArg check = [=](const BufferArg& arg) {
+    EXPECT_EQ(arg.shape().ndims(), 3U);
+    EXPECT_EQ(arg.shape()[0], 1U);
+    EXPECT_EQ(arg.shape()[1], 2U);
+    EXPECT_EQ(arg.shape()[2], 3U);
+  };
+
+  BufferArgs argments;
+  argments.addArg(arg);
+  testBufferArgs(argments, check);
+}
+
 }  // namespace paddle
diff --git a/paddle/function/FunctionTest.h b/paddle/function/FunctionTest.h
index 32131037f6de4a9f7a3ebf8f5773eccd65dc2cdb..24e7a36a43cfa8941535cb778aa1557ec5a0a6f4 100644
--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -15,95 +15,186 @@ limitations under the License. */
 #include "Function.h"
 #include "paddle/math/Vector.h"
 #include "paddle/math/tests/TensorCheck.h"
+#include "paddle/testing/TestUtil.h"
 
 namespace paddle {
 
+typedef std::shared_ptr<BufferArg> BufferArgPtr;
+
+/**
+ * \brief A class for comparing CPU and GPU implementations of Function.
+ *
+ *
+ * Use case:
+ *  // Initializes a test object, the corresponding cpu and gpu Function
+ *  // are constructed according to FunctionName and FuncConfig.
+ *  FunctionCompare test(FunctionName, FuncConfig);
+ *  // Prepare inputs and outputs arguments.
+ *  // The inputs and outputs here do not need to contain real data;
+ *  // they only describe the argument type and shape.
+ *  test.addInputs(input1);
+ *  test.addInputs(input2);
+ *  test.addOutputs(output1);
+ *  test.addOutputs(output2);
+ *  // Run.
+ *  // Based on the type and shape of the added arguments (inputs/outputs),
+ *  // run() automatically initializes the arguments required by the cpu and
+ *  // gpu functions (cpuInputs_/cpuOutputs_/gpuInputs_/gpuOutputs_),
+ *  // calls the CPU and GPU Function to compute the results,
+ *  // and compares the CPU and GPU results for consistency.
+ *  test.run();
+ */
 class FunctionCompare {
 public:
   FunctionCompare(const std::string& name, const FuncConfig& config)
-      : cpu(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
-        gpu(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
-    cpu->init(config);
-    gpu->init(config);
+      : cpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-CPU")),
+        gpuFunc_(FunctionBase::funcRegistrar_.createByType(name + "-GPU")) {
+    cpuFunc_->init(config);
+    gpuFunc_->init(config);
+  }
+
+  ~FunctionCompare() {}
+
+  // The input only needs to carry a shape; it does not need to contain data.
+  void addInputs(const BufferArg& input) {
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        cpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+    gpuInputs_.emplace_back(std::make_shared<BufferArg>(
+        gpuMemory_.back()->getBuf(), input.valueType(), input.shape()));
+  }
+
+  // The output only needs to carry a shape; it does not need to contain data.
+  void addOutputs(const BufferArg& output) {
+    size_t size =
+        output.shape().getElements() * sizeOfValuType(output.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    cpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(cpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
+    gpuOutputs_.emplace_back(
+        std::make_shared<BufferArg>(gpuMemory_.back()->getBuf(),
+                                    output.valueType(),
+                                    output.shape(),
+                                    ASSIGN_TO));
   }
 
-  void cmpWithArg(const Arguments& inputs,
-                  const Arguments& outputs,
-                  const Arguments& inouts) {
-    // init cpu and gpu arguments
-    auto initArgs = [=](
-        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (const auto arg : inArgs) {
-        size_t size = sizeof(real);
-        for (const auto dim : arg.dims_) {
-          size *= dim;
-        }
-        if (arg.getData()) {
-          // todo(tianbing), waste unnecessary mem here
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-          // already init outside
-        } else {
-          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-          cpuArgs.emplace_back(
-              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-          gpuArgs.emplace_back(
-              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
-          // will use an api to refactor this code.
-          CpuVector cpuVector(size / sizeof(real),
-                              (real*)cpuArgs.back().getData());
-          GpuVector gpuVector(size / sizeof(real),
-                              (real*)gpuArgs.back().getData());
-          cpuVector.uniform(0.001, 1);
-          gpuVector.copyFrom(cpuVector);
-        }
+  void addInputs(const SequenceArg& input) {
+    size_t batchSize = input.shape()[0];
+    size_t numSeqs = batchSize / 10 + 1;
+
+    size_t sizeId = (numSeqs + 1) * sizeOfValuType(VALUE_TYPE_INT32);
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(sizeId));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(sizeId));
+
+    TensorShape seqsId({numSeqs + 1});
+    // void* cpuBuffer = cpuMemory_.back()->getBuf();
+    // void* gpuBuffer = gpuMemory_.back()->getBuf();
+
+    size_t size =
+        input.shape().getElements() * sizeOfValuType(input.valueType());
+    cpuMemory_.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+    gpuMemory_.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+
+    // TODO: needs to be implemented.
+  }
+
+  void run() {
+    // prepare cpu/gpu arguments
+    initInputs();
+
+    // function calculate
+    auto callFunction = [](FunctionBase* function,
+                           std::vector<BufferArgPtr>& inputs,
+                           std::vector<BufferArgPtr>& outputs) {
+      BufferArgs inArgs;
+      BufferArgs outArgs;
+      for (auto arg : inputs) {
+        inArgs.addArg(*arg);
       }
+      for (auto arg : outputs) {
+        outArgs.addArg(*arg);
+      }
+      function->calc(inArgs, outArgs);
     };
-    initArgs(cpuInputs, gpuInputs, inputs);
-    initArgs(cpuOutputs, gpuOutputs, outputs);
-    initArgs(cpuInouts, gpuInouts, inouts);
 
-    // function calculate
-    cpu->calc(cpuInputs, cpuOutputs, cpuInouts);
-    gpu->calc(gpuInputs, gpuOutputs, gpuInouts);
+    callFunction(cpuFunc_.get(), cpuInputs_, cpuOutputs_);
+    callFunction(gpuFunc_.get(), gpuInputs_, gpuOutputs_);
 
     // check outputs and inouts
-    auto checkArgs = [=](const Arguments& cpuArgs, const Arguments& gpuArgs) {
-      for (size_t i = 0; i < cpuArgs.size(); i++) {
-        auto cpu = cpuArgs[i];
-        auto gpu = gpuArgs[i];
-        size_t size = 1;
-        for (auto dim : cpu.dims_) {
-          size *= dim;
-        }
-        CpuVector cpuVector(size, (real*)cpu.getData());
-        GpuVector gpuVector(size, (real*)gpu.getData());
-
-        autotest::TensorCheckErr(cpuVector, gpuVector);
-      }
-    };
-    checkArgs(cpuOutputs, gpuOutputs);
-    checkArgs(cpuInouts, gpuInouts);
+    compareOutputs();
+  }
+
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpuFunc_; }
+
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpuFunc_; }
+
+protected:
+  void initInputs() {
+    for (size_t i = 0; i < cpuInputs_.size(); i++) {
+      initArg(*cpuInputs_[i]);
+
+      // TODO: Need a BufferCopy used to copy from one BufferArg to another.
+      CpuVector cpuVector(cpuInputs_[i]->shape().getElements(),
+                          (real*)cpuInputs_[i]->data());
+      GpuVector gpuVector(gpuInputs_[i]->shape().getElements(),
+                          (real*)gpuInputs_[i]->data());
+
+      gpuVector.copyFrom(cpuVector);
+    }
+  }
+
+  void compareOutputs() {
+    for (size_t i = 0; i < cpuOutputs_.size(); i++) {
+      // TODO, Need a BufferCheck used to compare the two buffers.
+      auto cpu = cpuOutputs_[i];
+      auto gpu = gpuOutputs_[i];
+      CpuVector cpuVector(cpu->shape().getElements(), (real*)cpu->data());
+      GpuVector gpuVector(cpu->shape().getElements(), (real*)gpu->data());
+
+      autotest::TensorCheckErr(cpuVector, gpuVector);
+    }
   }
 
-  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+  // Only initializes the cpu argument; the gpu argument is copied from it.
+  void initArg(BufferArg& arg) {
+    CpuVector vector(arg.shape().getElements(), (real*)arg.data());
+    vector.uniform(0.001, 1);
+  }
 
-  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
+  void initArg(SequenceIdArg& arg, size_t batchSize) {
+    size_t numSeqs = arg.numSeqs();
+    int* buf = reinterpret_cast<int*>(arg.data());
+    int pos = 0;
+    size_t maxLen = 2 * batchSize / numSeqs;
+    for (int i = 0; i < (int)numSeqs; ++i) {
+      int len = uniformRandom(
+                    std::min<int64_t>(maxLen, batchSize - pos - numSeqs + i)) +
+                1;
+      buf[i] = pos;
+      pos += len;
+      VLOG(1) << " len=" << len;
+    }
+    buf[numSeqs] = batchSize;
+  }
 
 protected:
-  std::shared_ptr<FunctionBase> cpu;
-  std::shared_ptr<FunctionBase> gpu;
-  std::vector<CpuMemHandlePtr> cpuMemory;
-  std::vector<GpuMemHandlePtr> gpuMemory;
-  Arguments cpuInputs;
-  Arguments cpuOutputs;
-  Arguments cpuInouts;
-  Arguments gpuInputs;
-  Arguments gpuOutputs;
-  Arguments gpuInouts;
+  std::shared_ptr<FunctionBase> cpuFunc_;
+  std::shared_ptr<FunctionBase> gpuFunc_;
+  std::vector<CpuMemHandlePtr> cpuMemory_;
+  std::vector<GpuMemHandlePtr> gpuMemory_;
+  std::vector<BufferArgPtr> cpuInputs_;
+  std::vector<BufferArgPtr> cpuOutputs_;
+  std::vector<BufferArgPtr> gpuInputs_;
+  std::vector<BufferArgPtr> gpuOutputs_;
 };
 
 }  // namespace paddle
diff --git a/paddle/function/PadOp.cpp b/paddle/function/PadOp.cpp
index ff13f27e94217b43067391afe53b49cc8f752fbe..75e64a8ee415135040d204f270a4fc261c176b79 100644
--- a/paddle/function/PadOp.cpp
+++ b/paddle/function/PadOp.cpp
@@ -89,20 +89,21 @@ public:
    * \param inputs[0] input value.
    * \param outputs[0] output value.
    */
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1UL, inputs.size());
     CHECK_EQ(1UL, outputs.size());
-    CHECK_EQ(0UL, inouts.size());
-
-    size_t num = inputs[0].dims_[0];
-    size_t inC = inputs[0].dims_[1];
-    size_t inH = inputs[0].dims_[2];
-    size_t inW = inputs[0].dims_[3];
-
-    Pad<Device>(outputs[0].getData(),
-                inputs[0].getData(),
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+
+    size_t num = inputs[0].shape()[0];
+    size_t inC = inputs[0].shape()[1];
+    size_t inH = inputs[0].shape()[2];
+    size_t inW = inputs[0].shape()[3];
+    typename Tensor<real, Device>::Vector vec(outputs[0].shape().getElements(),
+                                              outputs[0].data<real>());
+    vec.zero();
+
+    Pad<Device>(outputs[0].data<real>(),
+                inputs[0].data<real>(),
                 num,
                 inC,
                 inH,
@@ -140,21 +141,25 @@ public:
    * \param inputs[0] output grad.
    * \param inouts[0] input grad.
    */
-  void calc(const Arguments& inputs,
-            const Arguments& outputs,
-            const Arguments& inouts) override {
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
     CHECK_EQ(1UL, inputs.size());
-    CHECK_EQ(0UL, outputs.size());
-    CHECK_EQ(1UL, inouts.size());
+    CHECK_EQ(1UL, outputs.size());
 
-    size_t n = inouts[0].dims_[0];
-    size_t inC = inouts[0].dims_[1];
-    size_t inH = inouts[0].dims_[2];
-    size_t inW = inouts[0].dims_[3];
+    size_t num = outputs[0].shape()[0];
+    size_t inC = outputs[0].shape()[1];
+    size_t inH = outputs[0].shape()[2];
+    size_t inW = outputs[0].shape()[3];
+
+    if (outputs[0].getArgType() != ADD_TO) {
+      // for unit test
+      typename Tensor<real, Device>::Vector tmp(
+          outputs[0].shape().getElements(), outputs[0].data<real>());
+      tmp.zero();
+    }
 
-    PadGrad<Device>(inouts[0].getData(),
-                    inputs[0].getData(),
-                    n,
+    PadGrad<Device>(outputs[0].data<real>(),
+                    inputs[0].data<real>(),
+                    num,
                     inC,
                     inH,
                     inW,
diff --git a/paddle/function/PadOpTest.cpp b/paddle/function/PadOpTest.cpp
index be989807d13688c04602884a1c9f9cc663c252e5..dce2bac3e9a6ec3a6d01077fca3d69fe4553a592 100644
--- a/paddle/function/PadOpTest.cpp
+++ b/paddle/function/PadOpTest.cpp
@@ -33,10 +33,12 @@ TEST(Pad, real) {
                                       .set("padh1", 2)
                                       .set("padw0", 3)
                                       .set("padw1", 2));
-          Dims inDims{numSamples, channels, imgSizeH, imgSizeW};
-          Dims outDims{numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
-          compare.cmpWithArg(
-              {Tensor(nullptr, inDims)}, {Tensor(nullptr, outDims)}, {});
+          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+          TensorShape outDims{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, inDims));
+          compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, outDims, ASSIGN_TO));
+          compare.run();
         }
       }
     }
@@ -50,7 +52,6 @@ TEST(PadGrad, real) {
         for (size_t imgSizeW : {5, 32, 96}) {
           VLOG(3) << " numSamples=" << numSamples << " channels=" << channels
                   << " imgSizeH=" << imgSizeH << " imgSizeW=" << imgSizeW;
-
           FunctionCompare compare("PadGrad",
                                   FuncConfig()
                                       .set("padc0", 2)
@@ -59,10 +60,12 @@ TEST(PadGrad, real) {
                                       .set("padh1", 2)
                                       .set("padw0", 3)
                                       .set("padw1", 2));
-          Dims inDims{numSamples, channels, imgSizeH, imgSizeW};
-          Dims outDims{numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
-          compare.cmpWithArg(
-              {Tensor(nullptr, outDims)}, {}, {Tensor(nullptr, inDims)});
+          TensorShape inDims{numSamples, channels, imgSizeH, imgSizeW};
+          TensorShape outDims{
+              numSamples, channels + 5, imgSizeH + 3, imgSizeW + 5};
+          compare.addInputs(BufferArg(VALUE_TYPE_FLOAT, outDims));
+          compare.addOutputs(BufferArg(VALUE_TYPE_FLOAT, inDims, ASSIGN_TO));
+          compare.run();
         }
       }
     }
diff --git a/paddle/function/TensorShape.h b/paddle/function/TensorShape.h
index e491e3f1d6b26e14a5273b3b5a38aec941f5a9e5..cda58f19dfa4a8b80efc97570c83ca38fd7adf27 100644
--- a/paddle/function/TensorShape.h
+++ b/paddle/function/TensorShape.h
@@ -55,6 +55,15 @@ public:
     numElements();
   }
 
+  // Replaces the shape with `dims`. dims_ is padded with 1s up to kMinDims
+  // entries, matching the layout that initDims() sets up.
+  void reshape(std::initializer_list<size_t> dims) {
+    ndims_ = dims.size();
+    dims_.assign(dims);
+    if (dims_.size() < kMinDims) dims_.resize(kMinDims, 1);
+    numElements();
+  }
+
   // number of dimensions of the tensor
   size_t ndims() const { return ndims_; }
 
@@ -82,7 +91,7 @@ private:
 
   // init dims_
   void initDims(size_t ndims) {
-    size_t count = ndims < 4 ? 4 : ndims;
+    size_t count = ndims < kMinDims ? kMinDims : ndims;
     dims_.assign(count, 1);
   }
 
@@ -92,6 +101,7 @@ private:
   // number of elements
   size_t nelements_;
   std::vector<size_t> dims_;
+  static const size_t kMinDims = 4;
 };
 
 }  // namespace paddle
diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp
index ebcc87cbf48a3c34a4e625e67f872fed69cdf44f..d7042af1c25e7432e5b1efbb89cd8fd3f63fb4ae 100644
--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -118,16 +118,15 @@ void ContextProjection::forward() {
   /// first use state_, otherwise use weight_(padding false === w nullptr)
   auto w_ptr =
       state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-  auto start_pos = in_->sequenceStartPositions;
-
+  const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_);
   BufferArgs inputs;
   BufferArgs outputs;
-  inputs.addArg(*in_->value);
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                          w_ptr ? w_ptr->getHeight() : 0,
-                          input_dim));
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->value, ADD_TO);
+  inputs.addArg(*in_->value, *start_pos);
+  if (w_ptr) {
+    inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim),
+                  *start_pos);
+  }
+  outputs.addArg(*out_->value, *start_pos, ADD_TO);
   forward_[0]->calc(inputs, outputs);
 
   if (state_ && config_.context_start() < 0) {
@@ -166,13 +165,16 @@ void ContextProjection::backward(const UpdateCallback& callback) {
 
   BufferArgs inputs;
   BufferArgs outputs;
-  inputs.addArg(CpuMatrix(
-      in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim));
-  inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
-                          w_ptr ? w_ptr->getHeight() : 0,
-                          input_dim));
-  inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_));
-  outputs.addArg(*out_->grad, ADD_TO);
+  inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_));
+  outputs.addArg(
+      CpuMatrix(
+          in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim),
+      *in_->sequenceStartPositions->getVector(useGpu_),
+      ADD_TO);
+  outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr,
+                           w_ptr ? w_ptr->getHeight() : 0,
+                           input_dim),
+                 ADD_TO);
   backward_[0]->calc(inputs, outputs);
 
   if (config_.trainable_padding()) {
diff --git a/paddle/gserver/layers/PadLayer.cpp b/paddle/gserver/layers/PadLayer.cpp
index 9069d382ef9273faf71892274c6b09698b19d586..a2a469ff92fb2a5541e8123519def289148855ee 100644
--- a/paddle/gserver/layers/PadLayer.cpp
+++ b/paddle/gserver/layers/PadLayer.cpp
@@ -27,11 +27,11 @@ bool PadLayer::init(const LayerMap& layerMap,
   auto& pad_conf = config_.inputs(0).pad_conf();
   auto& img_conf = pad_conf.image_conf();
   CHECK_EQ(config_.inputs_size(), 1);
-  inDims_.push_back(0);
-  inDims_.push_back(img_conf.channels());
-  inDims_.push_back(img_conf.has_img_size_y() ? img_conf.img_size_y()
-                                              : img_conf.img_size());
-  inDims_.push_back(img_conf.img_size());
+  inDims_ = TensorShape(
+      {0,
+       img_conf.channels(),
+       img_conf.has_img_size_y() ? img_conf.img_size_y() : img_conf.img_size(),
+       img_conf.img_size()});
 
   CHECK_EQ(2, pad_conf.pad_c_size());
   CHECK_EQ(2, pad_conf.pad_h_size());
@@ -43,7 +43,7 @@ bool PadLayer::init(const LayerMap& layerMap,
   padw_.push_back(pad_conf.pad_w(0));
   padw_.push_back(pad_conf.pad_w(1));
 
-  outDims_.resize(4);
+  outDims_ = TensorShape(4);
   setOutDims(0);
 
   createFunction(forward_,
@@ -68,20 +68,20 @@ bool PadLayer::init(const LayerMap& layerMap,
   return true;
 }
 
-void PadLayer::setOutDims(int batchSize) {
-  outDims_[0] = batchSize;
-  outDims_[1] = inDims_[1] + padc_[0] + padc_[1];
-  outDims_[2] = inDims_[2] + padh_[0] + padh_[1];
-  outDims_[3] = inDims_[3] + padw_[0] + padw_[1];
+void PadLayer::setOutDims(const size_t batchSize) {
+  outDims_.reshape({batchSize,
+                    inDims_[1] + padc_[0] + padc_[1],
+                    inDims_[2] + padh_[0] + padh_[1],
+                    inDims_[3] + padw_[0] + padw_[1]});
 }
 
-void PadLayer::setTensorDim(int batchSize) {
+void PadLayer::setTensorDim(const size_t batchSize) {
   CHECK_EQ(static_cast<int>(inputLayers_.size()), 1);
-  inDims_[0] = batchSize;
+  inDims_.setDim(0, batchSize);
   int h = inputLayers_[0]->getOutput().getFrameHeight();
-  if (h != 0) inDims_[2];
+  if (h != 0) inDims_.setDim(2, h);
   int w = inputLayers_[0]->getOutput().getFrameWidth();
-  if (w != 0) inDims_[3];
+  if (w != 0) inDims_.setDim(3, w);
   setOutDims(batchSize);
 }
 
@@ -94,22 +94,22 @@ void PadLayer::forward(PassType passType) {
   resetOutput(batchSize, size);
   MatrixPtr outV = getOutputValue();
   REGISTER_TIMER_INFO("PadForward", getName().c_str());
-  forward_[0]->calc({Tensor(input->getData(), inDims_)},
-                    {Tensor(outV->getData(), outDims_)},
-                    {});
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getInputValue(0), inDims_);
+  outputs.addArg(*getOutputValue(), outDims_, ASSIGN_TO);
+  forward_[0]->calc(inputs, outputs);
 }
 
 void PadLayer::backward(const UpdateCallback& callback) {
   (void)callback;
-
-  MatrixPtr preGrad = inputLayers_[0]->getOutputGrad();
-  if (NULL == preGrad) {
-    return;
-  }
-  MatrixPtr outGrad = getOutputGrad();
   REGISTER_TIMER_INFO("PadBackward", getName().c_str());
-  backward_[0]->calc({Tensor(outGrad->getData(), outDims_)},
-                     {},
-                     {Tensor(preGrad->getData(), inDims_)});
+
+  BufferArgs inputs;
+  BufferArgs outputs;
+  inputs.addArg(*getOutputGrad(), outDims_);
+  outputs.addArg(*getInputGrad(0), inDims_, ADD_TO);
+  backward_[0]->calc(inputs, outputs);
 }
 }  // namespace paddle
diff --git a/paddle/gserver/layers/PadLayer.h b/paddle/gserver/layers/PadLayer.h
index 834622a7af396e870865667b6672d896cd1627d7..1f5a4a54a0a07ebfa0cef64a5eecb2c95d0bccc2 100644
--- a/paddle/gserver/layers/PadLayer.h
+++ b/paddle/gserver/layers/PadLayer.h
@@ -33,13 +33,13 @@ public:
   void backward(const UpdateCallback& callback = nullptr);
 
 protected:
-  void setOutDims(int batchSize);
-  void setTensorDim(int batchSize);
+  void setOutDims(const size_t batchSize);
+  void setTensorDim(const size_t batchSize);
 
   std::vector<int> padc_;
   std::vector<int> padh_;
   std::vector<int> padw_;
-  Dims inDims_;
-  Dims outDims_;
+  TensorShape inDims_;
+  TensorShape outDims_;
 };
 }  // namespace paddle
diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py
index 981d10afda2671be9e8f0da1a4bee755f7aa9d61..21d1cb75f4d40e6ed011b33c6366c9d31c0fcc7c 100644
--- a/paddle/py_paddle/dataprovider_converter.py
+++ b/paddle/py_paddle/dataprovider_converter.py
@@ -34,6 +34,10 @@ class IScanner(object):
 
 
 class DenseScanner(IScanner):
+    """
+    :type __mat__: numpy.ndarray
+    """
+
     def __init__(self, input_type, pos):
         IScanner.__init__(self, input_type, pos)
         self.__mat__ = None
@@ -47,6 +51,8 @@ class DenseScanner(IScanner):
     def finish_scan(self, argument):
         assert isinstance(argument, swig_paddle.Arguments)
         assert isinstance(self.input_type, dp2.InputType)
+        if self.__mat__.dtype != numpy.float32:
+            self.__mat__ = self.__mat__.astype(numpy.float32)
         m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False)
         argument.setSlotValue(self.pos, m)
 
diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh
index 7036f971fdd7bac68b67c7b5a92e50c352e214c1..80f031a74e7052d183b5ef21d432476ff1cce722 100755
--- a/paddle/scripts/travis/before_install.osx.sh
+++ b/paddle/scripts/travis/before_install.osx.sh
@@ -1,6 +1,4 @@
 #!/bin/bash
 brew update
 brew tap homebrew/science
-brew install python
-sudo pip install --upgrade protobuf
-brew install swig openblas md5sha1sum protobuf
+brew install openblas swig md5sha1sum
diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh
index fd3aeb02b21d659f783702905117fc838b93eafd..5e6350b57458594163f23cca41a546d7bd9b1eda 100755
--- a/paddle/scripts/travis/build_and_test.sh
+++ b/paddle/scripts/travis/build_and_test.sh
@@ -6,7 +6,7 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then
   export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages
   export PYTHONHOME=/opt/python/2.7.12
   export PATH=/opt/python/2.7.12/bin:${PATH}
-  cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
+  cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS}
-  NRPOC=`nproc`
+  NPROC=`nproc`
   make -j $NPROC
   make coveralls
diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh
index bdafb145bcd4e5990f382bb890f804687c474f7c..6b43cad20b76e9abeb3cb10a726d3d8e3da5f8e2 100755
--- a/paddle/scripts/travis/docs.sh
+++ b/paddle/scripts/travis/docs.sh
@@ -4,7 +4,7 @@
 source ./common.sh
 
 # Compile Documentation only.
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
+cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS}
 make paddle_docs paddle_docs_cn
 
 # check websites for broken links
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index c80ed0274461da2de0617c6c4ee1a1d03df5683d..6701eced60d068312a1a866a6312002f9f5207f7 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -2650,7 +2650,7 @@ class AverageLayer(LayerBase):
 
 @config_layer('cos')
 class CosSimLayer(LayerBase):
-    def __init__(self, name, inputs, cos_scale=5, device=None):
+    def __init__(self, name, inputs, cos_scale=1, device=None):
         super(CosSimLayer, self).__init__(
             name, 'cos', 1, inputs=inputs, device=device)
         config_assert(len(self.inputs) == 2, 'CosSimLayer must have 2 inputs')
diff --git a/python/paddle/trainer_config_helpers/layers.py b/python/paddle/trainer_config_helpers/layers.py
index 3f88727b63066f2ac9167721172cdc42a193d78f..4e4744b518d8590acf43928d0ddebd7aacfdc224 100644
--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -1674,7 +1674,7 @@ def trans_layer(input, name=None, layer_attr=None):
 
 @wrap_name_default()
 @layer_support()
-def cos_sim(a, b, scale=5, size=1, name=None, layer_attr=None):
+def cos_sim(a, b, scale=1, size=1, name=None, layer_attr=None):
     """
     Cosine Similarity Layer. The cosine similarity equation is here.
 
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 403aafabe9143472dd2f0857ecd25f7acf515b6c..93dd7796c246ae81a146759df7e0c19e334375f1 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -9,17 +9,10 @@ add_test(NAME test_reset_hook
         ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
     WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
 
-if (PROTOBUF_3)
-  add_paddle_exe(protobuf_equal
-    ProtobufEqualMain.cpp)
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-    ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
-  )
-else()
-  add_test(NAME test_layerHelpers
-    COMMAND
-    ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
-  )
-endif()
+add_paddle_exe(protobuf_equal
+  ProtobufEqualMain.cpp)
+add_test(NAME test_layerHelpers
+  COMMAND
+  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
+)
diff --git a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
index b30bbb2a4e24d74ebe1d6c8eda8be5aa09217f6d..c1bfdf1b19c61d096c25af061c6fbb3bbfc50265 100644
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_ntm_layers.protostr
@@ -79,7 +79,7 @@ layers {
   inputs {
     input_layer_name: "b"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__cos_sim_1__"
@@ -92,7 +92,7 @@ layers {
   inputs {
     input_layer_name: "c"
   }
-  cos_scale: 5
+  cos_scale: 1
 }
 layers {
   name: "__sum_to_one_norm_layer_0__"
diff --git a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
index a37eb6439e6d2803a417883f0aed2a5d56d059b9..c8a3b190b19148ddb701020f5be55c4c29a17079 100755
--- a/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh
@@ -2,16 +2,18 @@
 cd `dirname $0`
 
 set -e
+PYTHON_EXEC=$1
+COMPARE_PROTO_UTIL=$2
 
 protostr=`dirname $0`/protostr
 
 files=`ls $protostr | grep -v "unittest"`
 
-./generate_protostr.sh $1
+./generate_protostr.sh ${PYTHON_EXEC}
 
 . ./file_list.sh
 
-if [ -z $1 ]; then
+if [ -z ${COMPARE_PROTO_UTIL} ]; then
   for file in $files
   do
       base_protostr=$protostr/$file
@@ -22,20 +24,20 @@ if [ -z $1 ]; then
 else
   for file in ${configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done
 
   for file in ${whole_configs[*]}
   do
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.unittest -u
     fi
-    if ! $1 $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
+    if ! ${COMPARE_PROTO_UTIL} $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest --whole; then
       diff $protostr/$file.protostr $protostr/$file.protostr.non_file_config.unittest -u
     fi
   done