diff --git a/.travis.yml b/.travis.yml index 0705baa1aca8b480b2a774076bd91fb9df401a53..162bebba091d84b295f929527de9804e65df5a65 100644 --- a/.travis.yml +++ b/.travis.yml @@ -25,9 +25,9 @@ addons: packages: - gcc-4.8 - g++-4.8 + - gfortran-4.8 - git - build-essential - - libatlas-base-dev - python - python-pip - python2.7-dev diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 4e1ae7dc81231943c4bf3db4d4ac6f073f4fd1c4..26306f9849100d4463dde267acae5392cc81d7ac 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -16,7 +16,7 @@ set(CBLAS_FOUND OFF) ## Find MKL First. -set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains MKL") +set(MKL_ROOT $ENV{MKLROOT} CACHE PATH "Folder contains MKL") find_path(MKL_INCLUDE_DIR mkl.h PATHS ${MKL_ROOT}/include) diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 0e8c29c831c823f701d8eecd954d3b120085e495..29d17691db9f4575bae4372c61a0e1964e163fc9 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -15,7 +15,6 @@ INCLUDE(cblas) IF(NOT ${CBLAS_FOUND}) - MESSAGE(FATAL_ERROR "Please install OpenBlas, MKL or ATLAS.") INCLUDE(ExternalProject) SET(CBLAS_SOURCES_DIR ${THIRD_PARTY_PATH}/openblas) @@ -28,20 +27,40 @@ IF(NOT ${CBLAS_FOUND}) SET(CBLAS_LIBRARIES "${CBLAS_INSTALL_DIR}/lib/libopenblas.a" CACHE FILEPATH "openblas library" FORCE) ENDIF(WIN32) + IF(CMAKE_COMPILER_IS_GNUCC) + ENABLE_LANGUAGE(Fortran) + LIST(APPEND CBLAS_LIBRARIES gfortran pthread) + ENDIF(CMAKE_COMPILER_IS_GNUCC) + + IF(NOT CMAKE_Fortran_COMPILER) + MESSAGE(FATAL_ERROR "To build lapack in libopenblas, " + "you need to set gfortran compiler: cmake .. -DCMAKE_Fortran_COMPILER=...") + ENDIF(NOT CMAKE_Fortran_COMPILER) + ExternalProject_Add( openblas ${EXTERNAL_PROJECT_LOG_ARGS} - URL "https://github.com/xianyi/OpenBLAS/archive/v0.2.19.tar.gz" + GIT_REPOSITORY https://github.com/xianyi/OpenBLAS.git + GIT_TAG v0.2.19 PREFIX ${CBLAS_SOURCES_DIR} INSTALL_DIR ${CBLAS_INSTALL_DIR} BUILD_IN_SOURCE 1 - CONFIGURE_COMMAND "" - BUILD_COMMAND make CC=${CMAKE_C_COMPILER} FC=${CMAKE_Fortran_COMPILER} - INSTALL_COMMAND make install PREFIX= + BUILD_COMMAND ${CMAKE_MAKE_PROGRAM} FC=${CMAKE_Fortran_COMPILER} CC=${CMAKE_C_COMPILER} HOSTCC=${CMAKE_C_COMPILER} NO_SHARED=1 libs netlib + INSTALL_COMMAND ${CMAKE_MAKE_PROGRAM} install NO_SHARED=1 PREFIX= UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + ) + + ExternalProject_Add_Step( + openblas lapacke_install + COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_mangling_with_flags.h" "${CBLAS_INSTALL_DIR}/include/lapacke_mangling.h" + COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke.h" "${CBLAS_INSTALL_DIR}/include/lapacke.h" + COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_config.h" "${CBLAS_INSTALL_DIR}/include/lapacke_config.h" + COMMAND ${CMAKE_COMMAND} -E copy "${CBLAS_SOURCES_DIR}/src/openblas/lapack-netlib/LAPACKE/include/lapacke_utils.h" "${CBLAS_INSTALL_DIR}/include/lapacke_utils.h" + DEPENDEES install ) LIST(APPEND external_project_dependencies openblas) -ENDIF() +ENDIF(NOT ${CBLAS_FOUND}) INCLUDE_DIRECTORIES(${CBLAS_INC_DIR}) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index c0cf2719f9a7b3ae6be5cefffa3dbd2c3f712e82..613614c0e3d42fac4147f78edbc1bd6d62847419 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -54,6 +54,7 @@ ExternalProject_Add( CONFIGURE_COMMAND 
${CMAKE_COMMAND} ${PROTOBUF_SOURCES_DIR}/src/protobuf/cmake -Dprotobuf_BUILD_TESTS=OFF + -DZLIB_ROOT:FILEPATH=${ZLIB_ROOT} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} diff --git a/cmake/external/python.cmake b/cmake/external/python.cmake index 29247d5c3d474acaa5c65e450780f00b8885ee78..209e679f2cb2178423bf20dec73a0bccef199fcb 100644 --- a/cmake/external/python.cmake +++ b/cmake/external/python.cmake @@ -31,6 +31,7 @@ IF(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) "please use pip to upgrade protobuf.") ENDIF(${PY_GOOGLE.PROTOBUF_VERSION} VERSION_LESS "3.0.0") ELSE(PYTHONLIBS_FOUND AND PYTHONINTERP_FOUND) + MESSAGE(FATAL_ERROR "Please install python 2.7 before building PaddlePaddle.") ##################################### PYTHON ######################################## SET(PYTHON_SOURCES_DIR ${THIRD_PARTY_PATH}/python) SET(PYTHON_INSTALL_DIR ${THIRD_PARTY_PATH}/install/python) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 0d1ef5cd8449bd31b4cfa4619f27bce7c1f55ebb..b76852fc6c50e80633c8294fb2724b83f15293a7 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -96,6 +96,7 @@ set(COMMON_FLAGS -Wno-unused-parameter -Wno-unused-function -Wno-error=literal-suffix + -Wno-error=sign-compare -Wno-error=unused-local-typedefs) set(GPU_COMMON_FLAGS @@ -105,6 +106,7 @@ set(GPU_COMMON_FLAGS -Wdelete-non-virtual-dtor -Wno-unused-parameter -Wno-unused-function + -Wno-error=sign-compare -Wno-error=literal-suffix -Wno-error=unused-local-typedefs -Wno-error=unused-function # Warnings in Numpy Header. diff --git a/cmake/system.cmake b/cmake/system.cmake index 788db404ebfb6facbaedf2910186f3b1afe775c1..1e9f79496441c8597ae29814c650db972ca7117d 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -47,7 +47,7 @@ SET(EXTERNAL_PROJECT_LOG_ARGS LOG_DOWNLOAD 0 # Wrap download in script to log output LOG_UPDATE 1 # Wrap update in script to log output LOG_CONFIGURE 1 # Wrap configure in script to log output - LOG_BUILD 1 # Wrap build in script to log output + LOG_BUILD 0 # Wrap build in script to log output LOG_TEST 1 # Wrap test in script to log output - LOG_INSTALL 1 # Wrap install in script to log output + LOG_INSTALL 0 # Wrap install in script to log output ) diff --git a/doc/getstarted/build_and_install/build_from_source_en.md b/doc/getstarted/build_and_install/build_from_source_en.md index 6954be3b2bb956755c7820bf285addfd15226874..1abd7b698bc09c0cd085dc38e6ed6d53db88d7fc 100644 --- a/doc/getstarted/build_and_install/build_from_source_en.md +++ b/doc/getstarted/build_and_install/build_from_source_en.md @@ -64,7 +64,8 @@ As a simple example, consider the following: 1. **BLAS Dependencies(optional)** - Paddle will find BLAS from system's default path. But you can specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`. + CMake will search for BLAS libraries in the system's default paths. If none are found, OpenBLAS will be downloaded, built and installed automatically. + To use a preinstalled BLAS, simply specify MKL, OpenBLAS or ATLAS via `MKL_ROOT`, `OPENBLAS_ROOT` or `ATLAS_ROOT`.
```bash # specify MKL @@ -99,7 +100,7 @@ As a simple example, consider the following: ```bash # necessary sudo apt-get update - sudo apt-get install -y g++ make cmake build-essential libatlas-base-dev python python-pip libpython-dev git + sudo apt-get install -y g++ make cmake build-essential python python-pip libpython-dev git sudo pip install wheel numpy sudo pip install 'protobuf>=3.0.0' ``` diff --git a/doc/howto/usage/k8s/k8s_aws_en.md b/doc/howto/usage/k8s/k8s_aws_en.md index b04bfba590de42956dfe99256cde325b24adbfab..c776ba9eb97a8c5eadef77ce0119b995fa8da315 100644 --- a/doc/howto/usage/k8s/k8s_aws_en.md +++ b/doc/howto/usage/k8s/k8s_aws_en.md @@ -2,15 +2,9 @@ ## Create AWS Account and IAM Account -To use AWS, we need to sign up an AWS account on Amazon's Web site. -An AWS account allows us to login to the AWS Console Web interface to -create IAM users and user groups. Usually, we create a user group with -privileges required to run PaddlePaddle, and we create users for -those who are going to run PaddlePaddle and add these users into the -group. IAM users can identify themselves using password and tokens, -where passwords allows users to log in to the AWS Console, and tokens -make it easy for users to submit and inspect jobs from the command -line. +An AWS account allows us to manage AWS from the Web Console, and the AWS CLI allows us to manage it from the command line interface. + +We need to create an IAM user with sufficient privileges to create a Kubernetes cluster on AWS. To sign up an AWS account, please follow @@ -19,8 +13,7 @@ To create users and user groups under an AWS account, please follow [this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html). -Please be aware that this tutorial needs the following privileges in -the user group: +Please be aware that this tutorial needs the following privileges for the IAM user: - AmazonEC2FullAccess - AmazonS3FullAccess @@ -31,6 +24,7 @@ the user group: - IAMUserSSHKeys - IAMFullAccess - NetworkAdministrator +- AWSKeyManagementServicePowerUser By the time we write this tutorial, we noticed that Chinese AWS users @@ -46,9 +40,11 @@ it. Here we will show you step by step on how to run PaddlePaddle training on AWS cluster. -###Download kube-aws and kubectl +### Download kube-aws and kubectl + +#### kube-aws -####kube-aws +[kube-aws](https://github.com/coreos/kube-aws) is a CLI tool to automate cluster deployment to AWS. Import the CoreOS Application Signing Public Key: @@ -88,24 +84,22 @@ mv ${PLATFORM}/kube-aws /usr/local/bin ``` -####kubectl +#### kubectl + +[kubectl](https://kubernetes.io/docs/user-guide/kubectl-overview/) is a command line interface for running commands against Kubernetes clusters. Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball. Extract the tarball and then concate the kubernetes binaries directory into PATH: ``` -export PATH=/platforms/linux/amd64:$PATH - +export PATH=/platforms/linux/amd64:$PATH # The exact path depends on your platform ``` -User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth. - - -###Configure AWS Credentials -First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+### Configure AWS Credentials +First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface. And then configure your AWS account information: @@ -126,33 +120,35 @@ Default output format: json ``` -Test that your credentials work by describing any instances you may already have running on your account: +Verify that your credentials work by describing any instances you may already have running on your account: ``` aws ec2 describe-instances ``` -###Define Cluster Parameters +### Define Cluster Parameters -####EC2 key pair +#### EC2 key pair The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node. -After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html). +Follow the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html) to create an EC2 key pair. -####KMS key +After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. + +#### KMS key Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key. You can create a KMS key in the AWS console, or with the aws command line tool: ``` -$ aws kms --region=us-west-2 create-key --description="kube-aws assets" +$ aws kms --region=us-west-1 create-key --description="kube-aws assets" { "KeyMetadata": { "CreationDate": 1458235139.724, "KeyState": "Enabled", - "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx", + "Arn": "arn:aws:kms:us-west-1:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx", "AWSAccountId": "xxxxxxxxxxxxx", "Enabled": true, "KeyUsage": "ENCRYPT_DECRYPT", @@ -166,7 +162,9 @@ You will use the `KeyMetadata.Arn` string to identify your KMS key in the init s And then you need to add several inline policies in your user permission. -kms inline policy: +Go to the IAM user page, click on the `Add inline policy` button, and then select `Custom Policy`. + +Paste in the following inline policy: ``` { @@ -182,16 +180,8 @@ kms inline policy: "Resource": [ "arn:aws:kms:*:xxxxxxxxx:key/*" ] - } - ] -} -``` -cloudformation inline policy: - -``` -"Version": "2012-10-17", - "Statement": [ - { + }, + { "Sid": "Stmt1482205746000", "Effect": "Allow", "Action": [ @@ -200,10 +190,11 @@ cloudformation inline policy: "cloudformation:DeleteStack", "cloudformation:DescribeStacks", "cloudformation:DescribeStackResource", - "cloudformation:GetTemplate" + "cloudformation:GetTemplate", + "cloudformation:DescribeStackEvents" ], "Resource": [ - "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*" + "arn:aws:cloudformation:us-west-1:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*" ] } ] @@ -211,15 +202,23 @@ ``` -####External DNS name +#### External DNS name When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
-####S3 bucket +#### S3 bucket You need to create an S3 bucket before startup the Kubernetes cluster. -####Initialize an asset directory +Create one with the following command (the bucket name must be globally unique): + +``` +aws s3api --region=us-west-1 create-bucket --bucket bucket-name +``` + +If you get an error message, try a different bucket name. The bucket name needs to be globally unique. + +#### Initialize an asset directory Create a directory on your local machine to hold the generated assets: @@ -237,12 +236,16 @@ $ kube-aws init \ --region=us-west-1 \ --availability-zone=us-west-1c \ --key-name=key-pair-name \ ---kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" +--kms-key-arn="arn:aws:kms:us-west-1:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx" ``` +Here `us-west-1c` is used for the `--availability-zone` parameter, but the supported availability zones vary among AWS accounts. + +Please check whether `us-west-1c` is supported with `aws ec2 --region us-west-1 describe-availability-zones`; if not, switch to another supported availability zone (e.g., `us-west-1a` or `us-west-1b`). + There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster. -####Render contents of the asset directory +#### Render contents of the asset directory In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you. @@ -285,21 +288,21 @@ $ tree These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster. -###Kubernetes Cluster Start Up +### Kubernetes Cluster Start Up -####Create the instances defined in the CloudFormation template +#### Create the instances defined in the CloudFormation template -Now for the exciting part, creating your cluster: +Now for the exciting part, creating your cluster (choose any `<prefix>`): ``` $ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix> ``` -####Configure DNS +#### Configure DNS -You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name. +You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. Then use the `dig` command on the load balancer hostname to get the IP address, and use this IP to set up an A record for your external DNS name. -####Access the cluster +#### Access the cluster Once the API server is running, you should see: @@ -312,7 +315,7 @@ ip-10-0-0-xx.us-west-1.compute.internal Ready,SchedulingDisabled 5m ``` -###Setup PaddlePaddle Environment on AWS +### Setup PaddlePaddle Environment on AWS Now, we've created a cluster with following network capability: diff --git a/paddle/function/BufferArg.cpp b/paddle/function/BufferArg.cpp index fde48a73b61c31d06225cc1763efbc6971c86f57..5d595deb12c6c8ea419dd1f31b3c131a2f6a587a 100644 --- a/paddle/function/BufferArg.cpp +++ b/paddle/function/BufferArg.cpp @@ -20,23 +20,27 @@ limitations under the License.
*/ namespace paddle { const SequenceArg& BufferArg::sequence() const { - // CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA); + CHECK_EQ(bufferType_, TENSOR_SEQUENCE_DATA); return dynamic_cast<const SequenceArg&>(*this); } const SparseMatrixArg& BufferArg::sparse() const { - // CHECK_EQ(bufferType_, TENSOR_SPARSE); + CHECK_EQ(bufferType_, TENSOR_SPARSE); return dynamic_cast<const SparseMatrixArg&>(*this); } SparseMatrixArg::SparseMatrixArg(const CpuSparseMatrix& sparse, ArgType argType) : BufferArg(sparse, argType), row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32), - col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {} + col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) { + bufferType_ = TENSOR_SPARSE; +} SparseMatrixArg::SparseMatrixArg(const GpuSparseMatrix& sparse, ArgType argType) : BufferArg(sparse, argType), row_(reinterpret_cast<void*>(sparse.getRows()), VALUE_TYPE_INT32), - col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) {} + col_(reinterpret_cast<void*>(sparse.getCols()), VALUE_TYPE_INT32) { + bufferType_ = TENSOR_SPARSE; +} } // namespace paddle diff --git a/paddle/function/BufferArg.h b/paddle/function/BufferArg.h index 28542a86574812048ff5f72a2d4b2c188f7e30bf..9c792c6bb72aa27fb78519497c3da05452050575 100644 --- a/paddle/function/BufferArg.h +++ b/paddle/function/BufferArg.h @@ -23,10 +23,11 @@ namespace paddle { enum BufferType { - TENSOR_NORMAL = 0, - TENSOR_SEQUENCE_ID = 1, - TENSOR_SEQUENCE_DATA = 2, - TENSOR_SPARSE = 3 + TENSOR_UNKNOWN = 0, + TENSOR_NORMAL = 1, + TENSOR_SEQUENCE_ID = 2, + TENSOR_SEQUENCE_DATA = 3, + TENSOR_SPARSE = 4 }; enum SparseDataType { @@ -98,6 +99,7 @@ public: valueType_(DataType<real>::value), shape_(2), argType_(argType) { + bufferType_ = TENSOR_NORMAL; shape_.setDim(0, matrix.getHeight()); shape_.setDim(1, matrix.getWidth()); } @@ -110,6 +112,7 @@ public: valueType_(DataType<real>::value), shape_(shape), argType_(argType) { + bufferType_ = TENSOR_NORMAL; CHECK_EQ(matrix.getElementCnt(), shape.getElements()); } @@ -119,6 +122,7 @@ public: valueType_(DataType<real>::value), shape_(1), argType_(argType) { + bufferType_ = TENSOR_NORMAL; shape_.setDim(0, vector.getSize()); } @@ -128,6 +132,7 @@ public: valueType_(VALUE_TYPE_INT32), shape_(1), argType_(argType) { + bufferType_ = TENSOR_NORMAL; shape_.setDim(0, vector.getSize()); } @@ -162,6 +167,8 @@ public: ValueType valueType() const { return valueType_; } BufferType bufferType() const { return bufferType_; } const TensorShape& shape() const { return shape_; } + bool isSparse() const { return (TENSOR_SPARSE == bufferType_); } + bool isSequenceArg() const { return TENSOR_SEQUENCE_DATA == bufferType_; } const SequenceArg& sequence() const; const SparseMatrixArg& sparse() const; @@ -170,8 +177,8 @@ protected: void* buf_; ValueType valueType_; TensorShape shape_; - BufferType bufferType_; - ArgType argType_ = UNSPECIFIED; + BufferType bufferType_{TENSOR_UNKNOWN}; + ArgType argType_{UNSPECIFIED}; // leading dimensions.
The size is dims_.size() // Dims lds_; }; @@ -192,11 +199,13 @@ public: const TensorShape& shape, ArgType argType = UNSPECIFIED) : BufferArg(buf, VALUE_TYPE_INT32, shape, argType) { + bufferType_ = TENSOR_SEQUENCE_ID; CHECK_EQ(shape_.ndims(), (size_t)1); numSeqs_ = shape_[0] - 1; } SequenceIdArg(const IVector& vector) : BufferArg(vector) { + bufferType_ = TENSOR_SEQUENCE_ID; numSeqs_ = shape_[0] - 1; } @@ -226,12 +235,16 @@ public: const SequenceIdArg& startPositions, ArgType argType = UNSPECIFIED) : BufferArg(buf, valueType, shape, argType), - startPositions_(startPositions) {} + startPositions_(startPositions) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } SequenceArg(const Matrix& matrix, const IVector& vector, ArgType argType = UNSPECIFIED) - : BufferArg(matrix, argType), startPositions_(vector) {} + : BufferArg(matrix, argType), startPositions_(vector) { + bufferType_ = TENSOR_SEQUENCE_DATA; + } ~SequenceArg() {} @@ -264,6 +277,7 @@ public: nnz_(nnz), format_(format), type_(type) { + bufferType_ = TENSOR_SPARSE; CHECK((valueType == VALUE_TYPE_FLOAT) || (valueType == VALUE_TYPE_DOUBLE)); CHECK_EQ(shape_.ndims(), (size_t)2); CHECK_EQ(row_.shape().ndims(), (size_t)1); diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt index 566fe53b14d85d472ecb32268a476c0a5662560f..a5cf16cb568ee9bafd15a8c9737d933b6fbbd12b 100644 --- a/paddle/function/CMakeLists.txt +++ b/paddle/function/CMakeLists.txt @@ -24,7 +24,7 @@ if(WITH_TESTING) add_simple_unittest(TensorTypeTest) add_simple_unittest(BufferArgTest) add_simple_unittest(FunctionTest) - # add_simple_unittest(ContextProjectionOpTest) + add_simple_unittest(ContextProjectionOpTest) endif() endif() diff --git a/paddle/function/ContextProjectionOp.cpp b/paddle/function/ContextProjectionOp.cpp index cb448562ebb37022f727ee65024f06f69d63e9cb..6cd4e4abee8fccf3a4745b0bfc6701df4ddfa5c0 100644 --- a/paddle/function/ContextProjectionOp.cpp +++ b/paddle/function/ContextProjectionOp.cpp @@ -17,7 +17,10 @@ limitations under the License. */ #include "paddle/math/Vector.h" namespace paddle { - +/** + * Context Projection Forward with CPU Matrix Device. + * + */ template <> void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat, const CpuMatrix& input_mat, @@ -70,10 +73,30 @@ void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix& out_mat, } /** - * \param inputs[0] input value. - * \param inputs[1] input weight. - * \param inputs[2] input sequence. - * \param outputs[0] output value. + * Paddle Function for Context Projection Forward. + * Calculate the output layer value sequence after context projection. + * + * What is Context Projection for a sequence? + * For example, assume the input (x) has 4 words and the dimension of each word + * representation is 2.
If we use zeros to pad instead of a learned weight to pad, + * and the context_length is 3, the output (y) is: + * + * @code + * x = [a1, a2; + * b1, b2; + * c1, c2; + * d1, d2] + * y = [0, 0, a1, a2, b1, b2; + * a1, a2, b1, b2, c1, c2; + * b1, b2, c1, c2, d1, d2; + * c1, c2, d1, d2, 0, 0] + * @endcode + * + * \param outputs[0].matrix output layer value, n * (d * l) + * \param outputs[0].vector start position sequence, n * 1 + * \param inputs[0].matrix input layer value, n * d + * \param inputs[0].vector start position sequence, n * 1 + * \param inputs[1].matrix input layer weight, pad * d */ template <DeviceType Device> class ContextProjectionForwardFunc : public FunctionBase { @@ -85,28 +108,37 @@ public: } void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ((size_t)3, inputs.size()); + CHECK(1 == inputs.size() || 2 == inputs.size()); CHECK_EQ((size_t)1, outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto val_seqs = dynamic_cast<const SequenceArg&>(inputs[0]); + auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]); - CHECK(outputs[0].data() && inputs[0].data() && inputs[2].data()); - CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[2].shape().ndims(), (size_t)1); + CHECK(out_seq.data() && val_seqs.data() && val_seqs.getSequenceId().data()); + CHECK_EQ(out_seq.shape().ndims(), (size_t)2); + CHECK_EQ(val_seqs.shape().ndims(), (size_t)2); + CHECK_EQ(val_seqs.getSequenceId().shape().ndims(), (size_t)1); + if (2 == inputs.size()) { + CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); + } /// dim of output = dim of input * context_length - CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_); - /// dim of input == dim of weight - CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); + CHECK_EQ(out_seq.shape()[1], val_seqs.shape()[1] * context_length_); /// input and output has the same batch_size - CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); + CHECK_EQ(val_seqs.shape()[0], out_seq.shape()[0]); + /// dim of input == dim of weight + if (2 == inputs.size()) { + CHECK_EQ(val_seqs.shape()[1], inputs[1].shape()[1]); + } - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - auto out_mat = outputs[0].matrix<Device>(); - auto in_mat = inputs[0].matrix<Device>(); - auto w_mat = !inputs[1].data() - ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0) - : inputs[1].matrix<Device>(); - auto seq_vec = inputs[2].vector<int, Device>(); + CHECK_EQ(out_seq.getArgType(), ADD_TO); + auto out_mat = out_seq.matrix<Device>(); + const auto in_mat = val_seqs.matrix<Device>(); + const auto w_mat = + (2 == inputs.size()) + ? inputs[1].matrix<Device>() + : typename Tensor<real, Device>::Matrix(nullptr, 0, 0); + const auto seq_vec = val_seqs.getSequenceId().vector<int, Device>(); ContextProjectionForward<Device>(out_mat, in_mat, w_mat, @@ -122,8 +154,12 @@ private: size_t begin_pad_; }; +/** + * Context Projection Backward with CPU Matrix Device.
+ * + */ template <> -void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, +void ContextProjectionBackward<DEVICE_TYPE_CPU>(const CpuMatrix& out_grad_mat, CpuMatrix& in_grad_mat, CpuMatrix& w_grad_mat, const CpuIVector& seq_vec, @@ -146,7 +182,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, int64_t pad_size = std::min(starts[i] - begin, starts[i + 1] - starts[i]); if (is_padding && w_grad_mat) { - MatrixPtr mat = out_grad_mat.subMatrix(starts[i], pad_size); + MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat) + .subMatrix(starts[i], pad_size); MatrixPtr sub = w_grad_mat.subMatrix(j, pad_size); sub->addAtOffset(*mat, j * input_dim); } @@ -157,8 +194,8 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, int64_t pad_size = std::min(end - starts[i + 1], starts[i + 1] - starts[i]); if (is_padding && w_grad_mat) { - MatrixPtr mat = - out_grad_mat.subMatrix(starts[i + 1] - pad_size, pad_size); + MatrixPtr mat = const_cast<CpuMatrix&>(out_grad_mat) + .subMatrix(starts[i + 1] - pad_size, pad_size); MatrixPtr sub = w_grad_mat.subMatrix( begin_pad + context_start + j - pad_size, pad_size); sub->addAtOffset(*mat, j * input_dim); @@ -169,17 +206,22 @@ void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix& out_grad_mat, if (end <= begin) continue; if (!in_grad_mat) continue; MatrixPtr src = in_grad_mat.subMatrix(begin, end - begin); - MatrixPtr dst = out_grad_mat.subMatrix(dst_begin, dst_end - dst_begin); + MatrixPtr dst = const_cast<CpuMatrix&>(out_grad_mat) + .subMatrix(dst_begin, dst_end - dst_begin); src->addAtOffset(*dst, j * input_dim); } } } /** - * \param inputs[0] input grad. - * \param inputs[1] weight grad. - * \param inputs[2] input sequence. - * \param outputs[0] output value. + * Context Projection Backward Function. + * Update the weight gradient and input layer gradient with backprop + * + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start position sequence, n * 1 + * \param outputs[0].matrix input layer grad, n * d + * \param outputs[0].vector start position sequence, n * 1 + * \param outputs[1] weight grad, pad * d */ template <DeviceType Device> class ContextProjectionBackwardFunc : public FunctionBase { @@ -193,32 +235,36 @@ public: void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { - CHECK_EQ((size_t)3, inputs.size()); - CHECK_EQ((size_t)1, outputs.size()); + CHECK_EQ((size_t)1, inputs.size()); + CHECK_EQ((size_t)2, outputs.size()); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]); + auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]); + CHECK(in_seq.data() && in_seq.getSequenceId().data()); + CHECK_EQ(in_seq.shape().ndims(), (size_t)2); + CHECK_EQ(in_seq.getSequenceId().shape().ndims(), (size_t)1); + CHECK_EQ(out_seq.shape().ndims(), (size_t)2); + CHECK_EQ(out_seq.getSequenceId().shape().ndims(), (size_t)1); + CHECK_EQ(outputs[1].shape().ndims(), (size_t)2); - CHECK(outputs[0].data() && inputs[2].data()); - CHECK_EQ(outputs[0].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[0].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[1].shape().ndims(), (size_t)2); - CHECK_EQ(inputs[2].shape().ndims(), (size_t)1); + /// dim of input grad == dim of weight + CHECK_EQ(out_seq.shape()[1], outputs[1].shape()[1]); + /// input and output grad has the same batch_size + CHECK_EQ(out_seq.shape()[0], in_seq.shape()[0]); + /// dim of output grad = dim of input grad * context_length + CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_); + CHECK_EQ(out_seq.getArgType(), ADD_TO); +
CHECK_EQ(outputs[1].getArgType(), ADD_TO); - /// dim of input == dim of weight - CHECK_EQ(inputs[0].shape()[1], inputs[1].shape()[1]); - /// input and output has the same batch_size - CHECK_EQ(inputs[0].shape()[0], outputs[0].shape()[0]); - /// dim of output = dim of input * context_length - CHECK_EQ(outputs[0].shape()[1], inputs[0].shape()[1] * context_length_); - - CHECK_EQ(outputs[0].getArgType(), ADD_TO); - - auto out_grad_mat = outputs[0].matrix<Device>(); + const auto seq_vec = in_seq.getSequenceId().vector<int, Device>(); + const auto out_grad_mat = in_seq.matrix<Device>(); auto in_grad_mat = - !inputs[0].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0) - : inputs[0].matrix<Device>(); - auto w_grad_mat = !inputs[1].data() + !out_seq.data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0) + : out_seq.matrix<Device>(); + auto w_grad_mat = !outputs[1].data() ? typename Tensor<real, Device>::Matrix(nullptr, 0, 0) - : inputs[1].matrix<Device>(); - auto seq_vec = inputs[2].vector<int, Device>(); + : outputs[1].matrix<Device>(); ContextProjectionBackward<Device>(out_grad_mat, in_grad_mat, w_grad_mat, @@ -238,11 +284,16 @@ private: size_t total_pad_; }; -#if 0 /** - * \param inputs[0] input grad. - * \param inputs[1] input sequence. - * \param outputs[0] output grad. + * Context Projection Backward Data Function + * Update input layer grad + * input: sequence of output layer grad + * output: sequence of input layer grad + * + * \param outputs[0].matrix input layer grad, n * d + * \param outputs[0].vector start position sequence, n * 1 + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start position sequence, n * 1 */ template <DeviceType Device> class ContextProjectionBackwardDataFunc : public FunctionBase { @@ -252,32 +303,30 @@ public: context_start_ = config.get<int>("context_start"); } - void calc(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) override { - CHECK_EQ(2, static_cast<int>(inputs.size())); + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1, static_cast<int>(inputs.size())); CHECK_EQ(1, static_cast<int>(outputs.size())); - CHECK_EQ(0, static_cast<int>(inouts.size())); - CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData()); - CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2); - CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2); - CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1); - CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_); + CHECK(inputs[0].isSequenceArg() && outputs[0].isSequenceArg()) + << "SequenceArg required here"; + const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]); + const auto out_seq = dynamic_cast<const SequenceArg&>(outputs[0]); + + CHECK(in_seq.data() && out_seq.data() && in_seq.getSequenceId().data()); + CHECK_EQ(static_cast<int>(out_seq.shape().ndims()), 2); + CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2); + CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1); + /// output layer grad dim == input layer grad dim * context_length_ + CHECK_EQ(in_seq.shape()[1], out_seq.shape()[1] * context_length_); /// input and output has the same batch_size - CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]); + CHECK_EQ(in_seq.shape()[0], out_seq.shape()[0]); + CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO); - auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>( - outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]); - const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>( - inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]); - typename SequenceT<Device>::type seq_vec( - inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData())); + const auto out_grad_mat = in_seq.matrix<Device>(); + const
auto seq_vec = in_seq.getSequenceId().vector<int, Device>(); + auto in_grad_mat = out_seq.matrix<Device>(); - ContextProjectionBackwardData<Device>(out_grad_mat.get(), - in_grad_mat.get(), - seq_vec, - context_length_, - context_start_); + ContextProjectionBackwardData<Device>( + out_grad_mat, in_grad_mat, seq_vec, context_length_, context_start_); } private: @@ -286,9 +335,14 @@ private: }; /** - * \param inputs[0] weight grad. - * \param inputs[1] input sequence. - * \param outputs[0] output grad. + * Context Projection Backward Weight Function + * Update weight grad by backprop + * input: sequence of output layer grad + * output: weight grad + * + * \param outputs[0] weight grad, pad * d + * \param inputs[0].matrix output layer grad, n * (d * l) + * \param inputs[0].vector start position sequence, n * 1 */ template <DeviceType Device> class ContextProjectionBackwardWeightFunc : public FunctionBase { @@ -300,28 +354,25 @@ public: total_pad_ = config.get<size_t>("total_pad"); } - void calc(const Arguments& inputs, - const Arguments& outputs, - const Arguments& inouts) override { - CHECK_EQ(2, static_cast<int>(inputs.size())); + void calc(const BufferArgs& inputs, const BufferArgs& outputs) override { + CHECK_EQ(1, static_cast<int>(inputs.size())); CHECK_EQ(1, static_cast<int>(outputs.size())); - CHECK_EQ(0, static_cast<int>(inouts.size())); - - CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData()); - CHECK_EQ(static_cast<int>(outputs[0].dims_.size()), 2); - CHECK_EQ(static_cast<int>(inputs[0].dims_.size()), 2); - CHECK_EQ(static_cast<int>(inputs[1].dims_.size()), 1); - CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_); - - auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>( - outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]); - auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>( - inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]); - typename SequenceT<Device>::type seq_vec( - inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData())); + CHECK(inputs[0].isSequenceArg()) << "SequenceArg required here"; + const auto in_seq = dynamic_cast<const SequenceArg&>(inputs[0]); + CHECK(in_seq.data() && in_seq.getSequenceId().data() && outputs[0].data()); + CHECK_EQ(static_cast<int>(outputs[0].shape().ndims()), 2); + CHECK_EQ(static_cast<int>(in_seq.shape().ndims()), 2); + CHECK_EQ(static_cast<int>(in_seq.getSequenceId().shape().ndims()), 1); + CHECK_EQ(in_seq.shape()[0], outputs[0].shape()[0]); + /// output layer grad dim == weight dim * context_length_ + CHECK_EQ(in_seq.shape()[1], outputs[0].shape()[1] * context_length_); + CHECK_EQ(outputs[0].getArgType(), ADD_TO); - ContextProjectionBackwardWeight<Device>(out_grad_mat.get(), - w_grad_mat.get(), + const auto seq_vec = in_seq.getSequenceId().vector<int, Device>(); + const auto out_grad_mat = in_seq.matrix<Device>(); + auto w_grad_mat = outputs[0].matrix<Device>(); + ContextProjectionBackwardWeight<Device>(out_grad_mat, + w_grad_mat, seq_vec, context_length_, context_start_, @@ -335,7 +386,6 @@ private: size_t begin_pad_; size_t total_pad_; }; -#endif REGISTER_TYPED_FUNC(ContextProjectionForward, CPU, @@ -350,7 +400,6 @@ REGISTER_TYPED_FUNC(ContextProjectionForward, REGISTER_TYPED_FUNC(ContextProjectionBackward, GPU, ContextProjectionBackwardFunc); -#if 0 REGISTER_TYPED_FUNC(ContextProjectionBackwardData, GPU, ContextProjectionBackwardDataFunc); @@ -358,5 +407,4 @@ REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight, GPU, ContextProjectionBackwardWeightFunc); #endif -#endif } // namespace paddle diff --git a/paddle/function/ContextProjectionOp.h b/paddle/function/ContextProjectionOp.h index a558df5e072f2f4dcc5c45afa385b3cf88872d26..2bdd47e4e9b02483c2c5af82bf00c4e55d68f93e 100644 ---
a/paddle/function/ContextProjectionOp.h +++ b/paddle/function/ContextProjectionOp.h @@ -21,14 +21,14 @@ namespace paddle { /** * \brief Context Projection Forward. * - * \param[out] outputs output data. - * \param[in] input input data. - * \param[in] weight input weight. - * \param[in] sequence input data. - * \param[in] context_length consecutive rows for concatenation. - * \param[in] context_start context start position. - * \param[in] begin_pad begining pad position. - * \param[in] is_padding whether padding 0 or not. + * \param[in/out] outputs output data. + * \param[in] input input data. + * \param[in] weight input weight. + * \param[in] sequence input data. + * \param[in] context_length consecutive rows for concatenation. + * \param[in] context_start context start position. + * \param[in] begin_pad beginning pad position. + * \param[in] is_padding whether padding 0 or not. * */ template <DeviceType DType> @@ -56,7 +56,7 @@ void ContextProjectionForward( */ template <DeviceType DType> void ContextProjectionBackward( - typename Tensor<real, DType>::Matrix& out_grad, + const typename Tensor<real, DType>::Matrix& out_grad, typename Tensor<real, DType>::Matrix& in_grad, typename Tensor<real, DType>::Matrix& w_grad, const typename Tensor<int, DType>::Vector& seq_vec, @@ -68,7 +68,7 @@ void ContextProjectionBackward( template <DeviceType DType> void ContextProjectionBackwardData( - typename Tensor<real, DType>::Matrix& out_grad, + const typename Tensor<real, DType>::Matrix& out_grad, typename Tensor<real, DType>::Matrix& in_grad, const typename Tensor<int, DType>::Vector& sequence, size_t context_length, @@ -76,7 +76,7 @@ void ContextProjectionBackwardData( template <DeviceType DType> void ContextProjectionBackwardWeight( - typename Tensor<real, DType>::Matrix& out_grad, + const typename Tensor<real, DType>::Matrix& out_grad, typename Tensor<real, DType>::Matrix& w_grad, const typename Tensor<int, DType>::Vector& seq_vec, size_t context_length, diff --git a/paddle/function/ContextProjectionOpGpu.cu b/paddle/function/ContextProjectionOpGpu.cu index 6a4a01a6510416fc1f945305203f55ece7a28f11..1a5b4042402df3081a493962a5e080d72b7f40b2 100644 --- a/paddle/function/ContextProjectionOpGpu.cu +++ b/paddle/function/ContextProjectionOpGpu.cu @@ -138,10 +138,10 @@ void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix& output, begin_pad); } -__global__ void KeContextProjectionBackwardData(real* out_grad, +__global__ void KeContextProjectionBackwardData(const real* out_grad, const int* sequence, real* in_grad, - int input_dim, + size_t input_dim, int context_length, int context_start) { int idx = threadIdx.x; @@ -152,7 +152,8 @@ __global__ void KeContextProjectionBackwardData(real* out_grad, real value = 0; int instances = seq_end - seq_start + context_length - 1; - out_grad += seq_start * input_dim * context_length; + auto out = const_cast<real*>(out_grad); + out += seq_start * input_dim * context_length; in_grad += seq_start * input_dim; for (int k = 0; k <= input_dim / block_size; k++) { if (idx < input_dim) { @@ -169,7 +170,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad, int outx = (i - context_length) < 0 ? i : (context_length - 1); int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1)); real* output_r = - out_grad + outy * input_dim * context_length + outx * input_dim; + out + outy * input_dim * context_length + outx * input_dim; for (int j = outy; j < seq_end - seq_start; j++) { value += output_r[idx]; if (j - outy == outx) break; @@ -194,7 +195,7 @@ __global__ void KeContextProjectionBackwardData(real* out_grad, * @param[in] context_start context start.
* */ -void hl_context_projection_backward_data(real* out_grad, +void hl_context_projection_backward_data(const real* out_grad, const int* sequence, real* input_grad, size_t num_sequences, @@ -216,7 +217,7 @@ void hl_context_projection_backward_data(real* out_grad, } template <> -void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad, +void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad, GpuMatrix& in_grad, const GpuIVector& sequence, size_t context_length, @@ -231,7 +232,7 @@ void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix& out_grad, } template <int THREADS_X, int THREADS_Y> -__global__ void KeContextProjectionBackwardWeight(real* out_grad, +__global__ void KeContextProjectionBackwardWeight(const real* out_grad, const int* sequence, real* w_grad, int num_sequences, @@ -254,7 +255,8 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad, for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) { int seq_start = sequence[seqId]; int seq_end = sequence[seqId+1]; - output_r = out_grad + seq_start * w_dim * context_length; + output_r = const_cast<real*>(out_grad) + + seq_start * w_dim * context_length; if (context_start < 0) { if (padId + context_start < 0) { @@ -318,7 +320,7 @@ __global__ void KeContextProjectionBackwardWeight(real* out_grad, * beginning. * */ -void hl_context_projection_backward_weight(real* out_grad, +void hl_context_projection_backward_weight(const real* out_grad, const int* sequence, real* w_grad, size_t num_sequences, @@ -346,7 +348,7 @@ void hl_context_projection_backward_weight(real* out_grad, template <> void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>( - GpuMatrix& out_grad, + const GpuMatrix& out_grad, GpuMatrix& w_grad, const GpuIVector& seq_vec, size_t context_length, @@ -365,7 +367,7 @@ void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>( } template <> -void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix& out_grad, +void ContextProjectionBackward<DEVICE_TYPE_GPU>(const GpuMatrix& out_grad, GpuMatrix& in_grad, GpuMatrix& w_grad, const GpuIVector& sequence, diff --git a/paddle/function/ContextProjectionOpTest.cpp b/paddle/function/ContextProjectionOpTest.cpp index 6223d2fd23ac3bbb4fbcf51d37d22feaf3b1330b..c9db2ff8008e0bb0fa04370fb7b3ecd7641d2062 100644 --- a/paddle/function/ContextProjectionOpTest.cpp +++ b/paddle/function/ContextProjectionOpTest.cpp @@ -56,22 +56,25 @@ void testMatrixProjectionForward(int context_start, cpu_out.randomizeUniform(); gpu_out.copyFrom(cpu_out); - compare.getCpuFunction()->calc( - {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}), - Tensor(cpu_weight ? cpu_weight->getData() : nullptr, - Dims{pad, input_dim}), - Tensor(reinterpret_cast<real*>(cpu_seq->getData()), - Dims{cpu_seq->getSize()})}, - {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})}, - {}); - compare.getGpuFunction()->calc( - {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}), - Tensor(gpu_weight ?
gpu_weight->getData() : nullptr, - Dims{pad, input_dim}), - Tensor(reinterpret_cast<real*>(gpu_seq->getData()), - Dims{gpu_seq->getSize()})}, - {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})}, - {}); + BufferArgs cpu_inputs; + BufferArgs cpu_outputs; + cpu_inputs.addArg(cpu_in, *cpu_seq); + if (cpu_weight) { + cpu_inputs.addArg(*cpu_weight, *cpu_seq); + } + cpu_outputs.addArg(cpu_out, *cpu_seq, ADD_TO); + + compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs); + + BufferArgs gpu_inputs; + BufferArgs gpu_outputs; + gpu_inputs.addArg(gpu_in, *gpu_seq); + if (gpu_weight) { + gpu_inputs.addArg(*gpu_weight, *gpu_seq); + } + gpu_outputs.addArg(gpu_out, *gpu_seq, ADD_TO); + + compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs); autotest::TensorCheckEqual(cpu_out, gpu_out); } @@ -117,25 +120,23 @@ void testMatrixProjectionBackward(int context_start, gpu_w_grad->copyFrom(*cpu_w_grad); } - compare.getCpuFunction()->calc( - {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}), - Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr, - Dims{pad, input_dim}), - Tensor(reinterpret_cast<real*>(cpu_seq->getData()), - Dims{cpu_seq->getSize()})}, - {Tensor(cpu_out_grad.getData(), - Dims{batch_size, input_dim * context_length})}, - {}); - - compare.getGpuFunction()->calc( - {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}), - Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr, - Dims{pad, input_dim}), - Tensor(reinterpret_cast<real*>(gpu_seq->getData()), - Dims{gpu_seq->getSize()})}, - {Tensor(gpu_out_grad.getData(), - Dims{batch_size, input_dim * context_length})}, - {}); + BufferArgs cpu_inputs; + BufferArgs cpu_outputs; + cpu_inputs.addArg(cpu_out_grad, *cpu_seq); + cpu_outputs.addArg(cpu_in_grad, *cpu_seq, ADD_TO); + cpu_outputs.addArg( + cpu_w_grad ? *cpu_w_grad : CpuMatrix(nullptr, 0, input_dim), ADD_TO); + + compare.getCpuFunction()->calc(cpu_inputs, cpu_outputs); + + BufferArgs gpu_inputs; + BufferArgs gpu_outputs; + gpu_inputs.addArg(gpu_out_grad, *gpu_seq); + gpu_outputs.addArg(gpu_in_grad, *gpu_seq, ADD_TO); + gpu_outputs.addArg( + gpu_w_grad ?
*gpu_w_grad : GpuMatrix(nullptr, 0, input_dim), ADD_TO); + + compare.getGpuFunction()->calc(gpu_inputs, gpu_outputs); autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad); if (is_padding) { diff --git a/paddle/function/Function.cpp b/paddle/function/Function.cpp index 3fdc37b9689559393d5c10bfb28f3e17f022f3a8..2632c17e3ac61a6012c82ebff97246fcf063d6f3 100644 --- a/paddle/function/Function.cpp +++ b/paddle/function/Function.cpp @@ -93,6 +93,12 @@ void BufferArgs::addArg(const GpuSparseMatrix& arg, ArgType argType) { addArg(*_args_.back()); } +void BufferArgs::addArg(const Matrix& matrix, + const IVector& vector, + ArgType argType) { + args_.push_back(std::make_shared<SequenceArg>(matrix, vector, argType)); +} + ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_; } // namespace paddle diff --git a/paddle/function/Function.h b/paddle/function/Function.h index b0c6ba0facc7f83796464e6324b6fb0c2eca868d..9215c137eb8e85a9a03575104d7f89bbce441eba 100644 --- a/paddle/function/Function.h +++ b/paddle/function/Function.h @@ -102,6 +102,10 @@ public: void addArg(const CpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); void addArg(const GpuSparseMatrix& arg, ArgType argType = UNSPECIFIED); + void addArg(const Matrix& matrix, + const IVector& vector, + ArgType argType = UNSPECIFIED); + // get argument const BufferArg& operator[](size_t num) const { CHECK_LT(num, args_.size()); diff --git a/paddle/gserver/layers/ContextProjection.cpp b/paddle/gserver/layers/ContextProjection.cpp index ebcc87cbf48a3c34a4e625e67f872fed69cdf44f..d7042af1c25e7432e5b1efbb89cd8fd3f63fb4ae 100644 --- a/paddle/gserver/layers/ContextProjection.cpp +++ b/paddle/gserver/layers/ContextProjection.cpp @@ -118,16 +118,15 @@ void ContextProjection::forward() { /// first use state_, otherwise use weight_(padding false === w nullptr) auto w_ptr = state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr; - auto start_pos = in_->sequenceStartPositions; - + const auto start_pos = in_->sequenceStartPositions->getVector(useGpu_); BufferArgs inputs; BufferArgs outputs; - inputs.addArg(*in_->value); - inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, - w_ptr ? w_ptr->getHeight() : 0, - input_dim)); - inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_)); - outputs.addArg(*out_->value, ADD_TO); + inputs.addArg(*in_->value, *start_pos); + if (w_ptr) { + inputs.addArg(CpuMatrix(w_ptr->getData(), w_ptr->getHeight(), input_dim), + *start_pos); + } + outputs.addArg(*out_->value, *start_pos, ADD_TO); forward_[0]->calc(inputs, outputs); if (state_ && config_.context_start() < 0) { @@ -166,13 +165,16 @@ void ContextProjection::backward(const UpdateCallback& callback) { BufferArgs inputs; BufferArgs outputs; - inputs.addArg(CpuMatrix( - in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim)); - inputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, - w_ptr ? w_ptr->getHeight() : 0, - input_dim)); - inputs.addArg(*in_->sequenceStartPositions->getVector(useGpu_)); - outputs.addArg(*out_->grad, ADD_TO); + inputs.addArg(*out_->grad, *in_->sequenceStartPositions->getVector(useGpu_)); + outputs.addArg( + CpuMatrix( + in_->grad ? in_->grad->getData() : nullptr, batch_size, input_dim), + *in_->sequenceStartPositions->getVector(useGpu_), + ADD_TO); + outputs.addArg(CpuMatrix(w_ptr ? w_ptr->getData() : nullptr, + w_ptr ?
w_ptr->getHeight() : 0, + input_dim), + ADD_TO); backward_[0]->calc(inputs, outputs); if (config_.trainable_padding()) { diff --git a/paddle/py_paddle/dataprovider_converter.py b/paddle/py_paddle/dataprovider_converter.py index 981d10afda2671be9e8f0da1a4bee755f7aa9d61..21d1cb75f4d40e6ed011b33c6366c9d31c0fcc7c 100644 --- a/paddle/py_paddle/dataprovider_converter.py +++ b/paddle/py_paddle/dataprovider_converter.py @@ -34,6 +34,10 @@ class IScanner(object): class DenseScanner(IScanner): + """ + :type __mat__: numpy.ndarray + """ + def __init__(self, input_type, pos): IScanner.__init__(self, input_type, pos) self.__mat__ = None @@ -47,6 +51,8 @@ class DenseScanner(IScanner): def finish_scan(self, argument): assert isinstance(argument, swig_paddle.Arguments) assert isinstance(self.input_type, dp2.InputType) + if self.__mat__.dtype != numpy.float32: + self.__mat__ = self.__mat__.astype(numpy.float32) m = swig_paddle.Matrix.createDenseFromNumpy(self.__mat__, True, False) argument.setSlotValue(self.pos, m) diff --git a/paddle/scripts/travis/before_install.osx.sh b/paddle/scripts/travis/before_install.osx.sh index 7036f971fdd7bac68b67c7b5a92e50c352e214c1..80f031a74e7052d183b5ef21d432476ff1cce722 100755 --- a/paddle/scripts/travis/before_install.osx.sh +++ b/paddle/scripts/travis/before_install.osx.sh @@ -1,6 +1,4 @@ #!/bin/bash brew update brew tap homebrew/science -brew install python -sudo pip install --upgrade protobuf -brew install swig openblas md5sha1sum protobuf +brew install openblas swig md5sha1sum diff --git a/paddle/scripts/travis/build_and_test.sh b/paddle/scripts/travis/build_and_test.sh index fd3aeb02b21d659f783702905117fc838b93eafd..5e6350b57458594163f23cca41a546d7bd9b1eda 100755 --- a/paddle/scripts/travis/build_and_test.sh +++ b/paddle/scripts/travis/build_and_test.sh @@ -6,7 +6,7 @@ if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then export PYTHONPATH=/opt/python/2.7.12/lib/python2.7/site-packages export PYTHONHOME=/opt/python/2.7.12 export PATH=/opt/python/2.7.12/bin:${PATH} - cmake .. -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} + cmake .. -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DON_TRAVIS=ON -DON_COVERALLS=ON -DCOVERALLS_UPLOAD=ON ${EXTRA_CMAKE_OPTS} NRPOC=`nproc` make -j $NPROC make coveralls diff --git a/paddle/scripts/travis/docs.sh b/paddle/scripts/travis/docs.sh index bdafb145bcd4e5990f382bb890f804687c474f7c..6b43cad20b76e9abeb3cb10a726d3d8e3da5f8e2 100755 --- a/paddle/scripts/travis/docs.sh +++ b/paddle/scripts/travis/docs.sh @@ -4,7 +4,7 @@ source ./common.sh # Compile Documentation only. -cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS} +cmake .. -DCMAKE_BUILD_TYPE=Debug -DCMAKE_Fortran_COMPILER=/usr/bin/gfortran-4.8 -DWITH_GPU=OFF -DWITH_DOC=ON ${EXTRA_CMAKE_OPTS} make paddle_docs paddle_docs_cn # check websites for broken links
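As a footnote to the `ContextProjectionOp.cpp` changes above: the following is a minimal, standalone C++ sketch — not PaddlePaddle code — of the zero-padding context projection described in the new doc comment (`context_start = -1`, `context_length = 3`, input dimension 2). It treats the whole batch as one sequence, so the per-sequence start positions, the learned-weight padding path, and the `ADD_TO`/`ASSIGN_TO` argument types of the real functions are all omitted; every name in it is illustrative.

```cpp
// Standalone sketch (illustrative, not PaddlePaddle code) of context
// projection with zero padding, for one sequence of four rows.
#include <cstdio>
#include <vector>

using Matrix = std::vector<std::vector<float>>;

// Forward: row i of `out` concatenates rows i+context_start ..
// i+context_start+context_length-1 of `in`; out-of-range rows stay zero.
void contextProjectionForward(const Matrix& in, Matrix& out,
                              int context_start, int context_length) {
  const int num_rows = static_cast<int>(in.size());
  const int input_dim = static_cast<int>(in[0].size());
  for (int i = 0; i < num_rows; ++i) {
    for (int j = 0; j < context_length; ++j) {
      const int src = i + context_start + j;     // source row in the input
      if (src < 0 || src >= num_rows) continue;  // zero padding
      for (int d = 0; d < input_dim; ++d) {
        out[i][j * input_dim + d] += in[src][d];
      }
    }
  }
}

// Backward data: scatter the output gradient back onto the input gradient;
// this is the exact transpose of the forward gather.
void contextProjectionBackwardData(const Matrix& out_grad, Matrix& in_grad,
                                   int context_start, int context_length) {
  const int num_rows = static_cast<int>(in_grad.size());
  const int input_dim = static_cast<int>(in_grad[0].size());
  for (int i = 0; i < num_rows; ++i) {
    for (int j = 0; j < context_length; ++j) {
      const int src = i + context_start + j;
      if (src < 0 || src >= num_rows) continue;
      for (int d = 0; d < input_dim; ++d) {
        in_grad[src][d] += out_grad[i][j * input_dim + d];
      }
    }
  }
}

int main() {
  const int context_start = -1;
  const int context_length = 3;
  // x = [a1, a2; b1, b2; c1, c2; d1, d2] with a1..d2 instantiated as 1..8.
  const Matrix x = {{1, 2}, {3, 4}, {5, 6}, {7, 8}};
  Matrix y(x.size(), std::vector<float>(x[0].size() * context_length, 0.f));
  contextProjectionForward(x, y, context_start, context_length);
  for (const auto& row : y) {
    for (float v : row) std::printf("%3.0f", v);
    std::printf("\n");  // prints the y matrix from the doc comment
  }
  return 0;
}
```

Running the sketch prints the `y` pattern from the doc comment, and because the backward pass is simply the transpose of the forward gather, the same index bookkeeping (`context_start`, `context_length`, and per-sequence bounds) is shared between the forward and backward kernels in the patch.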