Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into stat_usage

766adaea · zhouyingfeng · 7193cbf1 · 495649af · 766adaea · 766adaea
56 changed file
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -107,7 +107,6 @@ function(link_paddle_exe TARGET_NAME)
        paddle_parameter
        paddle_proto
        paddle_cuda
-        paddle_test_main
        ${METRIC_LIBS}
        ${PROTOBUF_LIBRARY}
        ${LIBGLOG_LIBRARY}
@@ -155,8 +154,9 @@ endfunction()
 # Rest Arguemnts: not used.
 function(link_paddle_test TARGET_NAME)
    link_paddle_exe(${TARGET_NAME})
-    target_link_libraries(${TARGET_NAME} ${GTEST_MAIN_LIBRARIES}
+    target_link_libraries(${TARGET_NAME}
-        ${GTEST_LIBRARIES})
+                          paddle_test_main
+                          ${GTEST_LIBRARIES})
 endfunction()
 # add_unittest_without_exec

--- a/demo/model_zoo/embedding/pre_DictAndModel.sh
+++ b/demo/model_zoo/embedding/pre_DictAndModel.sh
@@ -16,10 +16,9 @@ set -e
 set -x
 BASE_URL='http://paddlepaddle.cdn.bcebos.com/model_zoo/embedding'
-wget ${BASE_URL}/baidu.dict
+DOWNLOAD_ITEMS=(baidu.dict model_32.emb model_64.emb model_128.emb model_256.emb)
+ITEM_MD5=(fa03a12321eaab6c30a8fcc9442eaea3
-DOWNLOAD_ITEMS=(model_32.emb model_64.emb model_128.emb model_256.emb)
+          f88c8325ee6da6187f1080e8fe66c1cd
-ITEM_MD5=(f88c8325ee6da6187f1080e8fe66c1cd
          927cf70f27f860aff1a5703ebf7f1584
 	  a52e43655cd25d279777ed509a1ae27b
 	  b92c67fe9ff70fea53596080e351ac80)

--- a/demo/recommendation/evaluate.py
+++ b/demo/recommendation/evaluate.py
+#!/usr/bin/python
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import sys
+import re
+import math
+def get_best_pass(log_filename):
+    with open(log_filename, 'r') as f:
+        text = f.read()
+        pattern = re.compile('Test.*? cost=([0-9]+\.[0-9]+).*?pass-([0-9]+)',
+                             re.S)
+        results = re.findall(pattern, text)
+        sorted_results = sorted(results, key=lambda result: float(result[0]))
+        return sorted_results[0]
+log_filename = sys.argv[1]
+log = get_best_pass(log_filename)
+predict_error = math.sqrt(float(log[0])) / 2
+print 'Best pass is %s, error is %s, which means predict get error as %f' % (
+    log[1], log[0], predict_error)
+evaluate_pass = "output/pass-%s" % log[1]
+print "evaluating from pass %s" % evaluate_pass
--- a/doc/howto/usage/cluster/k8s-aws/README.md
+++ b/doc/howto/usage/cluster/k8s-aws/README.md
+# PaddlePaddle on AWS with Kubernetes
+## Create AWS Account and IAM Account
+To use AWS, we need to sign up an AWS account on Amazon's Web site.
+An AWS account allows us to login to the AWS Console Web interface to
+create IAM users and user groups.  Usually, we create a user group with
+privileges required to run PaddlePaddle, and we create users for
+those who are going to run PaddlePaddle and add these users into the
+group.  IAM users can identify themselves using password and tokens,
+where passwords allows users to log in to the AWS Console, and tokens
+make it easy for users to submit and inspect jobs from the command
+line.
+To sign up an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/lambda/latest/dg/setting-up.html).
+To create users and user groups under an AWS account, please
+follow
+[this guide](http://docs.aws.amazon.com/IAM/latest/UserGuide/id_users_create.html).
+Please be aware that this tutorial needs the following privileges in
+the user group:
+- AmazonEC2FullAccess
+- AmazonS3FullAccess
+- AmazonRoute53FullAccess
+- AmazonRoute53DomainsFullAccess
+- AmazonElasticFileSystemFullAccess
+- AmazonVPCFullAccess
+- IAMUserSSHKeys
+- IAMFullAccess
+- NetworkAdministrator
+By the time we write this tutorial, we noticed that Chinese AWS users
+might suffer from authentication problems when running this tutorial.
+Our solution is that we create a VM instance with the default Amazon
+AMI and in the same zone as our cluster runs, so we can SSH to this VM
+instance as a tunneling server and control our cluster and jobs from
+it.
+## PaddlePaddle on AWS
+Here we will show you step by step on how to run PaddlePaddle training on AWS cluster.
+###Download kube-aws and kubectl
+####kube-aws
+Import the CoreOS Application Signing Public Key:
+```
+gpg2 --keyserver pgp.mit.edu --recv-key FC8A365E
+```
+Validate the key fingerprint:
+```
+gpg2 --fingerprint FC8A365E
+```
+The correct key fingerprint is `18AD 5014 C99E F7E3 BA5F 6CE9 50BD D3E0 FC8A 365E`
+Go to the [releases](https://github.com/coreos/kube-aws/releases) and download the latest release tarball and detached signature (.sig) for your architecture.
+Validate the tarball's GPG signature:
+```
+PLATFORM=linux-amd64
+ # Or
+PLATFORM=darwin-amd64
+gpg2 --verify kube-aws-${PLATFORM}.tar.gz.sig kube-aws-${PLATFORM}.tar.gz
+```
+Extract the binary:
+```
+tar zxvf kube-aws-${PLATFORM}.tar.gz
+```
+Add kube-aws to your path:
+```
+mv ${PLATFORM}/kube-aws /usr/local/bin
+```
+####kubectl
+Go to the [releases](https://github.com/kubernetes/kubernetes/releases) and download the latest release tarball.
+Extract the tarball and then concate the kubernetes binaries directory into PATH:
+```
+export PATH=<path/to/kubernetes-directory>/platforms/linux/amd64:$PATH
+```
+User credentials and security tokens will be generated later in user directory, not in `~/.kube/config`, they will be necessary to use the CLI or the HTTP Basic Auth.
+###Configure AWS Credentials
+First check out [this](http://docs.aws.amazon.com/cli/latest/userguide/installing.html) for installing the AWS command line interface, if you use ec2 instance with default amazon AMI, the cli tool has already been installed on your machine.
+And then configure your AWS account information:
+```
+aws configure
+```
+Fill in the required fields (You can get your AWS aceess key id and AWS secrete access key by following [this](http://docs.aws.amazon.com/cli/latest/userguide/cli-chap-getting-started.html) instruction):
+```
+AWS Access Key ID: YOUR_ACCESS_KEY_ID
+AWS Secrete Access Key: YOUR_SECRETE_ACCESS_KEY
+Default region name: us-west-2
+Default output format: json
+```
+Test that your credentials work by describing any instances you may already have running on your account:
+```
+aws ec2 describe-instances
+```
+###Define Cluster Parameters
+####EC2 key pair
+The keypair that will authenticate SSH access to your EC2 instances. The public half of this key pair will be configured on each CoreOS node.
+After creating a key pair, you will use the name you gave the keys to configure the cluster. Key pairs are only available to EC2 instances in the same region. More info in the [EC2 Keypair docs](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/ec2-key-pairs.html).
+####KMS key
+Amazon KMS keys are used to encrypt and decrypt cluster TLS assets. If you already have a KMS Key that you would like to use, you can skip creating a new key and provide the Arn string for your existing key.
+You can create a KMS key in the AWS console, or with the aws command line tool:
+```
+$ aws kms --region=us-west-2 create-key --description="kube-aws assets"
+{
+    "KeyMetadata": {
+        "CreationDate": 1458235139.724,
+        "KeyState": "Enabled",
+        "Arn": "arn:aws:kms:us-west-2:xxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx",
+        "AWSAccountId": "xxxxxxxxxxxxx",
+        "Enabled": true,
+        "KeyUsage": "ENCRYPT_DECRYPT",
+        "KeyId": "xxxxxxxxx",
+        "Description": "kube-aws assets"
+    }
+}
+```
+You will use the `KeyMetadata.Arn` string to identify your KMS key in the init step.
+And then you need to add several inline policies in your user permission.
+kms inline policy:
+```
+{
+    "Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205552000",
+            "Effect": "Allow",
+            "Action": [
+                "kms:Decrypt",
+                "kms:Encrypt"
+            ],
+            "Resource": [
+                "arn:aws:kms:*:xxxxxxxxx:key/*"
+            ]
+        }
+    ]
+}
+```
+cloudformation inline policy:
+```
+"Version": "2012-10-17",
+    "Statement": [
+        {
+            "Sid": "Stmt1482205746000",
+            "Effect": "Allow",
+            "Action": [
+                "cloudformation:CreateStack",
+                "cloudformation:UpdateStack",
+                "cloudformation:DeleteStack",
+                "cloudformation:DescribeStacks",
+                "cloudformation:DescribeStackResource",
+                "cloudformation:GetTemplate"
+            ],
+            "Resource": [
+                "arn:aws:cloudformation:us-west-2:xxxxxxxxx:stack/YOUR_CLUSTER_NAME/*"
+            ]
+        }
+    ]
+}
+```
+####External DNS name
+When the cluster is created, the controller will expose the TLS-secured API on a public IP address. You will need to create an A record for the external DNS hostname you want to point to this IP address. You can find the API external IP address after the cluster is created by invoking kube-aws status.
+####S3 bucket
+You need to create an S3 bucket before startup the Kubernetes cluster.
+####Initialize an asset directory
+Create a directory on your local machine to hold the generated assets:
+```
+$ mkdir my-cluster
+$ cd my-cluster
+```
+Initialize the cluster CloudFormation stack with the KMS Arn, key pair name, and DNS name from the previous step:
+```
+$ kube-aws init \
+--cluster-name=my-cluster-name \
+--external-dns-name=my-cluster-endpoint \
+--region=us-west-1 \
+--availability-zone=us-west-1c \
+--key-name=key-pair-name \
+--kms-key-arn="arn:aws:kms:us-west-2:xxxxxxxxxx:key/xxxxxxxxxxxxxxxxxxx"
+```
+There will now be a cluster.yaml file in the asset directory. This is the main configuration file for your cluster.
+####Render contents of the asset directory
+In the simplest case, you can have kube-aws generate both your TLS identities and certificate authority for you.
+```
+$ kube-aws render credentials --generate-ca
+```
+The next command generates the default set of cluster assets in your asset directory.
+```
+sh $ kube-aws render stack
+```
+Here's what the directory structure looks like:
+```
+$ tree
+.
+├── cluster.yaml
+├── credentials
+│   ├── admin-key.pem
+│   ├── admin.pem
+│   ├── apiserver-key.pem
+│   ├── apiserver.pem
+│   ├── ca-key.pem
+│   ├── ca.pem
+│   ├── worker-key.pem
+│   └── worker.pem
+│   ├── etcd-key.pem
+│   └── etcd.pem
+│   ├── etcd-client-key.pem
+│   └── etcd-client.pem
+├── kubeconfig
+├── stack-template.json
+└── userdata
+    ├── cloud-config-controller
+    └── cloud-config-worker
+```
+These assets (templates and credentials) are used to create, update and interact with your Kubernetes cluster.
+###Kubernetes Cluster Start Up
+####Create the instances defined in the CloudFormation template
+Now for the exciting part, creating your cluster:
+```
+$ kube-aws up --s3-uri s3://<your-bucket-name>/<prefix>
+```
+####Configure DNS
+You can invoke `kube-aws status` to get the cluster API endpoint after cluster creation, if necessary. This command can take a while. And then dig the load balancer hostname to get the ip address, use this ip to setup an A record for your external dns name.
+####Access the cluster
+Once the API server is running, you should see:
+```
+$ kubectl --kubeconfig=kubeconfig get nodes
+NAME                                       STATUS                     AGE
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xxx.us-west-1.compute.internal   Ready                      5m
+ip-10-0-0-xx.us-west-1.compute.internal    Ready,SchedulingDisabled   5m
+```
+###Setup PaddlePaddle Environment on AWS
+Now, we've created a cluster with following network capability:
+1. All Kubernetes nodes can communicate with each other.
+1. All Docker containers on Kubernetes nodes can communicate with each other.
+1. All Kubernetes nodes can communicate with all Docker containers on Kubernetes nodes.
+1. All other traffic loads from outside of Kubernetes nodes cannot reach to the Docker containers on Kubernetes nodes except for creating the services for containers.
+For sharing the training data across all the Kubernetes nodes, we use EFS (Elastic File System) in AWS. Ceph might be a better solution, but it requires high version of Linux kernel that might not be stable enough at this moment. We haven't automated the EFS setup at this moment, so please do the following steps:
+1. Make sure you added AmazonElasticFileSystemFullAccess policy in your group.
+1. Create the Elastic File System in AWS console, and attach the new VPC with it.
+<img src="create_efs.png" width="800">
+1. Modify the Kubernetes security group under ec2/Security Groups, add additional inbound policy "All TCP TCP 0 - 65535 0.0.0.0/0" for Kubernetes default VPC security group. 
+<img src="add_security_group.png" width="800">
+1. Follow the EC2 mount instruction to mount the disk onto all the Kubernetes nodes, we recommend to mount EFS disk onto ~/efs.
+<img src="efs_mount.png" width="800">
+Before starting the training, you should place your user config and divided training data onto EFS. When the training start, each task will copy related files from EFS into container, and it will also write the training results back onto EFS, we will show you how to place the data later in this article.
+###Core Concept of PaddlePaddle Training on AWS
+Now we've already setup a 3 nodes distributed Kubernetes cluster, and on each node we've attached the EFS volume, in this training demo, we will create three Kubernetes pod and scheduling them on 3 node. Each pod contains a PaddlePaddle container. When container gets created, it will start pserver and trainer process, load the training data from EFS volume and start the distributed training task.
+####Use Kubernetes Job
+We use Kubernetes job to represent one time of distributed training. After the job get finished, Kubernetes will destroy job container and release all related resources.
+We can write a yaml file to describe the Kubernetes job. The file contains lots of configuration information, for example PaddlePaddle's node number, `paddle pserver` open port number, the network card info etc., these information are passed into container for processes to use as environment variables.
+In one time of distributed training, user will confirm the PaddlePaddle node number first. And then upload the pre-divided training data and configuration file onth EFS volume. And then create the Kubernetes job yaml file; submit to the Kubernetes cluster to start the training job.
+####Create PaddlePaddle Node
+After Kubernetes master gets the request, it will parse the yaml file and create several pods (defined by PaddlePaddle's node number)， Kubernetes will allocate these pods onto cluster's node. A pod represents a PaddlePaddle node, when pod is successfully allocated onto one physical/virtual machine, Kubernetes will startup the container in the pod, and this container will use the environment variables in yaml file and start up `paddle pserver` and `paddle trainer` processes.
+####Start up Training
+After container gets started, it starts up the distributed training by using scripts. We know `paddle train` process need to know other node's ip address and it's own trainer_id, since PaddlePaddle currently don't have the ability to do the service discovery, so in the start up script, each node will use job pod's name to query all to pod info from Kubernetes apiserver (apiserver's endpoint is an environment variable in container by default).
+With pod information, we can assign each pod a unique trainer_id. Here we sort all the pods by pod's ip, and assign the index to each PaddlePaddle node as it's trainer_id. The workflow of starting up the script is as follows:
+1. Query the api server to get pod information, and assign the trainer_id by sorting the ip.
+1. Copy the training data from EFS sharing volume into container.
+1. Parse the `paddle pserver` and 'paddle trainer' startup parameters from environment variables, and then start up the processes.
+1. PaddlePaddle will automatically write the result onto the PaddlePaddle node with trainer_id:0, we set the output path to be the EFS volume to save the result data.
+###Start PaddlePaddle Training Demo on AWS
+Now we'll start a PaddlePaddle training demo on AWS, steps are as follows:
+1. Build PaddlePaddle Docker image.
+1. Divide the training data file and upload it onto the EFS sharing volume.
+1. Create the training job yaml file, and start up the job.
+1. Check the result after training.
+####Build PaddlePaddle Docker Image
+PaddlePaddle docker image need to provide the runtime environment for `paddle pserver` and `paddle train`, so the container use this image should have two main function:
+1. Copy the training data into container.
+1. Generate the startup parameter for `paddle pserver` and `paddle train` process, and startup the training.
+Since official `paddledev/paddle:cpu-latest` have already included the PaddlePaddle binary, but lack of the above functionalities, so we will create the startup script based on this image, to achieve the work above. the detailed Dockerfile is as follows:
+```
+FROM paddledev/paddle:cpu-latest
+MAINTAINER zjsxzong89@gmail.com
+COPY start.sh /root/
+COPY start_paddle.py /root/
+CMD ["bash"," -c","/root/start.sh"]
+```
+At this point, we will copy our `start.sh` and `start_paddle.py` file into container, and then exec `start_paddle.py` script to start up the training, all the steps like assigning trainer_id, getting other nodes' ip are implemented in `start_paddle.py`.
+`start_paddle.py` will start parsing the parameters.
+```
+parser = argparse.ArgumentParser(prog="start_paddle.py",
+                                     description='simple tool for k8s')
+    args, train_args_list = parser.parse_known_args()
+    train_args = refine_unknown_args(train_args_list)
+    train_args_dict = dict(zip(train_args[:-1:2], train_args[1::2]))
+    podlist = getPodList()
+```
+And then using function `getPodList()` to query all the pod information from the job name through Kubernetes api server. When all the pods are in the running status, using `getIdMap(podlist)` to get the trainer_id.
+```
+    podlist = getPodList()
+    # need to wait until all pods are running
+    while not isPodAllRunning(podlist):
+        time.sleep(10)
+        podlist = getPodList()
+    idMap = getIdMap(podlist)
+```
+In function `getIdMap(podlist)`, we use podlist to get the ip address for each pod and sort them, use the index as the trainer_id.
+```
+def getIdMap(podlist):
+    '''
+    generate tainer_id by ip
+    '''
+    ips = []
+    for pod in podlist["items"]:
+        ips.append(pod["status"]["podIP"])
+    ips.sort()
+    idMap = {}
+    for i in range(len(ips)):
+        idMap[ips[i]] = i
+    return idMap
+```
+After getting `idMap`, we use function `startPaddle(idMap, train_args_dict)` to generate `paddle pserver` and `paddle train` start up parameters and then start up the processes.
+In function `startPaddle`, the most important work is to generate `paddle pserver` and `paddle train` start up parameters. For example, `paddle train` parameter parsing, we will get parameters like `PADDLE_NIC`, `PADDLE_PORT`, `PADDLE_PORTS_NUM`, and get the `trainer_id` from `idMap`.
+```
+    program = 'paddle train'
+    args = " --nics=" + PADDLE_NIC
+    args += " --port=" + str(PADDLE_PORT)
+    args += " --ports_num=" + str(PADDLE_PORTS_NUM)
+    args += " --comment=" + "paddle_process_by_paddle"
+    ip_string = ""
+    for ip in idMap.keys():
+        ip_string += (ip + ",")
+    ip_string = ip_string.rstrip(",")
+    args += " --pservers=" + ip_string
+    args_ext = ""
+    for key, value in train_args_dict.items():
+        args_ext += (' --' + key + '=' + value)
+    localIP = socket.gethostbyname(socket.gethostname())
+    trainerId = idMap[localIP]
+    args += " " + args_ext + " --trainer_id=" + \
+        str(trainerId) + " --save_dir=" + JOB_PATH_OUTPUT
+```
+Use `docker build` to build toe Docker Image:
+```
+docker build -t your_repo/paddle:mypaddle .
+```
+And then push the built image onto docker registry.
+```
+docker push  your_repo/paddle:mypaddle
+```
+####Upload Training Data File
+Here we will use PaddlePaddle's official recommendation demo as the content for this training, we put the training data file into a directory named by job name, which located in EFS sharing volume, the tree structure for the directory looks like:
+```
+efs
+└── paddle-cluster-job
+    ├── data
+    │   ├── 0
+    │   │
+    │   ├── 1
+    │   │
+    │   └── 2
+    ├── output
+    └── recommendation
+```
+The `paddle-cluster-job` directory is the job name for this training, this training includes 3 PaddlePaddle node, we store the pre-divided data under `paddle-cluster-job/data` directory, directory 0, 1, 2 each represent 3 nodes' trainer_id. the training data in in recommendation directory, the training results and logs will be in the output directory.
+####Create Kubernetes Job
+Kubernetes use yaml file to describe job details, and then use command line tool to create the job in Kubernetes cluster.
+In yaml file, we describe the Docker image we use for this training, the node number we need to startup, the volume mounting information and all the necessary parameters we need for `paddle pserver` and `paddle train` processes.
+The yaml file content is as follows:
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: paddle-cluster-job
+spec:
+  parallelism: 3
+  completions: 3
+  template:
+    metadata:
+      name: paddle-cluster-job
+    spec:
+      volumes:
+      - name: jobpath
+        hostPath:
+          path: /home/admin/efs
+      containers:
+      - name: trainer
+        image: drinkcode/paddle:k8s-job
+        command: ["bin/bash",  "-c", "/root/start.sh"]
+        env:
+        - name: JOB_NAME
+          value: paddle-cluster-job
+        - name: JOB_PATH
+          value: /home/jobpath
+        - name: JOB_NAMESPACE
+          value: default
+        - name: TRAIN_CONFIG_DIR
+          value: recommendation
+        - name: CONF_PADDLE_NIC
+          value: eth0
+        - name: CONF_PADDLE_PORT
+          value: "7164"
+        - name: CONF_PADDLE_PORTS_NUM
+          value: "2"
+        - name: CONF_PADDLE_PORTS_NUM_SPARSE
+          value: "2"
+        - name: CONF_PADDLE_GRADIENT_NUM
+          value: "3"
+        volumeMounts:
+        - name: jobpath
+          mountPath: /home/jobpath
+        ports:
+        - name: jobport
+          hostPort: 30001
+          containerPort: 30001
+      restartPolicy: Never
+```
+In yaml file, the metadata's name is the job's name. `parallelism, completions` means this job will simultaneously start up 3 PaddlePaddle nodes, and this job will be finished when there are 3 finished pods. For the data store volume, we declare the path jobpath, it mount the /home/admin/efs on host machine into the container with path /home/jobpath. So in container, the /home/jobpath actually stores the data onto EFS sharing volume.
+`env` field represents container's environment variables, we pass the PaddlePaddle parameters into containers by using the `env` field.
+`JOB_PATH` represents the sharing volume path, `JOB_NAME` represents job name, `TRAIN_CONFIG_DIR` represents the training data file directory, we can these three parameters to get the file path for this training.
+`CONF_PADDLE_NIC` represents `paddle pserver` process's `--nics` parameters, the NIC name.
+`CONF_PADDLE_PORT` represents `paddle pserver` process's `--port` parameters, `CONF_PADDLE_PORTS_NUM` represents `--port_num` parameter.
+`CONF_PADDLE_PORTS_NUM_SPARSE` represents the sparse updated port number, `--ports_num_for_sparse` parameter.
+`CONF_PADDLE_GRADIENT_NUM` represents the training node number, `--num_gradient_servers` parameter.
+After we create the yaml file, we can use Kubernetes command line tool to create the job onto the cluster.
+```
+kubectl create -f job.yaml
+```
+After we execute the above command, Kubernetes will create 3 pods and then pull the PaddlePaddle image, then start up the containers for training.
+####Check Training Results
+During the training, we can see the logs and models on EFS sharing volume, the output directory contains the training results. (Caution: node_0, node_1, node_2 directories represents PaddlePaddle node and train_id, not the Kubernetes node)
+```
+[root@paddle-kubernetes-node0 output]# tree -d
+.
+├── node_0
+│   ├── server.log
+│   └── train.log
+├── node_1
+│   ├── server.log
+│   └── train.log
+├── node_2
+......
+├── pass-00002
+│   ├── done
+│   ├── ___embedding_0__.w0
+│   ├── ___embedding_1__.w0
+......
+```
+We can always check the container training status through logs, for example:
+```
+[root@paddle-kubernetes-node0 node_0]# cat train.log
+I1116 09:10:17.123121    50 Util.cpp:155] commandline:
+ /usr/local/bin/../opt/paddle/bin/paddle_trainer
+    --nics=eth0 --port=7164
+    --ports_num=2 --comment=paddle_process_by_paddle
+    --pservers=192.168.129.66,192.168.223.143,192.168.129.71
+    --ports_num_for_sparse=2 --config=./trainer_config.py
+    --trainer_count=4 --num_passes=10 --use_gpu=0 
+    --log_period=50 --dot_period=10 --saving_period=1 
+    --local=0 --trainer_id=0
+    --save_dir=/home/jobpath/paddle-cluster-job/output
+I1116 09:10:17.123440    50 Util.cpp:130] Calling runInitFunctions
+I1116 09:10:17.123764    50 Util.cpp:143] Call runInitFunctions done.
+[WARNING 2016-11-16 09:10:17,227 default_decorators.py:40] please use keyword arguments in paddle config.
+[INFO 2016-11-16 09:10:17,239 networks.py:1282] The input order is [movie_id, title, genres, user_id, gender, age, occupation, rating]
+[INFO 2016-11-16 09:10:17,239 networks.py:1289] The output order is [__regression_cost_0__]
+I1116 09:10:17.392917    50 Trainer.cpp:170] trainer mode: Normal
+I1116 09:10:17.613910    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.680917    50 PyDataProvider2.cpp:257] loading dataprovider dataprovider::process
+I1116 09:10:17.681543    50 GradientMachine.cpp:134] Initing parameters..
+I1116 09:10:18.012390    50 GradientMachine.cpp:141] Init parameters done.
+I1116 09:10:18.018641    50 ParameterClient2.cpp:122] pserver 0 192.168.129.66:7164
+I1116 09:10:18.018950    50 ParameterClient2.cpp:122] pserver 1 192.168.129.66:7165
+I1116 09:10:18.019069    50 ParameterClient2.cpp:122] pserver 2 192.168.223.143:7164
+I1116 09:10:18.019492    50 ParameterClient2.cpp:122] pserver 3 192.168.223.143:7165
+I1116 09:10:18.019716    50 ParameterClient2.cpp:122] pserver 4 192.168.129.71:7164
+I1116 09:10:18.019836    50 ParameterClient2.cpp:122] pserver 5 192.168.129.71:7165
+```
+It'll take around 8 hours to finish this PaddlePaddle recommendation training demo on three 2 core 8 GB EC2 machine (m3.large).
+###Kubernetes Cluster Tear Down
+If you want to tear down the whole Kubernetes cluster, make sure to *delete* the EFS volume first (otherwise, you will get stucked on following steps), and then use the following command:
+```
+kube-aws destroy
+```
+It's an async call, it might take 5 min to tear down the whole cluster.
+If you created any Kubernetes Services of type LoadBalancer, you must delete these first, as the CloudFormation cannot be fully destroyed if any externally-managed resources still exist.
+## For Experts with Kubernetes and AWS
+Sometimes we might need to create or manage the cluster on AWS manually with limited privileges, so here we will explain more on what’s going on with the Kubernetes setup script.
+### Some Presumptions
+* Instances run on CoreOS, the official IAM.
+* Kubernetes node use instance storage, no EBS get mounted.  Etcd is running on additional node.
+* For networking, we use Flannel network at this moment, we will use Calico solution later on.
+* When you create a service with Type=LoadBalancer, Kubernetes will create and ELB, and create a security group for the ELB.
--- a/doc/howto/usage/cluster/k8s-aws/add_security_group.png
+++ b/doc/howto/usage/cluster/k8s-aws/add_security_group.png
--- a/doc/howto/usage/cluster/k8s-aws/create_efs.png
+++ b/doc/howto/usage/cluster/k8s-aws/create_efs.png
--- a/doc/howto/usage/cluster/k8s-aws/efs_mount.png
+++ b/doc/howto/usage/cluster/k8s-aws/efs_mount.png
--- a/doc/howto/usage/cluster/k8s-aws/managed_policy.png
+++ b/doc/howto/usage/cluster/k8s-aws/managed_policy.png
--- a/doc/howto/usage/cluster/k8s/k8s_en.md
+++ b/doc/howto/usage/cluster/k8s/k8s_en.md
+# Paddle On Kubernetes
+>In this article, we will introduce how to run Paddle training job on single CPU machine using Kubernetes. In next article, we will introduce how to run Paddle training job on distributed cluster.
+## Build Docker Image
+In distributed Kubernetes cluster, we will use Ceph or other shared storage system for storing training related data so that all processes in Paddle training can retrieve data from Ceph. In this example, we will only demo training job on single machine. In order to simplify the requirement of the environment, we will directly put training data into Paddle's Docker Image, so we need to create a Paddle Docker image that already includes the training data.
+Paddle's [Quick Start Tutorial](http://www.paddlepaddle.org/doc/demo/quick_start/index_en.html) introduces how to download and train data by using script from Paddle's source code.
+And `paddledev/paddle:cpu-demo-latest` image has the Paddle source code and demo. (Caution: Default Paddle image `paddledev/paddle:cpu-latest` doesn't include the source code, Paddle's different versions of image can be referred here: [Docker installation guide](http://www.paddlepaddle.org/doc/build/docker_install.html)), so we run this container and download the training data, and then commit the whole container to be a new Docker image.
+### Run Docker Container
+```
+$ docker run --name quick_start_data -it paddledev/paddle:cpu-demo-latest
+```
+### Download Training Data
+Getting into `/root/paddle/demo/quick_start/data` Directory，using `get_data.sh` to download training data.
+Then getting into `/root/paddle/demo/quick_start` Directory, using `preprocess.sh` to pre-process training data.
+```
+$ root@fbd1f2bb71f4:~/paddle/demo/quick_start/data# ./get_data.sh
+Downloading Amazon Electronics reviews data...
+--2016-10-31 01:33:43--  http://snap.stanford.edu/data/amazon/productGraph/categoryFiles/reviews_Electronics_5.json.gz
+Resolving snap.stanford.edu (snap.stanford.edu)... 171.64.75.80
+Connecting to snap.stanford.edu (snap.stanford.edu)|171.64.75.80|:80... connected.
+HTTP request sent, awaiting response... 200 OK
+Length: 495854086 (473M) [application/x-gzip]
+Saving to: 'reviews_Electronics_5.json.gz'
+ 10% [=======>                                         ] 874,279     64.7KB/s  eta 2h 13m
+```
+### Modify Startup Script
+After downloading the data，modify `/root/paddle/demo/quick_start/train.sh` file contents are as follows (one more cd cmd):
+```
+set -e
+cd /root/paddle/demo/quick_start
+cfg=trainer_config.lr.py
+#cfg=trainer_config.emb.py
+#cfg=trainer_config.cnn.py
+#cfg=trainer_config.lstm.py
+#cfg=trainer_config.bidi-lstm.py
+#cfg=trainer_config.db-lstm.py
+paddle train \
+  --config=$cfg \
+  --save_dir=./output \
+  --trainer_count=4 \
+  --log_period=20 \
+  --num_passes=15 \
+  --use_gpu=false \
+  --show_parameter_stats_period=100 \
+  --test_all_data_in_one_period=1 \
+  2>&1 | tee 'train.log'
+```
+### Commit Docker Image
+```
+$ docker commit quick_start_data mypaddle/paddle:quickstart
+```
+## Use Kubernetes For Training
+>We will use Kubernetes job for training process, following steps shows how to do the training with Kubernetes.
+### Create Yaml Files
+The output result in container will be demolished when job finished (container stopped running), so we need to mount the volume out to the local disk when creating the container to store the training result. Using our previously created image, we can create a [Kubernetes Job](http://kubernetes.io/docs/user-guide/jobs/#what-is-a-job), the yaml contents are as follows:
+```
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: quickstart
+spec:
+  parallelism: 1
+  completions: 1
+  template:
+    metadata:
+      name: quickstart
+    spec:
+      volumes:
+      - name: output
+        hostPath: 
+          path: /home/work/paddle_output     
+      containers:
+      - name: pi
+        image: mypaddle/paddle:quickstart
+        command: ["bin/bash",  "-c", "/root/paddle/demo/quick_start/train.sh"]
+        volumeMounts:
+        - name: output
+          mountPath: /root/paddle/demo/quick_start/output
+      restartPolicy: Never
+```
+### Start Paddle Job
+Using the above yaml file to start the Kubernetes job.
+```
+$ kubectl  create -f paddle.yaml
+```
+Get the detailed status of the job:
+```
+$ kubectl  get job
+NAME         DESIRED   SUCCESSFUL   AGE
+quickstart   1         0            58s
+$ kubectl  describe job quickstart
+Name:		quickstart
+Namespace:	default
+Image(s):	registry.baidu.com/public/paddle:cpu-demo-latest
+Selector:	controller-uid=f120da72-9f18-11e6-b363-448a5b355b84
+Parallelism:	1
+Completions:	1
+Start Time:	Mon, 31 Oct 2016 11:20:16 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Pods Statuses:	0 Running / 1 Succeeded / 0 Failed
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+Events:
+  FirstSeen	LastSeen	Count	From			SubobjectPath	Type		Reason			Message
+  ---------	--------	-----	----			-------------	--------	------			-------
+  1m		1m		1	{job-controller }			Normal		SuccessfulCreate	Created pod: quickstart-fa0wx
+```
+### Get Training Result
+We can use kubectl command to take a look at the status of related pod.
+```
+$ kubectl  describe pod quickstart-fa0wx
+Name:		quickstart-fa0wx
+Namespace:	default
+Node:		paddle-demo-let02/10.206.202.44
+Start Time:	Mon, 31 Oct 2016 11:20:17 +0800
+Labels:		controller-uid=f120da72-9f18-11e6-b363-448a5b355b84,job-name=quickstart
+Status:		Succeeded
+IP:		10.0.0.9
+Controllers:	Job/quickstart
+Containers:
+  quickstart:
+    Container ID:	docker://b8561f5c79193550d64fa47418a9e67ebdd71546186e840f88de5026b8097465
+    Image:		registry.baidu.com/public/paddle:cpu-demo-latest
+    Image ID:		docker://18e457ce3d362ff5f3febf8e7f85ffec852f70f3b629add10aed84f930a68750
+    Port:
+    Command:
+      bin/bash
+      -c
+      /root/paddle/demo/quick_start/train.sh
+    QoS Tier:
+      cpu:		BestEffort
+      memory:		BestEffort
+    State:		Terminated
+      Reason:		Completed
+      Exit Code:	0
+      Started:		Mon, 31 Oct 2016 11:20:20 +0800
+      Finished:		Mon, 31 Oct 2016 11:21:46 +0800
+    Ready:		False
+    Restart Count:	0
+    Environment Variables:
+Conditions:
+  Type		Status
+  Ready 	False
+Volumes:
+  output:
+    Type:	HostPath (bare host directory volume)
+    Path:	/home/work/paddle_output
+```
+We can also ssh to Kubernetes node to take a look at the training result.
+```
+[root@paddle-demo-let02 paddle_output]# ll
+total 60
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00000
+drwxr-xr-x 2 root root 4096 Oct 31 11:20 pass-00001
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00002
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00003
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00004
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00005
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00006
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00007
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00008
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00009
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00010
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00011
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00012
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00013
+drwxr-xr-x 2 root root 4096 Oct 31 11:21 pass-00014
+```
--- a/paddle/CMakeLists.txt
+++ b/paddle/CMakeLists.txt
 add_subdirectory(cuda)
 add_subdirectory(function)
 add_subdirectory(utils)
+add_subdirectory(testing)
 add_subdirectory(math)
 add_subdirectory(parameter)
 add_subdirectory(gserver)

--- a/paddle/api/Paddle.swig
+++ b/paddle/api/Paddle.swig
@@ -178,6 +178,7 @@ namespace std {
 %newobject ParameterOptimizer::create;
 %newobject ParameterOptimizer::needSpecialTraversal;
 %newobject ParameterUpdater::createLocalUpdater;
+%newobject ParameterUpdater::createRemoteUpdater;
 %feature("director") UpdateCallback;
 %feature("autodoc", 1); // To generate method stub, for code hint in ide

--- a/paddle/api/PaddleAPI.h
+++ b/paddle/api/PaddleAPI.h
@@ -803,6 +803,8 @@ private:
 public:
  static ParameterUpdater* createLocalUpdater(OptimizationConfig* config);
+  static ParameterUpdater* createRemoteUpdater(OptimizationConfig* config,
+                                               int passCount);
  ~ParameterUpdater();
  /**

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -15,15 +15,25 @@ limitations under the License. */
 #include "PaddleAPI.h"
 #include "PaddleAPIPrivate.h"
+#include "paddle/trainer/RemoteParameterUpdater.h"
 #include "paddle/trainer/ThreadParameterUpdater.h"
 ParameterUpdater::ParameterUpdater() : m(new ParameterUpdaterPrivate()) {}
 ParameterUpdater *ParameterUpdater::createLocalUpdater(
    OptimizationConfig *config) {
-  auto param = new ParameterUpdater();
+  auto updater = new ParameterUpdater();
-  param->m->updater.reset(new paddle::SgdThreadUpdater(config->m->getConfig()));
+  updater->m->updater.reset(
-  return param;
+      new paddle::SgdThreadUpdater(config->m->getConfig()));
+  return updater;
+}
+ParameterUpdater *ParameterUpdater::createRemoteUpdater(
+    OptimizationConfig *config, int passCount) {
+  auto updater = new ParameterUpdater();
+  updater->m->updater.reset(new paddle::RemoteParameterUpdater(
+      config->m->getConfig(), passCount, nullptr));
+  return updater;
 }
 ParameterUpdater::~ParameterUpdater() { delete m; }

--- a/paddle/cuda/include/hl_sequence.h
+++ b/paddle/cuda/include/hl_sequence.h
@@ -48,78 +48,6 @@ extern void hl_max_sequence_forward(real* input,
 extern void hl_max_sequence_backward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim);
-/**
- * @brief   Context projection forward.
- *
- * @param[in]   input           input sequence.
- * @param[in]   sequence        sequence index.
- * @param[in]   weightData      padding data.
- * @param[out]  output          output sequence.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- * @param[in]   isPadding       trainable padding.
- *
- */
-extern void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding);
-/**
- * @brief   Context projection backward data.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  inputGrad       input gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   inputDim        input sequence dimension.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- *
- */
-extern void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart);
-/**
- * @brief   Context projection backward weight.
- *
- * @param[in]   outputGrad      output gradient.
- * @param[in]   sequence        sequence index.
- * @param[out]  weightGrad      weight gradient.
- * @param[in]   numSequences    number of sequences.
- * @param[in]   weightDim       input sequence dimension.
- * @param[in]   totalPad        number of extra timesteps.
- * @param[in]   contextLength   context length.
- * @param[in]   contextStart    context start.
- * @param[in]   beginPad        number of extra timesteps added at the
- * beginning.
- *
- */
-extern void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad);
 /**
 * @brief   Memory copy from sequence to batch.
 *

--- a/paddle/cuda/include/stub/hl_sequence_stub.h
+++ b/paddle/cuda/include/stub/hl_sequence_stub.h
@@ -27,35 +27,6 @@ inline void hl_max_sequence_forward(real* input,
 inline void hl_max_sequence_backward(
    real* outputGrad, int* index, real* inputGrad, int numSequences, int dim) {}
-inline void hl_context_projection_forward(real* input,
-                                          const int* sequence,
-                                          real* weightData,
-                                          real* output,
-                                          int numSequences,
-                                          int inputDim,
-                                          int contextLength,
-                                          int contextStart,
-                                          int beginPad,
-                                          bool isPadding) {}
-inline void hl_context_projection_backward_data(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int numSequences,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {}
-inline void hl_context_projection_backward_weight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int totalPad,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {}
 inline void hl_sequence2batch_copy(real* batch,
                                   real* sequence,
                                   const int* batchIndex,

--- a/paddle/cuda/src/hl_cuda_sequence.cu
+++ b/paddle/cuda/src/hl_cuda_sequence.cu
@@ -90,258 +90,6 @@ void hl_max_sequence_backward(real* outputGrad,
  CHECK_SYNC("hl_max_sequence_backward failed");
 }
-template <bool padding>
-__global__ void KeContextProjectionForward(real* input,
-                                           const int* sequence,
-                                           real* weightData,
-                                           real* output,
-                                           int inputDim,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-  int instances = seqEnd - seqStart + contextLength - 1;
-  output += seqStart * inputDim * contextLength;
-  input += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        // i + contextStart;
-        if ((i + contextStart) < 0) {
-          if (padding) {
-            value = weightData[i * inputDim + idx];
-          } else {
-            continue;
-          }
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          if (padding) {
-            value =
-              weightData[(beginPad + i + contextStart - (seqEnd - seqStart)) *
-                         inputDim + idx];
-          } else {
-            continue;
-          }
-        } else {
-          value = input[(i + contextStart) * inputDim + idx];
-        }
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          output + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          output_r[idx] += value;
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-      }
-    }
-    idx += blockSize;
-  }
-}
-void hl_context_projection_forward(real* input,
-                                   const int* sequence,
-                                   real* weightData,
-                                   real* output,
-                                   int numSequences,
-                                   int inputDim,
-                                   int contextLength,
-                                   int contextStart,
-                                   int beginPad,
-                                   bool isPadding) {
-  CHECK_NOTNULL(input);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(output);
-  CHECK(!isPadding || weightData);
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-  if (isPadding) {
-    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  } else  {
-    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
-      (input, sequence, weightData, output, inputDim,
-       contextLength, contextStart, beginPad);
-  }
-  CHECK_SYNC("hl_context_projection_forward failed");
-}
-__global__ void KeContextProjectionBackwardData(real* outputGrad,
-                                                const int* sequence,
-                                                real* inputGrad,
-                                                int inputDim,
-                                                int contextLength,
-                                                int contextStart) {
-  int idx = threadIdx.x;
-  int blockSize = blockDim.x;
-  int sequenceId = blockIdx.x;
-  int seqStart = sequence[sequenceId];
-  int seqEnd = sequence[sequenceId+1];
-  real value = 0;
-  int instances = seqEnd - seqStart + contextLength - 1;
-  outputGrad += seqStart * inputDim * contextLength;
-  inputGrad += seqStart * inputDim;
-  for (int k = 0; k <= inputDim / blockSize; k++) {
-    if (idx < inputDim) {
-      for (int i = 0; i < instances; i++) {
-        if ((i + contextStart) < 0) {
-          continue;
-        } else if ((i + contextStart) >= (seqEnd - seqStart)) {
-          continue;
-        } else {
-          // value = 0;
-          value = inputGrad[(i + contextStart) * inputDim + idx];
-        }
-        int outx = (i - contextLength) < 0 ? i : (contextLength - 1);
-        int outy = (i - contextLength) < 0 ? 0 : (i - (contextLength - 1));
-        real* output_r =
-          outputGrad + outy * inputDim * contextLength + outx * inputDim;
-        for (int j = outy; j < seqEnd - seqStart; j++) {
-          value += output_r[idx];
-          if (j - outy == outx) break;
-          output_r += (contextLength - 1) * inputDim;
-        }
-        inputGrad[(i + contextStart) * inputDim + idx] = value;
-      }
-    }
-    idx += blockSize;
-  }
-}
-void hl_context_projection_backward_data(real* outputGrad,
-                                         const int* sequence,
-                                         real* inputGrad,
-                                         int numSequences,
-                                         int inputDim,
-                                         int contextLength,
-                                         int contextStart) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(inputGrad);
-  int blockSize = 128;
-  int blocksX = numSequences;
-  int blocksY = 1;
-  dim3 threads(blockSize, 1);
-  dim3 grid(blocksX, blocksY);
-  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, inputGrad, inputDim, contextLength, contextStart);
-  CHECK_SYNC("hl_context_projection_backward_data failed");
-}
-template<int THREADS_X, int THREADS_Y>
-__global__ void KeContextProjectionBackwardWeight(real* outputGrad,
-                                                  const int* sequence,
-                                                  real* weightGrad,
-                                                  int numSequences,
-                                                  int weightDim,
-                                                  int contextLength,
-                                                  int contextStart,
-                                                  int beginPad) {
-  __shared__ real sum_s[THREADS_Y][THREADS_X];
-  int padOfBlock = (weightDim + THREADS_X - 1) / THREADS_X;
-  const int idx = threadIdx.x;
-  const int idy = threadIdx.y;
-  int padId = blockIdx.x / padOfBlock;
-  int weightIdx = idx + THREADS_X * (blockIdx.x % padOfBlock);
-  int instanceId;
-  real value = 0;
-  real* output_r;
-  sum_s[idy][idx] = 0.0f;
-  if (weightIdx < weightDim) {
-    for (int seqId = idy; seqId < numSequences; seqId += THREADS_Y) {
-      int seqStart = sequence[seqId];
-      int seqEnd = sequence[seqId+1];
-      output_r = outputGrad + seqStart * weightDim * contextLength;
-      if (contextStart < 0) {
-        if (padId + contextStart < 0) {
-          instanceId = padId;
-        } else {
-          // beginPad > 0;
-          instanceId = (padId - beginPad) + (seqEnd - seqStart) - contextStart;
-        }
-      } else {
-        if (padId + (seqEnd - seqStart) < contextStart) {
-          continue;
-        } else {
-          // beginPad == 0;
-          instanceId = padId + (seqEnd - seqStart) - contextStart;
-        }
-      }
-      int outx = (instanceId - contextLength) < 0 ?
-                 instanceId : (contextLength - 1);
-      int outy = (instanceId - contextLength) < 0 ?
-                 0 : (instanceId - (contextLength - 1));
-      output_r += outy * weightDim * contextLength + outx * weightDim;
-      for (int j = outy; j < seqEnd - seqStart; j++) {
-        value += output_r[weightIdx];
-        if (j - outy == outx) break;
-        output_r += (contextLength - 1) * weightDim;
-      }
-    }
-    sum_s[idy][idx] = value;
-  }
-  __syncthreads();
-  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
-    if (idy < stride) {
-      sum_s[idy][idx] += sum_s[idy + stride][idx];
-    }
-    __syncthreads();
-  }
-  __syncthreads();
-  if (weightIdx < weightDim) {
-    if (idy == 0) {
-      weightGrad[padId * weightDim + weightIdx] += sum_s[0][idx];
-    }
-  }
-}
-void hl_context_projection_backward_weight(real* outputGrad,
-                                           const int* sequence,
-                                           real* weightGrad,
-                                           int numSequences,
-                                           int weightDim,
-                                           int totalPad,
-                                           int contextLength,
-                                           int contextStart,
-                                           int beginPad) {
-  CHECK_NOTNULL(outputGrad);
-  CHECK_NOTNULL(sequence);
-  CHECK_NOTNULL(weightGrad);
-  int threadsX = 32;
-  int threadsY = 32;
-  int blocksX = totalPad * ((weightDim + threadsX - 1) / threadsX);
-  dim3 threads(threadsX, threadsY);
-  dim3 grid(blocksX, 1);
-  KeContextProjectionBackwardWeight<32, 32>
-    <<< grid, threads, 0, STREAM_DEFAULT >>>
-    (outputGrad, sequence, weightGrad, numSequences, weightDim,
-     contextLength, contextStart, beginPad);
-  CHECK_SYNC("hl_context_projection_backward_weight failed");
-}
 template<int blockDimX, int blockDimY, int gridDimX, bool AddRow>
 __global__ void KeMatrixAddRows(real* output,
                                real* table,

--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -11,13 +11,16 @@ endif()
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
-add_library(paddle_test_main STATIC TestMain.cpp)
 if(WITH_GPU)
+if(WITH_TESTING)
    # TODO:
    # file(GLOB test_files . *OpTest.cpp)
    # add_executable(${test_bin} EXCLUDE_FROM_ALL ${test_files})
    add_simple_unittest(CrossMapNormalOpTest)
+    add_unittest(ContextProjectionOpTest
+        ContextProjectionOpTest.cpp
+        ../gserver/tests/TestUtil.cpp)
+endif()
 endif()
 add_style_check_target(paddle_function ${h_files})

--- a/paddle/function/ContextProjectionOp.cpp
+++ b/paddle/function/ContextProjectionOp.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "ContextProjectionOp.h"
+#include "paddle/math/Matrix.h"
+#include "paddle/math/Vector.h"
+namespace paddle {
+template <>
+void ContextProjectionForward<DEVICE_TYPE_CPU>(CpuMatrix* out_mat,
+                                               const CpuMatrix* input_mat,
+                                               const CpuMatrix* weight_mat,
+                                               const CpuIVector& seq_vec,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  const int* starts = seq_vec.getData();
+  const size_t num_sequences = seq_vec.getSize() - 1;
+  auto w_mat = const_cast<CpuMatrix*>(weight_mat);
+  auto in_mat = const_cast<CpuMatrix*>(input_mat);
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i], pad_size);
+        if (w_mat) {
+          MatrixPtr sub = w_mat->subMatrix(j, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        MatrixPtr mat = out_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+        if (w_mat) {
+          MatrixPtr sub = w_mat->subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          mat->addAtOffset(*sub, j * in_mat->getWidth());
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      MatrixPtr src = in_mat->subMatrix(begin, end - begin);
+      MatrixPtr dst = out_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      dst->addAtOffset(*src, j * in_mat->getWidth());
+    }
+  }
+}
+/**
+ * \param inputs[0] input value.
+ * \param inputs[1] input weight.
+ * \param inputs[2] input sequence.
+ * \param outputs[0] output value.
+ */
+template <DeviceType Device>
+class ContextProjectionForwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+  }
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(3, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());
+    CHECK(outputs[0].getData() && inputs[0].getData() && inputs[2].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 2);
+    CHECK_EQ(inputs[2].dims_.size(), 1);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    auto out_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto in_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    const auto w_mat =
+        !inputs[1].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
+    ContextProjectionForward<Device>(out_mat.get(),
+                                     in_mat.get(),
+                                     w_mat.get(),
+                                     seq_vec,
+                                     context_length_,
+                                     context_start_,
+                                     begin_pad_);
+  }
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+};
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_CPU>(CpuMatrix* out_grad_mat,
+                                                CpuMatrix* in_grad_mat,
+                                                CpuMatrix* w_grad_mat,
+                                                const CpuIVector& seq_vec,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+  CHECK(out_grad_mat);
+  size_t input_dim = in_grad_mat ? in_grad_mat->getWidth()
+                                 : w_grad_mat ? w_grad_mat->getWidth() : 0;
+  const int* starts = seq_vec.getData();
+  size_t num_sequences = seq_vec.getSize() - 1;
+  for (size_t i = 0; i < num_sequences; ++i) {
+    for (size_t j = 0; j < context_length; ++j) {
+      int begin = starts[i] + context_start + j;
+      int end = starts[i + 1] + context_start + j;
+      int dst_begin = starts[i];
+      int dst_end = starts[i + 1];
+      if (begin < starts[i]) {
+        int64_t pad_size =
+            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat = out_grad_mat->subMatrix(starts[i], pad_size);
+          MatrixPtr sub = w_grad_mat->subMatrix(j, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_begin = starts[i] + pad_size;
+        begin = starts[i];
+      }
+      if (end > starts[i + 1]) {
+        int64_t pad_size =
+            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
+        if (is_padding && w_grad_mat) {
+          MatrixPtr mat =
+              out_grad_mat->subMatrix(starts[i + 1] - pad_size, pad_size);
+          MatrixPtr sub = w_grad_mat->subMatrix(
+              begin_pad + context_start + j - pad_size, pad_size);
+          sub->addAtOffset(*mat, j * input_dim);
+        }
+        dst_end = starts[i + 1] - pad_size;
+        end = starts[i + 1];
+      }
+      if (end <= begin) continue;
+      if (!in_grad_mat) continue;
+      MatrixPtr src = in_grad_mat->subMatrix(begin, end - begin);
+      MatrixPtr dst = out_grad_mat->subMatrix(dst_begin, dst_end - dst_begin);
+      src->addAtOffset(*dst, j * input_dim);
+    }
+  }
+}
+/**
+ * \param inputs[0] input grad.
+ * \param inputs[1] weight grad.
+ * \param inputs[2] input sequence.
+ * \param outputs[0] output value.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    is_padding_ = config.get<bool>("is_padding");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(3, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());
+    CHECK(outputs[0].getData() && inputs[2].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 2);
+    CHECK_EQ(inputs[2].dims_.size(), 1);
+    /// dim of input == dim of weight
+    CHECK_EQ(inputs[0].dims_[1], inputs[1].dims_[1]);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    /// dim of output = dim of input * context_length
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto in_grad_mat =
+        !inputs[0].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    auto w_grad_mat =
+        !inputs[1].getData()
+            ? nullptr
+            : std::make_shared<typename MatrixT<Device>::type>(
+                  inputs[1].getData(), inputs[1].dims_[0], inputs[1].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[2].dims_[0], reinterpret_cast<int*>(inputs[2].getData()));
+    ContextProjectionBackward<Device>(out_grad_mat.get(),
+                                      in_grad_mat ? in_grad_mat.get() : nullptr,
+                                      w_grad_mat ? w_grad_mat.get() : nullptr,
+                                      seq_vec,
+                                      context_length_,
+                                      context_start_,
+                                      begin_pad_,
+                                      is_padding_,
+                                      total_pad_);
+  }
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  bool is_padding_;
+  size_t total_pad_;
+};
+/**
+ * \param inputs[0] input grad.
+ * \param inputs[1] input sequence.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardDataFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+  }
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    /// input and output has the same batch_size
+    CHECK_EQ(inputs[0].dims_[0], outputs[0].dims_[0]);
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    const auto in_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+    ContextProjectionBackwardData<Device>(out_grad_mat.get(),
+                                          in_grad_mat.get(),
+                                          seq_vec,
+                                          context_length_,
+                                          context_start_);
+  }
+private:
+  size_t context_length_;
+  int context_start_;
+};
+/**
+ * \param inputs[0] weight grad.
+ * \param inputs[1] input sequence.
+ * \param outputs[0] output grad.
+ */
+template <DeviceType Device>
+class ContextProjectionBackwardWeightFunc : public FunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    context_length_ = config.get<size_t>("context_length");
+    context_start_ = config.get<int>("context_start");
+    begin_pad_ = config.get<size_t>("begin_pad");
+    total_pad_ = config.get<size_t>("total_pad");
+  }
+  void calc(const Arguments& inputs,
+            const Arguments& outputs,
+            const Arguments& inouts) override {
+    CHECK_EQ(2, inputs.size());
+    CHECK_EQ(1, outputs.size());
+    CHECK_EQ(0, inouts.size());
+    CHECK(inputs[0].getData() && outputs[0].getData() && inputs[1].getData());
+    CHECK_EQ(outputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[0].dims_.size(), 2);
+    CHECK_EQ(inputs[1].dims_.size(), 1);
+    CHECK_EQ(outputs[0].dims_[1], inputs[0].dims_[1] * context_length_);
+    auto out_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        outputs[0].getData(), outputs[0].dims_[0], outputs[0].dims_[1]);
+    auto w_grad_mat = std::make_shared<typename MatrixT<Device>::type>(
+        inputs[0].getData(), inputs[0].dims_[0], inputs[0].dims_[1]);
+    typename SequenceT<Device>::type seq_vec(
+        inputs[1].dims_[0], reinterpret_cast<int*>(inputs[1].getData()));
+    ContextProjectionBackwardWeight<Device>(out_grad_mat.get(),
+                                            w_grad_mat.get(),
+                                            seq_vec,
+                                            context_length_,
+                                            context_start_,
+                                            total_pad_,
+                                            begin_pad_);
+  }
+private:
+  size_t context_length_;
+  int context_start_;
+  size_t begin_pad_;
+  size_t total_pad_;
+};
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    CPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    CPU,
+                    ContextProjectionBackwardFunc);
+#ifndef PADDLE_ONLY_CPU
+REGISTER_TYPED_FUNC(ContextProjectionForward,
+                    GPU,
+                    ContextProjectionForwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackward,
+                    GPU,
+                    ContextProjectionBackwardFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardData,
+                    GPU,
+                    ContextProjectionBackwardDataFunc);
+REGISTER_TYPED_FUNC(ContextProjectionBackwardWeight,
+                    GPU,
+                    ContextProjectionBackwardWeightFunc);
+#endif
+}  // namespace paddle
--- a/paddle/function/ContextProjectionOp.h
+++ b/paddle/function/ContextProjectionOp.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#pragma once
+#include "Function.h"
+namespace paddle {
+/**
+ * \brief   Context Projection Forward.
+ *
+ * \param[out]  outputs           output data.
+ * \param[in]   input             input data.
+ * \param[in]   weight            input weight.
+ * \param[in]   sequence          input data.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         begining pad position.
+ * \param[in]   is_padding        whether padding 0 or not.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionForward(typename MatrixT<Device>::type* output,
+                              const typename MatrixT<Device>::type* input,
+                              const typename MatrixT<Device>::type* weight,
+                              const typename SequenceT<Device>::type& sequence,
+                              size_t context_length,
+                              int context_start,
+                              size_t begin_pad);
+/**
+ * \brief   Context Projection Backward.
+ *
+ * \param[out]  outputs           output gradient.
+ * \param[in]   input             input gradient.
+ * \param[in]   weight            input weight gradient.
+ * \param[in]   sequence          input data.
+ * \param[in]   context_length    consecutive rows for concatenation.
+ * \param[in]   context_start     context start position.
+ * \param[in]   begin_pad         begining pad position.
+ * \param[in]   is_padding        whether padding 0 or not.
+ *
+ */
+template <DeviceType Device>
+void ContextProjectionBackward(typename MatrixT<Device>::type* out_grad,
+                               typename MatrixT<Device>::type* in_grad,
+                               typename MatrixT<Device>::type* w_grad,
+                               const typename SequenceT<Device>::type& seq_vec,
+                               size_t context_length,
+                               int context_start,
+                               size_t begin_pad,
+                               bool is_padding,
+                               size_t total_pad);
+template <DeviceType Device>
+void ContextProjectionBackwardData(
+    typename MatrixT<Device>::type* out_grad,
+    typename MatrixT<Device>::type* in_grad,
+    const typename SequenceT<Device>::type& sequence,
+    size_t context_length,
+    int context_start);
+template <DeviceType Device>
+void ContextProjectionBackwardWeight(
+    typename MatrixT<Device>::type* out_grad,
+    typename MatrixT<Device>::type* w_grad,
+    const typename SequenceT<Device>::type& seq_vec,
+    size_t context_length,
+    int context_start,
+    size_t total_pad,
+    size_t begin_pad);
+}  // namespace paddle
--- a/paddle/function/ContextProjectionOpGpu.cu
+++ b/paddle/function/ContextProjectionOpGpu.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "hl_base.h"
+#include "ContextProjectionOp.h"
+namespace paddle {
+template <bool padding>
+__global__ void KeContextProjectionForward(const real* input,
+                                           const int* sequence,
+                                           const real* weight,
+                                           real* output,
+                                           int input_dim,
+                                           int context_length,
+                                           int context_start,
+                                           int begin_pad) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+  int instances = seq_end - seq_start + context_length - 1;
+  output += seq_start * input_dim * context_length;
+  input += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        // i + context_start;
+        if ((i + context_start) < 0) {
+          if (padding) {
+            value = weight[i * input_dim + idx];
+          } else {
+            continue;
+          }
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          if (padding) {
+            value =
+              weight[(begin_pad + i + context_start - (seq_end - seq_start)) *
+                         input_dim + idx];
+          } else {
+            continue;
+          }
+        } else {
+          value = input[(i + context_start) * input_dim + idx];
+        }
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          output + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          output_r[idx] += value;
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;
+        }
+      }
+    }
+    idx += block_size;
+  }
+}
+/**
+ * @brief   Context projection forward.
+ *
+ * @param[in]   input           input sequence.
+ * @param[in]   sequence        sequence index.
+ * @param[in]   weight          padding data.
+ * @param[out]  output          output sequence.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ * @param[in]   begin_pad        number of extra timesteps added at the
+ * beginning.
+ *
+ */
+void hl_context_projection_forward(const real* input,
+                                   const int* sequence,
+                                   const real* weight,
+                                   real* output,
+                                   size_t num_sequences,
+                                   size_t input_dim,
+                                   size_t context_length,
+                                   int context_start,
+                                   size_t begin_pad) {
+  CHECK_NOTNULL(input);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(output);
+  int block_size = 128;
+  int blocks_x = num_sequences;
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+  if (weight) {
+    KeContextProjectionForward<true><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  } else  {
+    KeContextProjectionForward<false><<< grid, threads, 0, STREAM_DEFAULT >>>
+      (input, sequence, weight, output, input_dim,
+       context_length, context_start, begin_pad);
+  }
+  CHECK_SYNC("hl_context_projection_forward failed");
+}
+template <>
+void ContextProjectionForward<DEVICE_TYPE_GPU>(GpuMatrix* output,
+                                               const GpuMatrix* input,
+                                               const GpuMatrix* weight,
+                                               const GpuIVector& sequence,
+                                               size_t context_length,
+                                               int context_start,
+                                               size_t begin_pad) {
+  CHECK(input && output);
+  hl_context_projection_forward(input->getData(),
+                                sequence.getData(),
+                                weight ? weight->getData() : nullptr,
+                                output->getData(),
+                                sequence.getSize() - 1,
+                                input->getWidth(),
+                                context_length,
+                                context_start,
+                                begin_pad);
+}
+__global__ void KeContextProjectionBackwardData(real* out_grad,
+                                                const int* sequence,
+                                                real* in_grad,
+                                                int input_dim,
+                                                int context_length,
+                                                int context_start) {
+  int idx = threadIdx.x;
+  int block_size = blockDim.x;
+  int sequenceId = blockIdx.x;
+  int seq_start = sequence[sequenceId];
+  int seq_end = sequence[sequenceId+1];
+  real value = 0;
+  int instances = seq_end - seq_start + context_length - 1;
+  out_grad += seq_start * input_dim * context_length;
+  in_grad += seq_start * input_dim;
+  for (int k = 0; k <= input_dim / block_size; k++) {
+    if (idx < input_dim) {
+      for (int i = 0; i < instances; i++) {
+        if ((i + context_start) < 0) {
+          continue;
+        } else if ((i + context_start) >= (seq_end - seq_start)) {
+          continue;
+        } else {
+          // value = 0;
+          value = in_grad[(i + context_start) * input_dim + idx];
+        }
+        int outx = (i - context_length) < 0 ? i : (context_length - 1);
+        int outy = (i - context_length) < 0 ? 0 : (i - (context_length - 1));
+        real* output_r =
+          out_grad + outy * input_dim * context_length + outx * input_dim;
+        for (int j = outy; j < seq_end - seq_start; j++) {
+          value += output_r[idx];
+          if (j - outy == outx) break;
+          output_r += (context_length - 1) * input_dim;
+        }
+        in_grad[(i + context_start) * input_dim + idx] = value;
+      }
+    }
+    idx += block_size;
+  }
+}
+/**
+ * @brief   Context projection backward data.
+ *
+ * @param[in]   out_grad         output gradient.
+ * @param[in]   sequence         sequence index.
+ * @param[out]  input_grad       input gradient.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   input_dim        input sequence dimension.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ *
+ */
+void hl_context_projection_backward_data(real* out_grad,
+                                         const int* sequence,
+                                         real* input_grad,
+                                         size_t num_sequences,
+                                         size_t input_dim,
+                                         size_t context_length,
+                                         int context_start) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(input_grad);
+  int block_size = 128;
+  int blocks_x = num_sequences;
+  int blocks_y = 1;
+  dim3 threads(block_size, 1);
+  dim3 grid(blocks_x, blocks_y);
+  KeContextProjectionBackwardData<<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, input_grad, input_dim, context_length, context_start);
+  CHECK_SYNC("hl_context_projection_backward_data failed");
+}
+template <>
+void ContextProjectionBackwardData<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+                                                    GpuMatrix* in_grad,
+                                                    const GpuIVector& sequence,
+                                                    size_t context_length,
+                                                    int context_start) {
+  CHECK(in_grad && out_grad);
+  hl_context_projection_backward_data(out_grad->getData(),
+                                      sequence.getData(),
+                                      in_grad->getData(),
+                                      sequence.getSize() - 1,
+                                      in_grad->getWidth(),
+                                      context_length,
+                                      context_start);
+}
+template<int THREADS_X, int THREADS_Y>
+__global__ void KeContextProjectionBackwardWeight(real* out_grad,
+                                                  const int* sequence,
+                                                  real* w_grad,
+                                                  int num_sequences,
+                                                  int w_dim,
+                                                  int context_length,
+                                                  int context_start,
+                                                  int begin_pad) {
+  __shared__ real sum_s[THREADS_Y][THREADS_X];
+  int pad_of_block = (w_dim + THREADS_X - 1) / THREADS_X;
+  const int idx = threadIdx.x;
+  const int idy = threadIdx.y;
+  int padId = blockIdx.x / pad_of_block;
+  int weight_idx = idx + THREADS_X * (blockIdx.x % pad_of_block);
+  int instanceId;
+  real value = 0;
+  real* output_r;
+  sum_s[idy][idx] = 0.0f;
+  if (weight_idx < w_dim) {
+    for (int seqId = idy; seqId < num_sequences; seqId += THREADS_Y) {
+      int seq_start = sequence[seqId];
+      int seq_end = sequence[seqId+1];
+      output_r = out_grad + seq_start * w_dim * context_length;
+      if (context_start < 0) {
+        if (padId + context_start < 0) {
+          instanceId = padId;
+        } else {
+          // begin_pad > 0;
+          instanceId = (padId - begin_pad) +
+            (seq_end - seq_start) - context_start;
+        }
+      } else {
+        if (padId + (seq_end - seq_start) < context_start) {
+          continue;
+        } else {
+          // begin_pad == 0;
+          instanceId = padId + (seq_end - seq_start) - context_start;
+        }
+      }
+      int outx = (instanceId - context_length) < 0 ?
+                 instanceId : (context_length - 1);
+      int outy = (instanceId - context_length) < 0 ?
+                 0 : (instanceId - (context_length - 1));
+      output_r += outy * w_dim * context_length + outx * w_dim;
+      for (int j = outy; j < seq_end - seq_start; j++) {
+        value += output_r[weight_idx];
+        if (j - outy == outx) break;
+        output_r += (context_length - 1) * w_dim;
+      }
+    }
+    sum_s[idy][idx] = value;
+  }
+  __syncthreads();
+  for (int stride = THREADS_Y/2; stride > 0; stride = stride/2) {
+    if (idy < stride) {
+      sum_s[idy][idx] += sum_s[idy + stride][idx];
+    }
+    __syncthreads();
+  }
+  __syncthreads();
+  if (weight_idx < w_dim) {
+    if (idy == 0) {
+      w_grad[padId * w_dim + weight_idx] += sum_s[0][idx];
+    }
+  }
+}
+/**
+ * @brief   Context projection backward weight.
+ *
+ * @param[in]   out_grad         output gradient.
+ * @param[in]   sequence         sequence index.
+ * @param[out]  w_grad           weight gradient.
+ * @param[in]   num_sequences    number of sequences.
+ * @param[in]   w_dim            input sequence dimension.
+ * @param[in]   total_pad        number of extra timesteps.
+ * @param[in]   context_length   context length.
+ * @param[in]   context_start    context start.
+ * @param[in]   begin_pad        number of extra timesteps added at the
+ * beginning.
+ *
+ */
+void hl_context_projection_backward_weight(real* out_grad,
+                                           const int* sequence,
+                                           real* w_grad,
+                                           size_t num_sequences,
+                                           size_t w_dim,
+                                           size_t total_pad,
+                                           size_t context_length,
+                                           int context_start,
+                                           size_t begin_pad) {
+  CHECK_NOTNULL(out_grad);
+  CHECK_NOTNULL(sequence);
+  CHECK_NOTNULL(w_grad);
+  int threads_x = 32;
+  int threads_y = 32;
+  int blocks_x = total_pad * ((w_dim + threads_x - 1) / threads_x);
+  dim3 threads(threads_x, threads_y);
+  dim3 grid(blocks_x, 1);
+  KeContextProjectionBackwardWeight<32, 32>
+    <<< grid, threads, 0, STREAM_DEFAULT >>>
+    (out_grad, sequence, w_grad, num_sequences, w_dim,
+     context_length, context_start, begin_pad);
+  CHECK_SYNC("hl_context_projection_backward_weight failed");
+}
+template <>
+void ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+        GpuMatrix* out_grad,
+        GpuMatrix* w_grad,
+        const GpuIVector& seq_vec,
+        size_t context_length,
+        int context_start,
+        size_t total_pad,
+        size_t begin_pad) {
+  CHECK(out_grad && w_grad);
+  hl_context_projection_backward_weight(out_grad->getData(),
+                                        seq_vec.getData(),
+                                        w_grad->getData(),
+                                        seq_vec.getSize() - 1,
+                                        w_grad->getWidth(),
+                                        total_pad,
+                                        context_length,
+                                        context_start,
+                                        begin_pad);
+}
+template <>
+void ContextProjectionBackward<DEVICE_TYPE_GPU>(GpuMatrix* out_grad,
+                                                GpuMatrix* in_grad,
+                                                GpuMatrix* w_grad,
+                                                const GpuIVector& sequence,
+                                                size_t context_length,
+                                                int context_start,
+                                                size_t begin_pad,
+                                                bool is_padding,
+                                                size_t total_pad) {
+    CHECK(out_grad);
+    if (in_grad) {
+        ContextProjectionBackwardData<DEVICE_TYPE_GPU>(
+                out_grad,
+                in_grad,
+                sequence,
+                context_length,
+                context_start);
+    }
+    if (is_padding && w_grad) {
+        ContextProjectionBackwardWeight<DEVICE_TYPE_GPU>(
+                out_grad,
+                w_grad,
+                sequence,
+                context_length,
+                context_start,
+                total_pad,
+                begin_pad);
+  }
+}
+}  // namespace paddle
--- a/paddle/function/ContextProjectionOpTest.cpp
+++ b/paddle/function/ContextProjectionOpTest.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <gtest/gtest.h>
+#include "FunctionTest.h"
+#include "paddle/gserver/tests/TestUtil.h"
+#include "paddle/math/Matrix.h"
+using namespace paddle;  // NOLINT
+void testMatrixProjectionForward(int context_start,
+                                 size_t context_length,
+                                 bool is_padding,
+                                 size_t batch_size,
+                                 size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+  FunctionCompare compare("ContextProjectionForward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start)));
+  CpuMatrix cpu_in(batch_size, input_dim);
+  cpu_in.randomizeUniform();
+  GpuMatrix gpu_in(batch_size, input_dim);
+  gpu_in.copyFrom(cpu_in);
+  auto cpu_weight =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_weight =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_weight->randomizeUniform();
+    gpu_weight->copyFrom(*cpu_weight);
+  }
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+  CpuMatrix cpu_out(batch_size, input_dim * context_length);
+  GpuMatrix gpu_out(batch_size, input_dim * context_length);
+  cpu_out.randomizeUniform();
+  gpu_out.copyFrom(cpu_out);
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_weight ? cpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_weight ? gpu_weight->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out.getData(), Dims{batch_size, input_dim * context_length})},
+      {});
+  autotest::TensorCheckEqual(cpu_out, gpu_out);
+}
+void testMatrixProjectionBackward(int context_start,
+                                  int context_length,
+                                  bool is_padding,
+                                  size_t batch_size,
+                                  size_t input_dim) {
+  size_t pad = std::max(0, -context_start) +
+               std::max(0, (int)(context_start + context_length - 1));
+  if (pad == 0) is_padding = false;
+  FunctionCompare compare("ContextProjectionBackward",
+                          FuncConfig()
+                              .set("context_length", context_length)
+                              .set("context_start", context_start)
+                              .set("begin_pad", std::max(0, -context_start))
+                              .set("is_padding", is_padding)
+                              .set("total_pad", pad));
+  CpuMatrix cpu_in_grad(batch_size, input_dim);
+  cpu_in_grad.randomizeUniform();
+  GpuMatrix gpu_in_grad(batch_size, input_dim);
+  gpu_in_grad.copyFrom(cpu_in_grad);
+  CpuMatrix cpu_out_grad(batch_size, input_dim * context_length);
+  cpu_out_grad.randomizeUniform();
+  GpuMatrix gpu_out_grad(batch_size, input_dim * context_length);
+  gpu_out_grad.copyFrom(cpu_out_grad);
+  IVectorPtr cpu_seq;
+  generateSequenceStartPositions(batch_size, cpu_seq);
+  IVectorPtr gpu_seq = IVector::create(cpu_seq->getSize(), true);
+  gpu_seq->copyFrom(*cpu_seq);
+  auto cpu_w_grad =
+      is_padding ? std::make_shared<CpuMatrix>(pad, input_dim) : nullptr;
+  auto gpu_w_grad =
+      is_padding ? std::make_shared<GpuMatrix>(pad, input_dim) : nullptr;
+  if (is_padding) {
+    cpu_w_grad->randomizeUniform();
+    gpu_w_grad->copyFrom(*cpu_w_grad);
+  }
+  compare.getCpuFunction()->calc(
+      {Tensor(cpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(cpu_w_grad ? cpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(cpu_seq->getData()),
+              Dims{cpu_seq->getSize()})},
+      {Tensor(cpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+  compare.getGpuFunction()->calc(
+      {Tensor(gpu_in_grad.getData(), Dims{batch_size, input_dim}),
+       Tensor(gpu_w_grad ? gpu_w_grad->getData() : nullptr,
+              Dims{pad, input_dim}),
+       Tensor(reinterpret_cast<real*>(gpu_seq->getData()),
+              Dims{gpu_seq->getSize()})},
+      {Tensor(gpu_out_grad.getData(),
+              Dims{batch_size, input_dim * context_length})},
+      {});
+  autotest::TensorCheckErr(cpu_in_grad, gpu_in_grad);
+  if (is_padding) {
+    autotest::TensorCheckErr(*cpu_w_grad, *gpu_w_grad);
+  }
+}
+TEST(ContextProjection, projection) {
+  for (auto context_start : {-5, -3, -1, 0, 3}) {
+    for (auto context_length : {1, 2, 5, 7}) {
+      for (auto trainable_padding : {false, true}) {
+        for (auto batch_size : {1, 2, 5, 20, 100}) {
+          for (auto input_dim : {15, 32, 63, 128, 200}) {
+            VLOG(3) << " context_start=" << context_start
+                    << " context_length=" << context_length
+                    << " trainable_padding=" << trainable_padding
+                    << " batch_size=" << batch_size
+                    << " input_dim=" << input_dim;
+            testMatrixProjectionForward(context_start,
+                                        context_length,
+                                        trainable_padding,
+                                        batch_size,
+                                        input_dim);
+            testMatrixProjectionBackward(context_start,
+                                         context_length,
+                                         trainable_padding,
+                                         batch_size,
+                                         input_dim);
+          }
+        }
+      }
+    }
+  }
+}
--- a/paddle/function/Function.cpp
+++ b/paddle/function/Function.cpp
@@ -30,6 +30,20 @@ real FuncConfig::get<real>(const std::string& key) const {
  return it->second.r;
 }
+template <>
+int FuncConfig::get<int>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.i;
+}
+template <>
+bool FuncConfig::get<bool>(const std::string& key) const {
+  auto it = valueMap_.find(key);
+  CHECK(it != valueMap_.end()) << "Cannot find value: '" << key << "'";
+  return it->second.b;
+}
 template <>
 FuncConfig& FuncConfig::set<size_t>(const std::string& key, size_t v) {
  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
@@ -44,6 +58,20 @@ FuncConfig& FuncConfig::set<real>(const std::string& key, real v) {
  return *this;
 }
+template <>
+FuncConfig& FuncConfig::set<int>(const std::string& key, int v) {
+  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  valueMap_[key].i = v;
+  return *this;
+}
+template <>
+FuncConfig& FuncConfig::set<bool>(const std::string& key, bool v) {
+  CHECK_EQ(valueMap_.count(key), 0) << "Duplicated value: " << key;
+  valueMap_[key].b = v;
+  return *this;
+}
 ClassRegistrar<FunctionBase> FunctionBase::funcRegistrar_;
 }  // namespace paddle
--- a/paddle/function/Function.h
+++ b/paddle/function/Function.h
@@ -40,6 +40,19 @@ struct MatrixT<DEVICE_TYPE_GPU> {
  using type = GpuMatrix;
 };
+template <DeviceType Device>
+struct SequenceT;
+template <>
+struct SequenceT<DEVICE_TYPE_CPU> {
+  using type = CpuIVector;
+};
+template <>
+struct SequenceT<DEVICE_TYPE_GPU> {
+  using type = GpuIVector;
+};
 typedef std::vector<size_t> Dims;
 class Tensor {
@@ -59,6 +72,8 @@ public:
  union value {
    size_t s;
    real r;
+    int i;
+    bool b;
  };
  template <typename T>

--- a/paddle/function/FunctionTest.h
+++ b/paddle/function/FunctionTest.h
@@ -33,25 +33,33 @@ public:
    // init cpu and gpu arguments
    auto initArgs = [=](
        Arguments& cpuArgs, Arguments& gpuArgs, const Arguments& inArgs) {
-      for (auto arg : inArgs) {
+      for (const auto arg : inArgs) {
        size_t size = sizeof(real);
-        for (auto dim : arg.dims_) {
+        for (const auto dim : arg.dims_) {
          size *= dim;
        }
-        cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
+        if (arg.getData()) {
-        gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
+          // todo(tianbing), waste unnecessary mem here
-        cpuArgs.emplace_back(
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-            Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-        gpuArgs.emplace_back(
+          cpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
-            Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
+          gpuArgs.emplace_back(Tensor((real*)arg.getData(), arg.dims_));
+          // already init outside
-        // will use an api to refactor this code.
+        } else {
-        CpuVector cpuVector(size / sizeof(real),
+          cpuMemory.emplace_back(std::make_shared<CpuMemoryHandle>(size));
-                            (real*)cpuArgs.back().getData());
+          gpuMemory.emplace_back(std::make_shared<GpuMemoryHandle>(size));
-        GpuVector gpuVector(size / sizeof(real),
+          cpuArgs.emplace_back(
-                            (real*)gpuArgs.back().getData());
+              Tensor((real*)cpuMemory.back()->getBuf(), arg.dims_));
-        cpuVector.uniform(0.001, 1);
+          gpuArgs.emplace_back(
-        gpuVector.copyFrom(cpuVector);
+              Tensor((real*)gpuMemory.back()->getBuf(), arg.dims_));
+          // will use an api to refactor this code.
+          CpuVector cpuVector(size / sizeof(real),
+                              (real*)cpuArgs.back().getData());
+          GpuVector gpuVector(size / sizeof(real),
+                              (real*)gpuArgs.back().getData());
+          cpuVector.uniform(0.001, 1);
+          gpuVector.copyFrom(cpuVector);
+        }
      }
    };
    initArgs(cpuInputs, gpuInputs, inputs);
@@ -81,6 +89,10 @@ public:
    checkArgs(cpuInouts, gpuInouts);
  }
+  std::shared_ptr<FunctionBase> getCpuFunction() const { return cpu; }
+  std::shared_ptr<FunctionBase> getGpuFunction() const { return gpu; }
 protected:
  std::shared_ptr<FunctionBase> cpu;
  std::shared_ptr<FunctionBase> gpu;

--- a/paddle/gserver/layers/ContextProjection.cpp
+++ b/paddle/gserver/layers/ContextProjection.cpp
@@ -38,6 +38,32 @@ ContextProjection::ContextProjection(const ProjectionConfig& config,
    CHECK_EQ(inputDim * totalPad, parameter->getSize());
    weight_.reset(new Weight(totalPad, inputDim, parameter));
  }
+  // init forward_ and backward_ functions
+  init();
+}
+bool ContextProjection::init() {
+  size_t context_length = config_.context_length();
+  int context_start = config_.context_start();
+  bool is_padding = config_.trainable_padding();
+  size_t total_pad = is_padding ? beginPad_ + endPad_ : 0;
+  createFunction(forward_,
+                 "ContextProjectionForward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_));
+  createFunction(backward_,
+                 "ContextProjectionBackward",
+                 FuncConfig()
+                     .set("context_length", context_length)
+                     .set("context_start", context_start)
+                     .set("begin_pad", beginPad_)
+                     .set("is_padding", is_padding)
+                     .set("total_pad", total_pad));
+  return true;
 }
 void ContextProjection::resetState() {
@@ -78,25 +104,29 @@ LayerStatePtr ContextProjection::getState() {
 }
 void ContextProjection::forward() {
-  CHECK(in_->value);
+  CHECK(in_->value && out_->value);
  CHECK(in_->sequenceStartPositions);
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
+  size_t input_dim = in_->value->getWidth();
+  size_t dim = out_->value->getWidth();
-  int64_t inputDim = in_->value->getWidth();
+  CHECK_EQ(dim, input_dim * config_.context_length());
-  int64_t dim = out_->value->getWidth();
+  size_t batch_size = in_->value->getHeight();
-  CHECK_EQ(dim, inputDim * config_.context_length());
+  CHECK_EQ(forward_.size(), 1) << "Only one forward function here";
  REGISTER_TIMER_INFO("ContextProjectionForward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
+  bool is_padding = config_.trainable_padding();
-  out_->value->contextProjectionForward(
+  /// first use state_, otherwise use weight_(padding false === w nullptr)
-      *(in_->value),
+  auto w_ptr =
-      state_ ? state_.get() : isPadding ? weight_->getW().get() : nullptr,
+      state_ ? state_.get() : is_padding ? weight_->getW().get() : nullptr;
-      *startPositions,
+  auto start_pos = in_->sequenceStartPositions;
-      config_.context_length(),
+  forward_[0]->calc({Tensor(in_->value->getData(), Dims{batch_size, input_dim}),
-      config_.context_start(),
+                     Tensor(w_ptr ? w_ptr->getData() : nullptr,
-      beginPad_,
+                            Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
-      state_ ? true : isPadding);
+                     Tensor(reinterpret_cast<real*>(
+                                const_cast<int*>(start_pos->getData(useGpu_))),
+                            Dims{start_pos->getSize()})},
+                    {Tensor(out_->value->getData(), Dims{batch_size, dim})},
+                    {});
  if (state_ && config_.context_start() < 0) {
    CHECK_EQ(1, in_->getNumSequences());
@@ -118,41 +148,27 @@ void ContextProjection::forward() {
 }
 void ContextProjection::backward(const UpdateCallback& callback) {
-  CHECK(in_->value);
+  CHECK(in_->value && out_->value && out_->grad);
-  int64_t inputDim = in_->value->getWidth();
+  size_t input_dim = in_->value->getWidth();
-  int64_t dim = out_->value->getWidth();
+  size_t dim = out_->value->getWidth();
-  CHECK_EQ(dim, inputDim * config_.context_length());
+  CHECK_EQ(dim, input_dim * config_.context_length());
-  auto startPositions = in_->sequenceStartPositions->getVector(useGpu_);
+  size_t batch_size = in_->value->getHeight();
+  CHECK_EQ(batch_size, out_->value->getHeight());
+  CHECK_EQ(backward_.size(), 1) << "Only one backward function here";
  REGISTER_TIMER_INFO("ContextProjectionBackward", getName().c_str());
-  bool isPadding = config_.trainable_padding();
+  bool is_padding = config_.trainable_padding();
-  if (!out_->grad->useGpu()) {
+  auto start_pos = in_->sequenceStartPositions;
-    out_->grad->contextProjectionBackward(
+  auto w_ptr = is_padding ? weight_->getWGrad() : nullptr;
-        in_->grad.get(),
+  backward_[0]->calc({Tensor(in_->grad ? in_->grad->getData() : nullptr,
-        isPadding ? weight_->getWGrad().get() : nullptr,
+                             Dims{batch_size, input_dim}),
-        *startPositions,
+                      Tensor(w_ptr ? w_ptr->getData() : nullptr,
-        config_.context_length(),
+                             Dims{w_ptr ? w_ptr->getHeight() : 0, input_dim}),
-        config_.context_start(),
+                      Tensor(reinterpret_cast<real*>(
-        beginPad_,
+                                 const_cast<int*>(start_pos->getData(useGpu_))),
-        isPadding);
+                             Dims{start_pos->getSize()})},
-  } else {
+                     {Tensor(out_->grad->getData(), Dims{batch_size, dim})},
-    if (in_->grad) {
+                     {});
-      out_->grad->contextProjectionBackwardData(*(in_->grad),
-                                                *startPositions,
-                                                config_.context_length(),
-                                                config_.context_start());
-    }
-    if (isPadding && weight_->getWGrad()) {
-      out_->grad->contextProjectionBackwardWeight(
-          *(weight_->getWGrad()),
-          *startPositions,
-          config_.context_length(),
-          config_.context_start(),
-          weight_->getWGrad()->getHeight(),
-          beginPad_);
-    }
-  }
  if (config_.trainable_padding()) {
    weight_->getParameterPtr()->incUpdate(callback);

--- a/paddle/gserver/layers/ContextProjection.h
+++ b/paddle/gserver/layers/ContextProjection.h
@@ -61,6 +61,8 @@ public:
  virtual LayerStatePtr getState();
+  virtual bool init();
 protected:
  std::unique_ptr<Weight> weight_;
  /// number of extra timesteps added at the beginning

--- a/paddle/gserver/layers/Projection.h
+++ b/paddle/gserver/layers/Projection.h
@@ -88,11 +88,37 @@ public:
   */
  virtual LayerStatePtr getState() { return nullptr; }
+  /**
+   * init forward_ and backward_ functions
+   */
+  virtual bool init() { return true; }
  /**
   * Get output size of projection.
   */
  size_t getOutputSize() const { return config_.output_size(); }
+protected:
+  /**
+   * Create layer function. Function is called in forward or backward.
+   * \param function, Layer::forward_ or Layer::backward_
+   * \param name, function name
+   * \param config, initialization configuration for the function
+   */
+  void createFunction(std::vector<std::shared_ptr<FunctionBase>>& function,
+                      const std::string& name,
+                      const FuncConfig& config) {
+    if (useGpu_) {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-GPU"));
+    } else {
+      function.emplace_back(
+          FunctionBase::funcRegistrar_.createByType(name + "-CPU"));
+    }
+    auto& func = function.back();
+    func->init(config);
+  }
 protected:
  /// Config of projection
  ProjectionConfig config_;
@@ -106,5 +132,9 @@ protected:
  const Argument* out_;
  /// Store `passType` passed to forward()
  PassType passType_;
+  /// Layer forward function
+  std::vector<std::shared_ptr<FunctionBase>> forward_;
+  /// Layer backward function
+  std::vector<std::shared_ptr<FunctionBase>> backward_;
 };
 }  // namespace paddle
--- a/paddle/gserver/tests/test_LinearChainCRF.cpp
+++ b/paddle/gserver/tests/test_LinearChainCRF.cpp
@@ -65,9 +65,3 @@ TEST(LinearChainCRF, decoding) {
    }
  }
 }
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/gserver/tests/test_ProtoDataProvider.cpp
+++ b/paddle/gserver/tests/test_ProtoDataProvider.cpp
@@ -730,9 +730,3 @@ TEST(ProtoSequenceDataProvider, test) {
    }        // end for (int numIdSlots : numSlotsArray)
  }          // end for (int numSparseNonValueVecSlots : numSlotsArray)
 }
-int main(int argc, char** argv) {
-  initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/gserver/tests/test_WarpCTCLayer.cpp
+++ b/paddle/gserver/tests/test_WarpCTCLayer.cpp
@@ -242,9 +242,3 @@ TEST(Layer, WarpCTCLayer) {
    }
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -1304,68 +1304,6 @@ void GpuMatrix::maxSequenceBackward(Matrix& outputGrad,
  hl_max_sequence_backward(outGrad, maxIndex, inputGrad, numSequences, dim);
 }
-void GpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  CHECK(dynamic_cast<GpuMatrix*>(&input));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  if (weight) CHECK(dynamic_cast<GpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input.getWidth() * contextLength);
-  hl_context_projection_forward(input.getData(),
-                                sequence.getData(),
-                                isPadding ? weight->getData() : NULL,
-                                getData(),
-                                sequence.getSize() - 1,
-                                input.getWidth(),
-                                contextLength,
-                                contextStart,
-                                beginPad,
-                                isPadding);
-}
-void GpuMatrix::contextProjectionBackwardData(Matrix& inputGrad,
-                                              const IVector& sequence,
-                                              int contextLength,
-                                              int contextStart) {
-  CHECK(dynamic_cast<GpuMatrix*>(&inputGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), inputGrad.getWidth() * contextLength);
-  hl_context_projection_backward_data(getData(),
-                                      sequence.getData(),
-                                      inputGrad.getData(),
-                                      sequence.getSize() - 1,
-                                      inputGrad.getWidth(),
-                                      contextLength,
-                                      contextStart);
-}
-void GpuMatrix::contextProjectionBackwardWeight(Matrix& weightGrad,
-                                                const IVector& sequence,
-                                                int contextLength,
-                                                int contextStart,
-                                                int totalPad,
-                                                size_t beginPad) {
-  CHECK(dynamic_cast<GpuMatrix*>(&weightGrad));
-  CHECK(dynamic_cast<const GpuIVector*>(&sequence));
-  CHECK_EQ(getWidth(), weightGrad.getWidth() * contextLength);
-  hl_context_projection_backward_weight(getData(),
-                                        sequence.getData(),
-                                        weightGrad.getData(),
-                                        sequence.getSize() - 1,
-                                        weightGrad.getWidth(),
-                                        totalPad,
-                                        contextLength,
-                                        contextStart,
-                                        beginPad);
-}
 void GpuMatrix::paramReluForward(Matrix& data, Matrix& W) {
  CHECK(data.useGpu_ == true && W.useGpu_ == true)
      << "Matrix type are not equal";
@@ -2203,113 +2141,6 @@ void CpuMatrix::maxSequenceBackward(Matrix& outputGrad,
  }
 }
-void CpuMatrix::contextProjectionForward(Matrix& input,
-                                         Matrix* weight,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-  auto input_ptr = dynamic_cast<CpuMatrix*>(&input);
-  auto seq_ptr = dynamic_cast<const CpuIVector*>(&sequence);
-  CHECK(input_ptr && seq_ptr);
-  if (weight) CHECK(dynamic_cast<CpuMatrix*>(weight));
-  CHECK_EQ(getWidth(), input_ptr->getWidth() * contextLength);
-  const int* starts = seq_ptr->getData();
-  size_t numSequences = seq_ptr->getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i], padSize);
-        if (isPadding) {
-          MatrixPtr sub = weight->subMatrix(j, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-        if (isPadding) {
-          MatrixPtr sub =
-              weight->subMatrix(beginPad + contextStart + j - padSize, padSize);
-          mat->addAtOffset(*sub, j * input_ptr->getWidth());
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      MatrixPtr src = input_ptr->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      dst->addAtOffset(*src, j * input_ptr->getWidth());
-    }
-  }
-}
-void CpuMatrix::contextProjectionBackward(Matrix* inputGrad,
-                                          Matrix* weightGrad,
-                                          const IVector& sequence,
-                                          int contextLength,
-                                          int contextStart,
-                                          size_t beginPad,
-                                          bool isPadding) {
-  if (inputGrad) CHECK(dynamic_cast<CpuMatrix*>(inputGrad));
-  if (weightGrad) CHECK(dynamic_cast<CpuMatrix*>(weightGrad));
-  CHECK(dynamic_cast<const CpuIVector*>(&sequence));
-  int64_t inputDim = inputGrad ? inputGrad->getWidth()
-                               : weightGrad ? weightGrad->getWidth() : 0;
-  CHECK_EQ(getWidth(), inputDim * contextLength);
-  const int* starts = sequence.getData();
-  size_t numSequences = sequence.getSize() - 1;
-  for (size_t i = 0; i < numSequences; ++i) {
-    for (int j = 0; j < contextLength; ++j) {
-      int begin = starts[i] + contextStart + j;
-      int end = starts[i + 1] + contextStart + j;
-      int dstBegin = starts[i];
-      int dstEnd = starts[i + 1];
-      if (begin < starts[i]) {
-        int64_t padSize =
-            std::min(starts[i] - begin, starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i], padSize);
-          MatrixPtr sub = weightGrad->subMatrix(j, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstBegin = starts[i] + padSize;
-        begin = starts[i];
-      }
-      if (end > starts[i + 1]) {
-        int64_t padSize =
-            std::min(end - starts[i + 1], starts[i + 1] - starts[i]);
-        if (isPadding && weightGrad) {
-          MatrixPtr mat = this->subMatrix(starts[i + 1] - padSize, padSize);
-          MatrixPtr sub = weightGrad->subMatrix(
-              beginPad + contextStart + j - padSize, padSize);
-          sub->addAtOffset(*mat, j * inputDim);
-        }
-        dstEnd = starts[i + 1] - padSize;
-        end = starts[i + 1];
-      }
-      if (end <= begin) continue;
-      if (!inputGrad) continue;
-      MatrixPtr src = inputGrad->subMatrix(begin, end - begin);
-      MatrixPtr dst = this->subMatrix(dstBegin, dstEnd - dstBegin);
-      src->addAtOffset(*dst, j * inputDim);
-    }
-  }
-}
 inline void vecAddTo(real* a, const real* b, size_t len) {
  for (unsigned int i = 0; i < len; ++i) {
    a[i] += b[i];

--- a/paddle/math/Matrix.h
+++ b/paddle/math/Matrix.h
@@ -972,42 +972,6 @@ public:
    LOG(FATAL) << "Not implemeted";
  }
-  virtual void contextProjectionForward(Matrix& input,
-                                        Matrix* weight,
-                                        const IVector& sequence,
-                                        int contextLength,
-                                        int contextStart,
-                                        size_t beginPad,
-                                        bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-  virtual void contextProjectionBackward(Matrix* inputGrad,
-                                         Matrix* weightGrad,
-                                         const IVector& sequence,
-                                         int contextLength,
-                                         int contextStart,
-                                         size_t beginPad,
-                                         bool isPadding) {
-    LOG(FATAL) << "Not implemeted";
-  }
-  virtual void contextProjectionBackwardData(Matrix& inputGrad,
-                                             const IVector& sequence,
-                                             int contextLength,
-                                             int contextStart) {
-    LOG(FATAL) << "Not implemeted";
-  }
-  virtual void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                               const IVector& sequence,
-                                               int contextLength,
-                                               int contextStart,
-                                               int totalPad,
-                                               size_t beginPad) {
-    LOG(FATAL) << "Not implemeted";
-  }
  /**
   * @code
   * this.row[i] += table.row[ids[i]]
@@ -1442,26 +1406,6 @@ public:
                           const IVector& sequence,
                           IVector& index);
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-  void contextProjectionBackwardData(Matrix& inputGrad,
-                                     const IVector& sequence,
-                                     int contextLength,
-                                     int contextStart);
-  void contextProjectionBackwardWeight(Matrix& weightGrad,
-                                       const IVector& sequence,
-                                       int contextLength,
-                                       int contextStart,
-                                       int totalPad,
-                                       size_t beginPad);
  void bilinearForward(const Matrix& in,
                       const size_t inImgH,
                       const size_t inImgW,
@@ -1648,22 +1592,6 @@ public:
                           const IVector& sequence,
                           IVector& index);
-  void contextProjectionForward(Matrix& input,
-                                Matrix* weight,
-                                const IVector& sequence,
-                                int contextLength,
-                                int contextStart,
-                                size_t beginPad,
-                                bool isPadding);
-  void contextProjectionBackward(Matrix* inputGrad,
-                                 Matrix* weightGrad,
-                                 const IVector& sequence,
-                                 int contextLength,
-                                 int contextStart,
-                                 size_t beginPad,
-                                 bool isPadding);
  real* getRow(size_t row) { return BaseMatrix::rowBuf(row); }
  virtual real* getRowBuf(size_t row) { return getRow(row); }

--- a/paddle/math/tests/test_Allocator.cpp
+++ b/paddle/math/tests/test_Allocator.cpp
@@ -120,9 +120,3 @@ TEST(MemoryHandle, Gpu) {
  }
 }
 #endif
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_BaseMatrix.cpp
+++ b/paddle/math/tests/test_BaseMatrix.cpp
@@ -242,10 +242,4 @@ TEST(BaseMatrix, Other) {
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
 #endif
--- a/paddle/math/tests/test_CpuGpuVector.cpp
+++ b/paddle/math/tests/test_CpuGpuVector.cpp
@@ -77,11 +77,4 @@ TEST(CpuGpuVector, subCreate) {
  checkDataEqual(v1Check->getData() + offset, v2Check->getData(), size2);
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
 #endif
--- a/paddle/math/tests/test_ExecViaCpu.cpp
+++ b/paddle/math/tests/test_ExecViaCpu.cpp
@@ -114,9 +114,3 @@ TEST(ExecViaCpu, test1) {
  testWrapper(functor);
 }
 #endif
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_Matrix.cpp
+++ b/paddle/math/tests/test_Matrix.cpp
@@ -291,10 +291,4 @@ TEST(Matrix, multiBinaryCrossEntropy) {
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
 #endif
--- a/paddle/math/tests/test_SIMDFunctions.cpp
+++ b/paddle/math/tests/test_SIMDFunctions.cpp
@@ -169,9 +169,3 @@ TEST(SIMDFunction, decayL1_WithoutLR) {
    ASSERT_NEAR(dest[i], simd_dest[i], EPSILON);
  }
 }
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_SparseMatrix.cpp
+++ b/paddle/math/tests/test_SparseMatrix.cpp
@@ -561,9 +561,3 @@ TEST(Matrix, SparseMatrixCSCFormatTrimFrom) {
  checkSMatrixEqual2(matA, matD);
 #endif
 }
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_Tensor.cu
+++ b/paddle/math/tests/test_Tensor.cu
@@ -1163,11 +1163,3 @@ TEST(Quaternary, CompareOp) {
  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
 #endif
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_TrainingAlgorithm.cpp
+++ b/paddle/math/tests/test_TrainingAlgorithm.cpp
@@ -459,11 +459,3 @@ void testSparseMomentum(size_t size, bool useGpu) {
 }
 TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  hl_start();
-  hl_init(FLAGS_gpu_id);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_batchTranspose.cpp
+++ b/paddle/math/tests/test_batchTranspose.cpp
@@ -53,9 +53,3 @@ TEST(MatrixBatchTransTest, test_batch_matrix_transpose) {
  checkMatrixEqual(cBatchTransMat, cMat_d2h);
 }
 #endif
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_lazyAssign.cu
+++ b/paddle/math/tests/test_lazyAssign.cu
@@ -139,11 +139,3 @@ TEST(sgdUpdate, GPU) {
  testMatrixCase(testSgdUpdate<GpuMatrix>);
 }
 #endif
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  hl_start();
-  hl_init(0);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/math/tests/test_matrixCompare.cpp
+++ b/paddle/math/tests/test_matrixCompare.cpp
@@ -29,148 +29,6 @@ using namespace std;     // NOLINT
 using autotest::TensorCheckEqual;
 using autotest::TensorCheckErr;
-void testMatrixProjectionForward(int contextStart,
-                                 int contextLength,
-                                 bool padding,
-                                 int batchSize,
-                                 int inputDim) {
-  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInput = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInput->randomizeUniform();
-  gpuInput->copyFrom(*cpuInput);
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeight = nullptr;
-  MatrixPtr gpuWeight = nullptr;
-  if (padding) {
-    cpuWeight = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeight = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeight->randomizeUniform();
-    gpuWeight->copyFrom(*cpuWeight);
-  }
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-  MatrixPtr cpuOutput =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutput =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutput->randomizeUniform();
-  gpuOutput->copyFrom(*cpuOutput);
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutput->contextProjectionForward(*cpuInput,
-                                      cpuWeight.get(),
-                                      *cpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-  gpuOutput->contextProjectionForward(*gpuInput,
-                                      gpuWeight.get(),
-                                      *gpuSequence,
-                                      contextLength,
-                                      contextStart,
-                                      beginPad,
-                                      padding);
-  TensorCheckEqual(*cpuOutput, *gpuOutput);
-}
-void testMatrixProjectionBackward(int contextStart,
-                                  int contextLength,
-                                  bool padding,
-                                  int batchSize,
-                                  int inputDim) {
-  MatrixPtr cpuOutputGrad =
-      std::make_shared<CpuMatrix>(batchSize, inputDim * contextLength);
-  MatrixPtr gpuOutputGrad =
-      std::make_shared<GpuMatrix>(batchSize, inputDim * contextLength);
-  cpuOutputGrad->randomizeUniform();
-  gpuOutputGrad->copyFrom(*cpuOutputGrad);
-  IVectorPtr cpuSequence;
-  generateSequenceStartPositions(batchSize, cpuSequence);
-  IVectorPtr gpuSequence = IVector::create(cpuSequence->getSize(), true);
-  gpuSequence->copyFrom(*cpuSequence);
-  MatrixPtr cpuInputGrad = std::make_shared<CpuMatrix>(batchSize, inputDim);
-  MatrixPtr gpuInputGrad = std::make_shared<GpuMatrix>(batchSize, inputDim);
-  cpuInputGrad->randomizeUniform();
-  gpuInputGrad->copyFrom(*cpuInputGrad);
-  int pad = std::max(0, -contextStart) +
-            std::max(0, contextStart + contextLength - 1);
-  if (pad == 0) padding = false;
-  MatrixPtr cpuWeightGrad = nullptr;
-  MatrixPtr gpuWeightGrad = nullptr;
-  if (padding) {
-    cpuWeightGrad = std::make_shared<CpuMatrix>(pad, inputDim);
-    gpuWeightGrad = std::make_shared<GpuMatrix>(pad, inputDim);
-    cpuWeightGrad->randomizeUniform();
-    gpuWeightGrad->copyFrom(*cpuWeightGrad);
-  }
-  // calculate
-  int beginPad = std::max(0, -contextStart);
-  cpuOutputGrad->contextProjectionBackward(cpuInputGrad.get(),
-                                           cpuWeightGrad.get(),
-                                           *cpuSequence,
-                                           contextLength,
-                                           contextStart,
-                                           beginPad,
-                                           padding);
-  gpuOutputGrad->contextProjectionBackwardData(
-      *gpuInputGrad, *gpuSequence, contextLength, contextStart);
-  if (padding) {
-    gpuOutputGrad->contextProjectionBackwardWeight(*gpuWeightGrad,
-                                                   *gpuSequence,
-                                                   contextLength,
-                                                   contextStart,
-                                                   pad,
-                                                   beginPad);
-  }
-  TensorCheckErr(*cpuInputGrad, *gpuInputGrad);
-  if (padding) {
-    TensorCheckErr(*cpuWeightGrad, *gpuWeightGrad);
-  }
-}
-TEST(Matrix, projection) {
-  for (auto contextStart : {-5, -3, -1, 0, 3}) {
-    for (auto contextLength : {1, 2, 5, 7}) {
-      for (auto trainablePadding : {false, true}) {
-        for (auto batchSize : {1, 2, 5, 20, 100}) {
-          for (auto inputDim : {15, 32, 63, 128, 200}) {
-            VLOG(3) << " contextStart=" << contextStart
-                    << " contextLength=" << contextLength
-                    << " trainablePadding=" << trainablePadding
-                    << " batchSize=" << batchSize << " inputDim=" << inputDim;
-            testMatrixProjectionForward(contextStart,
-                                        contextLength,
-                                        trainablePadding,
-                                        batchSize,
-                                        inputDim);
-            testMatrixProjectionBackward(contextStart,
-                                         contextLength,
-                                         trainablePadding,
-                                         batchSize,
-                                         inputDim);
-          }
-        }
-      }
-    }
-  }
-}
 void testMatrixMaxSequence(int batchSize, int inputDim) {
  // forward
  MatrixPtr cpuInput = std::make_shared<CpuMatrix>(batchSize, inputDim);
@@ -1262,10 +1120,4 @@ TEST(Matrix, MaxOutFwdBwd) {
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
 #endif
--- a/paddle/math/tests/test_sparseMatrixCompare.cpp
+++ b/paddle/math/tests/test_sparseMatrixCompare.cpp
@@ -171,11 +171,4 @@ TEST(SMatrix, sMatrixCollectBias) {
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  initMain(argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
 #endif
--- a/paddle/parameter/tests/test_common.cpp
+++ b/paddle/parameter/tests/test_common.cpp
@@ -23,15 +23,6 @@ limitations under the License. */
 using namespace paddle;  // NOLINT
-int main(int argc, char** argv) {
-  paddle::initMain(argc, argv);
-  testing::InitGoogleTest(&argc, argv);
-  int ret = RUN_ALL_TESTS();
-  return ret;
-}
 class CommonTest : public ::testing::Test {
 protected:
  CommonTest() : testStat_("test") {}

--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
+# for paddle test case
+if(WITH_TESTING)
+  add_library(paddle_test_main STATIC TestMain.cpp)
+  add_dependencies(paddle_test_main gen_proto_cpp)
+endif()
--- a/paddle/function/TestMain.cpp
+++ b/paddle/function/TestMain.cpp
--- a/paddle/trainer/RemoteParameterUpdater.h
+++ b/paddle/trainer/RemoteParameterUpdater.h
@@ -56,7 +56,7 @@ class RemoteParameterUpdater : public ParameterUpdater {
 public:
  RemoteParameterUpdater(
      const OptimizationConfig& config,
-      int expectedPpassCount,
+      int expectedPassCount,
      std::unique_ptr<ParameterUpdater>&& localUpdater = nullptr);
  ~RemoteParameterUpdater() {
    if (controllerThread_) {
@@ -146,7 +146,7 @@ protected:
  BatchStatus batchStatus_;
  /// controller thread for sync-sgd
  std::unique_ptr<std::thread> controllerThread_;
-  /// passed alread finished
+  /// passed already finished
  int64_t passCount_;
  /// expected passes to finished
  int64_t expectedPassCount_;

--- a/paddle/utils/ThreadLocal.cpp
+++ b/paddle/utils/ThreadLocal.cpp
@@ -37,7 +37,7 @@ unsigned int* ThreadLocalRand::getSeed() {
      p = new unsigned int(defaultSeed_ - 1);
    } else {
      p = new unsigned int(defaultSeed_ + getTID());
-      LOG(INFO) << "thread use undeterministic rand seed:" << *p;
+      VLOG(3) << "thread use undeterministic rand seed:" << *p;
    }
    seed_.set(p);
  }

--- a/paddle/utils/Util.cpp
+++ b/paddle/utils/Util.cpp
@@ -125,7 +125,7 @@ void registerInitFunction(std::function<void()> func, int priority) {
 void runInitFunctions() {
  std::call_once(g_onceFlag, []() {
-    LOG(INFO) << "Calling runInitFunctions";
+    VLOG(3) << "Calling runInitFunctions";
    if (g_initFuncs) {
      std::sort(g_initFuncs->begin(),
                g_initFuncs->end(),
@@ -139,7 +139,7 @@ void runInitFunctions() {
      g_initFuncs = nullptr;
    }
    g_initialized = true;
-    LOG(INFO) << "Call runInitFunctions done.";
+    VLOG(3) << "Call runInitFunctions done.";
  });
 }
@@ -231,7 +231,7 @@ std::string join(const std::string& part1, const std::string& part2) {
 }  // namespace path
 void copyFileToPath(const std::string& file, const std::string& dir) {
-  LOG(INFO) << "copy " << file << " to " << dir;
+  VLOG(3) << "copy " << file << " to " << dir;
  std::string fileName = path::basename(file);
  std::string dst = path::join(dir, fileName);
  std::ifstream source(file, std::ios_base::binary);

--- a/paddle/utils/tests/test_CustomStackTrace.cpp
+++ b/paddle/utils/tests/test_CustomStackTrace.cpp
@@ -96,9 +96,3 @@ TEST(CustomStackTrace, normalTest) {
    }
  });
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/utils/tests/test_SIMDFlags.cpp
+++ b/paddle/utils/tests/test_SIMDFlags.cpp
@@ -44,8 +44,3 @@ TEST(SIMDFlags, normalPrint) {
  LOG(INFO) << "Has AVX2:    " << std::boolalpha << HAS_AVX2;
  LOG(INFO) << "Has AVX512:  " << std::boolalpha << HAS_AVX512;
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/utils/tests/test_SpinLock.cpp
+++ b/paddle/utils/tests/test_SpinLock.cpp
@@ -53,9 +53,3 @@ TEST(ThreadSpinLock, normalTest) {
        });
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/utils/tests/test_Thread.cpp
+++ b/paddle/utils/tests/test_Thread.cpp
@@ -79,8 +79,3 @@ TEST(AsyncThreadPool, addBatchJobWithResults) {
    ASSERT_EQ(res[i], i);
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
--- a/paddle/utils/tests/test_ThreadBarrier.cpp
+++ b/paddle/utils/tests/test_ThreadBarrier.cpp
@@ -64,9 +64,3 @@ TEST(ThreadBarrier, normalTest) {
                   });
  }
 }
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  paddle::initMain(argc, argv);
-  return RUN_ALL_TESTS();
-}