diff --git a/adversarial/README.md b/adversarial/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..51da21918a9d6e2192a2e03eabef4fde97896bc5
--- /dev/null
+++ b/adversarial/README.md
@@ -0,0 +1,9 @@
+# Advbox
+
+Advbox is a Python toolbox to create adversarial examples that fool neural networks. It requires Python and paddle.
+
+## How to use
+
+1. Train a model and save its parameters (see fluid_mnist.py).
+2. Load the parameters saved in step 1, then reconstruct the model (see mnist_tutorial_fgsm.py).
+3. use advbox to generate the adversarial sample.
diff --git a/adversarial/advbox/__init__.py b/adversarial/advbox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f56f14f18dafdfe1e712cea178a63f09a087b587
--- /dev/null
+++ b/adversarial/advbox/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+   A set of tools for generating adversarial examples on the Paddle platform
+"""
diff --git a/adversarial/advbox/attacks/base.py b/adversarial/advbox/attacks/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..98a65f2fddff999ac6fa98a5733128a63a60f916
--- /dev/null
+++ b/adversarial/advbox/attacks/base.py
@@ -0,0 +1,39 @@
+"""
+The base class of the attacks.
+"""
+from abc import ABCMeta, abstractmethod
+
+
+class Attack(object):
+    """
+    Abstract base class for adversarial attacks. An `Attack` searches for an
+    adversarial example; subclasses should implement the _apply() method.
+
+    Args:
+        model(Model): an instance of the class advbox.models.base.Model.
+
+    """
+    __metaclass__ = ABCMeta
+
+    def __init__(self, model):
+        self.model = model
+
+    def __call__(self, image_label):
+        """
+        Generate the adversarial sample; returns whatever `_apply` returns.
+
+        Args:
+            image_label(list): The image and label tuple list with one element.
+        """
+        adv_img = self._apply(image_label)
+        return adv_img
+
+    @abstractmethod
+    def _apply(self, image_label):
+        """
+        Search for an adversarial example. Must be overridden by subclasses.
+
+        Args:
+            image_label(list): The image and label tuple list with one element.
+        """
+        raise NotImplementedError
diff --git a/adversarial/advbox/attacks/gradientsign.py b/adversarial/advbox/attacks/gradientsign.py
new file mode 100644
index 0000000000000000000000000000000000000000..15b1d176cb11330ac290d73aec1419a3d8f3cc4c
--- /dev/null
+++ b/adversarial/advbox/attacks/gradientsign.py
@@ -0,0 +1,38 @@
+"""
+This module provides the implementation of the FGSM attack.
+"""
+from __future__ import division
+import numpy as np
+from collections import Iterable
+from .base import Attack
+
+
+class GradientSignAttack(Attack):
+    """
+    Fast Gradient Sign Method (FGSM) by Goodfellow et al. (2015), using the
+    infinity norm. Paper link: https://arxiv.org/abs/1412.6572
+    """
+
+    def _apply(self, image_label, epsilons=1000):
+        """
+        Return the first clipped image whose predicted label differs from the
+        clean prediction, or None. `epsilons` is a step count or an iterable.
+        """
+        assert len(image_label) == 1
+        pre_label = np.argmax(self.model.predict(image_label))
+
+        min_, max_ = self.model.bounds()
+        gradient = self.model.gradient(image_label)
+        gradient_sign = np.sign(gradient) * (max_ - min_)
+        image = image_label[0][0].reshape(gradient_sign.shape)
+        if not isinstance(epsilons, Iterable):
+            # Skip epsilon == 0: it only re-evaluates the unperturbed image.
+            epsilons = np.linspace(0, 1, num=epsilons + 1)[1:]
+        for epsilon in epsilons:
+            adv_img = np.clip(image + epsilon * gradient_sign, min_, max_)
+            if pre_label != np.argmax(self.model.predict([(adv_img, 0)])):
+                return adv_img
+        return None
+
+
+FGSM = GradientSignAttack
diff --git a/adversarial/advbox/models/__init__.py b/adversarial/advbox/models/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..eee0f6efd4774b42fcd082eb06d1398d2ee51bc4
--- /dev/null
+++ b/adversarial/advbox/models/__init__.py
@@ -0,0 +1,16 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Paddle model for target of attack 
+"""
diff --git a/adversarial/advbox/models/base.py b/adversarial/advbox/models/base.py
new file mode 100644
index 0000000000000000000000000000000000000000..74e1045def7648b4a8df30e89312d73c0d4fe7e1
--- /dev/null
+++ b/adversarial/advbox/models/base.py
@@ -0,0 +1,90 @@
+"""
+The base class of the models.
+"""
+from abc import ABCMeta
+import abc
+
+abstractmethod = abc.abstractmethod
+
+
+class Model(object):
+    """
+    Base class of the models that attacks are run against.
+    Subclasses must implement predict(), num_classes() and gradient().
+
+    Args:
+        bounds(tuple): The lower and upper bound for the image pixel.
+        channel_axis(int): The index of the axis that represents the color channel.
+        preprocess(tuple): Two element tuple used to preprocess the input. First
+            subtract the first element, then divide by the second element.
+    """
+    __metaclass__ = ABCMeta
+
+    def __init__(self, bounds, channel_axis, preprocess=None):
+        assert len(bounds) == 2
+        assert channel_axis in [0, 1, 2, 3]
+
+        if preprocess is None:
+            preprocess = (0, 1)
+        self._bounds = bounds
+        self._channel_axis = channel_axis
+        self._preprocess = preprocess
+
+    def bounds(self):
+        """
+        Return the bounds of the model as passed to the constructor.
+        """
+        return self._bounds
+
+    def channel_axis(self):
+        """
+        Return the channel axis of the model.
+        """
+        return self._channel_axis
+
+    def _process_input(self, input_):
+        """Return (input_ - sub) / div without modifying `input_` itself."""
+        sub, div = self._preprocess
+        assert div != 0
+        res = input_ - sub if sub != 0 else input_
+        if div != 1:
+            # Not `/=`: in-place division would write into the caller's array.
+            res = res / div
+        return res
+
+    @abstractmethod
+    def predict(self, image_batch):
+        """
+        Calculate the prediction of the image batch.
+
+        Args:
+            image_batch(numpy.ndarray): image batch of shape (batch_size, height, width, channels).
+
+        Return:
+            numpy.ndarray: predictions of the images with shape (batch_size, num_of_classes).
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def num_classes(self):
+        """
+        Determine the number of the classes
+
+        Return:
+            int: the number of the classes
+        """
+        raise NotImplementedError
+
+    @abstractmethod
+    def gradient(self, image_batch):
+        """
+        Calculate the gradient of the cross-entropy loss w.r.t the image.
+
+        Args:
+            image_batch(list): The image and label tuple list.
+
+        Return:
+            numpy.ndarray: gradient of the cross-entropy loss w.r.t the image with
+                the shape (height, width, channel).
+        """
+        raise NotImplementedError
diff --git a/adversarial/advbox/models/paddle.py b/adversarial/advbox/models/paddle.py
new file mode 100644
index 0000000000000000000000000000000000000000..33b2a3d5c6973470fb25c98872cd53b3ff11bab4
--- /dev/null
+++ b/adversarial/advbox/models/paddle.py
@@ -0,0 +1,101 @@
+from __future__ import absolute_import
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+from paddle.v2.fluid.framework import program_guard
+
+from .base import Model
+
+
+class PaddleModel(Model):
+    """
+    Model wrapper around a fluid Program, used to generate adversarial samples.
+    The remaining arguments (bounds, channel_axis, preprocess) go to Model.__init__.
+
+    Args:
+        program(paddle.v2.fluid.framework.Program): The program of the model which generates the adversarial sample.
+        input_name(string): The name of the input variable.
+        logits_name(string): Name of the second fed variable (fed next to the input).
+        predict_name(string): The name of the prediction variable to fetch.
+        cost_name(string): The name of the loss in the program.
+    """
+
+    def __init__(self,
+                 program,
+                 input_name,
+                 logits_name,
+                 predict_name,
+                 cost_name,
+                 bounds,
+                 channel_axis=3,
+                 preprocess=None):
+        super(PaddleModel, self).__init__(
+            bounds=bounds, channel_axis=channel_axis, preprocess=preprocess)
+
+        if preprocess is None:
+            preprocess = (0, 1)
+
+        self._program = program
+        self._place = fluid.CPUPlace()
+        self._exe = fluid.Executor(self._place)
+
+        self._input_name = input_name
+        self._logits_name = logits_name
+        self._predict_name = predict_name
+        self._cost_name = cost_name
+
+        # append backward ops for the loss; keep the entry looked up by input name
+        loss = self._program.block(0).var(self._cost_name)
+        param_grads = fluid.backward.append_backward(
+            loss, parameter_list=[self._input_name])
+        self._gradient = dict(param_grads)[self._input_name]
+
+    def predict(self, image_batch):
+        """
+            Run the program on image_batch and fetch the prediction variable.
+
+            Args:
+                image_batch(list): The image and label tuple list.
+            Return:
+                The Executor.run result for the one-element fetch list, not unpacked.
+        """
+        feeder = fluid.DataFeeder(
+            feed_list=[self._input_name, self._logits_name],
+            place=self._place,
+            program=self._program)
+        predict_var = self._program.block(0).var(self._predict_name)
+        predict = self._exe.run(self._program,
+                                feed=feeder.feed(image_batch),
+                                fetch_list=[predict_var])
+        return predict
+
+    def num_classes(self):
+        """
+            Read the number of classes from the prediction variable's shape.
+
+        Return:
+            int: the number of classes
+        """
+        predict_var = self._program.block(0).var(self._predict_name)
+        assert len(predict_var.shape) == 2
+        return predict_var.shape[1]
+
+    def gradient(self, image_batch):
+        """
+        Calculate the gradient of the loss w.r.t the input.
+
+        Args:
+            image_batch(list): The image and label tuple list.
+        Return:
+            The single fetched gradient value, unpacked from the run result.
+        """
+        feeder = fluid.DataFeeder(
+            feed_list=[self._input_name, self._logits_name],
+            place=self._place,
+            program=self._program)
+
+        grad, = self._exe.run(self._program,
+                              feed=feeder.feed(image_batch),
+                              fetch_list=[self._gradient])
+        return grad
diff --git a/adversarial/fluid_mnist.py b/adversarial/fluid_mnist.py
new file mode 100644
index 0000000000000000000000000000000000000000..db4d4b51868ffa8be13d4d57a40e1def7e25d1a8
--- /dev/null
+++ b/adversarial/fluid_mnist.py
@@ -0,0 +1,86 @@
+"""
+CNN on mnist data using fluid api of paddlepaddle
+"""
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def mnist_cnn_model(img):
+    """
+    Two conv-pool stages followed by a 10-way softmax layer for MNIST.
+
+    Args:
+        img(Variable): the input image to be recognized
+
+    Returns:
+        Variable: output of the final fc layer (softmax-activated)
+    """
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        num_filters=20,
+        filter_size=5,
+        pool_size=2,
+        pool_stride=2,
+        act='relu')
+
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        num_filters=50,
+        filter_size=5,
+        pool_size=2,
+        pool_stride=2,
+        act='relu')
+
+    logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    return logits
+
+
+def main():
+    """
+    Train the CNN on MNIST, then save its parameters to './mnist'.
+    """
+    img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32')
+    label = fluid.layers.data(name='label', shape=[1], dtype='int64')
+    logits = mnist_cnn_model(img)
+    cost = fluid.layers.cross_entropy(input=logits, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    optimizer = fluid.optimizer.Adam(learning_rate=0.01)
+    optimizer.minimize(avg_cost)
+
+    accuracy = fluid.evaluator.Accuracy(input=logits, label=label)
+
+    BATCH_SIZE = 50
+    PASS_NUM = 3
+    ACC_THRESHOLD = 0.98
+    LOSS_THRESHOLD = 10.0
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+    feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
+    exe.run(fluid.default_startup_program())
+
+    for pass_id in range(PASS_NUM):
+        accuracy.reset(exe)
+        for data in train_reader():
+            loss, acc = exe.run(fluid.default_main_program(),
+                                feed=feeder.feed(data),
+                                fetch_list=[avg_cost] + accuracy.metrics)
+            pass_acc = accuracy.eval(exe)
+            print("pass_id=" + str(pass_id) + " acc=" + str(acc) + " pass_acc="
+                  + str(pass_acc))
+            if loss < LOSS_THRESHOLD and pass_acc > ACC_THRESHOLD:
+                break
+
+        pass_acc = accuracy.eval(exe)
+        print("pass_id=" + str(pass_id) + " pass_acc=" + str(pass_acc))
+    fluid.io.save_params(
+        exe, dirname='./mnist', main_program=fluid.default_main_program())
+    print('train mnist done')
+
+if __name__ == '__main__':
+    main()
diff --git a/adversarial/mnist_tutorial_fgsm.py b/adversarial/mnist_tutorial_fgsm.py
new file mode 100644
index 0000000000000000000000000000000000000000..8b29346b8cd7f643771640afc4f783f7544cd071
--- /dev/null
+++ b/adversarial/mnist_tutorial_fgsm.py
@@ -0,0 +1,87 @@
+"""
+FGSM demos on mnist using advbox tool.
+"""
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import matplotlib.pyplot as plt
+import numpy as np
+
+from advbox.models.paddle import PaddleModel
+from advbox.attacks.gradientsign import GradientSignAttack
+
+
+def cnn_model(img):
+    """
+    Two conv-pool stages followed by a 10-way softmax layer for MNIST.
+    Args:
+        img(Variable): the input image to be recognized
+    Returns:
+        Variable: output of the final fc layer (softmax-activated)
+    """
+    # Same layers as mnist_cnn_model in fluid_mnist.py (presumably so its saved parameters load - confirm)
+    conv_pool_1 = fluid.nets.simple_img_conv_pool(
+        input=img,
+        num_filters=20,
+        filter_size=5,
+        pool_size=2,
+        pool_stride=2,
+        act='relu')
+
+    conv_pool_2 = fluid.nets.simple_img_conv_pool(
+        input=conv_pool_1,
+        num_filters=50,
+        filter_size=5,
+        pool_size=2,
+        pool_stride=2,
+        act='relu')
+
+    logits = fluid.layers.fc(input=conv_pool_2, size=10, act='softmax')
+    return logits
+
+
+def main():
+    """
+    Advbox demo: rebuild the MNIST model, load './mnist/' and show one FGSM result.
+    """
+    IMG_NAME = 'img'
+    LABEL_NAME = 'label'
+
+    img = fluid.layers.data(name=IMG_NAME, shape=[1, 28, 28], dtype='float32')
+    # let the gradient flow back to the input image
+    img.stop_gradient = False
+    label = fluid.layers.data(name=LABEL_NAME, shape=[1], dtype='int64')
+    logits = cnn_model(img)
+    cost = fluid.layers.cross_entropy(input=logits, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    BATCH_SIZE = 1
+    train_reader = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.mnist.train(), buf_size=500),
+        batch_size=BATCH_SIZE)
+    feeder = fluid.DataFeeder(
+        feed_list=[IMG_NAME, LABEL_NAME],
+        place=place,
+        program=fluid.default_main_program())
+
+    fluid.io.load_params(
+        exe, "./mnist/", main_program=fluid.default_main_program())
+
+    # advbox demo: LABEL_NAME fills the logits_name slot, which PaddleModel feeds
+    m = PaddleModel(fluid.default_main_program(), IMG_NAME, LABEL_NAME,
+                    logits.name, avg_cost.name, (-1, 1))
+    att = GradientSignAttack(m)
+    for data in train_reader():
+        # fgsm attack; the result is None when no epsilon flips the label
+        adv_img = att(data)
+        if adv_img is not None:
+            plt.imshow(adv_img[0][0], cmap='Greys_r')
+            plt.show()
+        break
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmark/cluster/README.md b/benchmark/cluster/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..b619613ea7a5b6e940ec735314e8e47338b2c600
--- /dev/null
+++ b/benchmark/cluster/README.md
@@ -0,0 +1,78 @@
+# Cluster Training Benchmark
+
+## Setup
+
+- Platform
+  - Kubernetes: v1.6.2
+  - Linux Kernel: v3.10.0
+
+- Resource
+  - CPU: 10 Cores per Pod
+  - Memory: 5GB per Pod
+
+- Docker Image
+
+  We use different base Docker Image to run the benchmark on Kubernetes:
+  - PaddlePaddle v2: paddlepaddle/paddle:0.11.0
+  - PaddlePaddle Fluid: paddlepaddle/paddle:[commit-id]
+  - TensorFlow: tensorflow/tensorflow:1.5.0-rc0
+
+- Model
+  vgg16 is used in this benchmark.
+
+## Cases
+
+- Variable
+  - Batch Size of training data.
+  - PServer count of the training job.
+  - The number of trainers.
+
+- Invariant
+  - The resource of trainer/pserver Pod.
+
+### Measure the Performance for Different Batch Size
+
+- PServer Count: 40
+- Trainer Count: 100
+- Metrics: mini-batch / sec
+
+| Batch Size | 32 | 64 | 128 | 256 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure the Performance for Different PServer Count
+
+- Trainer Count: 100
+- Batch Size: 64
+- Metrics: mini-batch / sec
+
+| PServer Count | 10 | 20 | 40 | 60 |
+| -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - |
+| TensorFlow | - | - | - | - |
+
+### Measure Parallel Efficiency By Increasing Trainer Count
+
+- PServer Count: 20
+- Batch Size: 64
+- Metrics:
+
+$S = \frac{T_1}{T_N}$
+
+where $S$ is the ratio of $T_1$ to $T_N$, the training times with 1 and N trainers respectively.
+The parallel efficiency is:
+
+$E = \frac{S}{N}$
+
+| Trainer Count | 1 | 10 | 20 | 30 | 40 | 50 | 60 | 70 | 80 | 90 | 100 |
+| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- |
+| PaddlePaddle Fluid | - | - | - | - | - | - | - | - | - | - | - |
+| PaddlePaddle v2 | - | - | - | - | - | - | - | - | - | - | - |
+| TensorFlow | - | - | - | - | - | - | - | - | - | - | - |
+
+## Reproduce the benchmark
+
+TODO
diff --git a/doc/api/v2/fluid.rst b/doc/api/v2/fluid.rst
index 43fc19dc492bbc119f2356034b81c65e443db2fa..5f15cad2b530dfb3702357b3c26885ac2a7b7beb 100644
--- a/doc/api/v2/fluid.rst
+++ b/doc/api/v2/fluid.rst
@@ -15,4 +15,4 @@ Fluid
     fluid/param_attr.rst
     fluid/profiler.rst
     fluid/regularizer.rst
-
+    fluid/io.rst
diff --git a/doc/api/v2/fluid/io.rst b/doc/api/v2/fluid/io.rst
new file mode 100644
index 0000000000000000000000000000000000000000..67f68c4e9e16b379207b8de114cdf769e056f78e
--- /dev/null
+++ b/doc/api/v2/fluid/io.rst
@@ -0,0 +1,10 @@
+===========
+IO
+===========
+
+
+
+is_parameter
+------------
+..  autofunction:: paddle.v2.fluid.io.is_parameter
+    :noindex:
diff --git a/doc/api/v2/fluid/layers.rst b/doc/api/v2/fluid/layers.rst
index a7c8670f66cc7f319e41155211ead2d89126117f..24bdf08fffd176a799fd12680f4651bb4bd0c9a9 100644
--- a/doc/api/v2/fluid/layers.rst
+++ b/doc/api/v2/fluid/layers.rst
@@ -38,6 +38,16 @@ elementwise_add
 ..  autofunction:: paddle.v2.fluid.layers.elementwise_add
     :noindex:
 
+elementwise_sub
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_sub
+    :noindex:
+
+elementwise_mul
+---------------
+..  autofunction:: paddle.v2.fluid.layers.elementwise_mul
+    :noindex:
+
 elementwise_div
 ---------------
 ..  autofunction:: paddle.v2.fluid.layers.elementwise_div
@@ -348,3 +358,132 @@ reduce_min
 ..  autofunction:: paddle.v2.fluid.layers.reduce_min
     :noindex:
 
+logsigmoid
+----------
+..  autofunction:: paddle.v2.fluid.layers.logsigmoid
+    :noindex:
+
+exp
+---
+..  autofunction:: paddle.v2.fluid.layers.exp
+    :noindex:
+
+relu
+----
+..  autofunction:: paddle.v2.fluid.layers.relu
+    :noindex:
+
+tanh
+----
+..  autofunction:: paddle.v2.fluid.layers.tanh
+    :noindex:
+
+tanh_shrink
+-----------
+..  autofunction:: paddle.v2.fluid.layers.tanh_shrink
+    :noindex:
+
+softshrink
+----------
+..  autofunction:: paddle.v2.fluid.layers.softshrink
+    :noindex:
+
+sqrt
+----
+..  autofunction:: paddle.v2.fluid.layers.sqrt
+    :noindex:
+
+abs
+----
+..  autofunction:: paddle.v2.fluid.layers.abs
+    :noindex:
+
+ceil
+----
+..  autofunction:: paddle.v2.fluid.layers.ceil
+    :noindex:
+
+floor
+-----
+..  autofunction:: paddle.v2.fluid.layers.floor
+    :noindex:
+
+round
+-----
+..  autofunction:: paddle.v2.fluid.layers.round
+    :noindex:
+
+reciprocal
+----------
+..  autofunction:: paddle.v2.fluid.layers.reciprocal
+    :noindex:
+
+log
+---
+..  autofunction:: paddle.v2.fluid.layers.log
+    :noindex:
+
+square
+------
+..  autofunction:: paddle.v2.fluid.layers.square
+    :noindex:
+
+softplus
+--------
+..  autofunction:: paddle.v2.fluid.layers.softplus
+    :noindex:
+
+softsign
+---------
+..  autofunction:: paddle.v2.fluid.layers.softsign
+    :noindex:
+
+brelu
+-----
+..  autofunction:: paddle.v2.fluid.layers.brelu
+    :noindex:
+
+leaky_relu
+----------
+..  autofunction:: paddle.v2.fluid.layers.leaky_relu
+    :noindex:
+
+soft_relu
+---------
+..  autofunction:: paddle.v2.fluid.layers.soft_relu
+    :noindex:
+
+elu
+----
+..  autofunction:: paddle.v2.fluid.layers.elu
+    :noindex:
+
+relu6
+-----
+..  autofunction:: paddle.v2.fluid.layers.relu6
+    :noindex:
+
+pow
+----
+..  autofunction:: paddle.v2.fluid.layers.pow
+    :noindex:
+
+hard_shrink
+-----------
+..  autofunction:: paddle.v2.fluid.layers.hard_shrink
+    :noindex:
+
+thresholded_relu
+----------------
+..  autofunction:: paddle.v2.fluid.layers.thresholded_relu
+    :noindex:
+
+hard_sigmoid
+-------------
+..  autofunction:: paddle.v2.fluid.layers.hard_sigmoid
+    :noindex:
+
+swish
+------
+..  autofunction:: paddle.v2.fluid.layers.swish
+    :noindex:
diff --git a/doc/design/block.md b/doc/design/block.md
index fab7f2dc481ae51aa982164dc5048d90fcdc2b0b..907a2def557fd472ac4d679c73447bd9107d1190 100644
--- a/doc/design/block.md
+++ b/doc/design/block.md
@@ -202,8 +202,8 @@ This `OpDesc` value is in the `ops` field of the `BlockDesc` value representing
 
 During the generation of the Protobuf message, the Block should store VarDesc (the Protobuf message which describes Variable) and OpDesc (the Protobuf message which describes Operator).
 
-VarDesc in a block should have its name scope to avoid local variables affect parent block's name scope.
-Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that stored in parent block. For example:
+VarDesc in a block should have its name scope to avoid local variables affecting parent block's name scope.
+Child block's name scopes should inherit the parent's so that OpDesc in child block can reference a VarDesc that is stored in the parent block. For example:
 
 ```python
 a = pd.Variable(shape=[20, 20])
diff --git a/doc/design/memory_optimization.md b/doc/design/memory_optimization.md
index 00f514711a46bfd5af3bae51e0d9225ecc4c8998..1f68cef4cc28cd005acbeaa5c03cc0d84a83939c 100644
--- a/doc/design/memory_optimization.md
+++ b/doc/design/memory_optimization.md
@@ -5,28 +5,28 @@
 
 In a lecture from Andrew Ng, he attributes the recent sucess of AI due to a combination of these:
 
-- availability of Big Data
-- supercomputing power to process this Big Data over very large neural networks
-- modern algorithms
+- Availability of Big Data
+- Supercomputing power to process this Big Data over very large neural networks
+- Modern algorithms
 
 Following graph shows the details:
 
 ![](images/deep_learning.png)
 
-Larger model usually brings better performance. However, GPU memory is certain limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large model, we have to take care of memory using. Besides, memory optimization is also necessary in both online/mobile inference. 
+Larger models usually bring better performance. However, GPU memory is limited. For example, the memory size of a GTX TITAN X is only 12GB. To train complex and large models, we have to take care of memory usage. Besides, memory optimization is also necessary in both online/mobile inference. 
 
 ## Solution
 
 ### Basic Strategy
 
-There are some basic strategies to make memory optimization, including in-place operation and memory sharing.
+There are some basic strategies to improve memory usage, including in-place operations and memory sharing.
 
 #### In-place Operation
 In a relu activation operatorï¼š 
 
 $y = \max(x, 0)$
 
-If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x are the same. In-place operation will save 50% memory occupancy immediately.
+If the variable x is not used in any other operator, we can make an in-place operation. In other words, the memory block of variable y and variable x will be the same. In-place operations will save 50% memory occupancy immediately.
 
 #### Memory Sharing
 
@@ -40,18 +40,18 @@ d = op2(a)
 e = op3(d, f)
 ```
 
-In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finished, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
+In this case, variable a is no longer used, and op2 does not support in-place operation. After op2 finishes, we can put the memory of variable a to a memory pool. Then, variable e can share the memory of variable a from the pool.
 
 
 ### Live Variable Analysis
 
-It's not enough to only have some basic strategies. The prerequisite of memory optimization is to know if a variable is still "live" after an operation.
+It's not enough to only have some basic strategies. The pre-requisite of memory optimization is to know if a variable is still "live" after an operation.
 
 In our design, the neural network topology is defined as a program. Luckily, [live variable analysis](https://en.wikipedia.org/wiki/Live_variable_analysis) is a classic problem in compilers which can be used in many stages, such as register allocation. 
 
-In compilers, the front end of the compilers translates programs into an intermediate language with an unbounded number of temporaries. This program must run on a machine with a bounded number of registers. Two temporaries a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporaries can fit in few registers; if they don't all fit, the excess temporaries can be kept in memory.
+In compilers, the front end of the compiler translates programs into an intermediate language with an unbounded number of temporary variables. This program must run on a machine with a bounded number of registers. Two temporary variables a and b can fit into the same register, if a and b are never "in use" at the same time. Thus, many temporary variables can fit in few registers; if they don't all fit, the excess temporary variables can be kept in memory.
 
-Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporaries are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. 
+Therefore, the compiler needs to analyze the intermediate-representation program to determine which temporary variables are in use at the same time. We say a variable is "live" if it holds a value that may be needed in the future, so this analysis is called liveness analysis. 
 
 We can leran these techniques from compilers. There are mainly two stages to make live variable analysis:
 
@@ -60,7 +60,7 @@ We can leran these techniques from compilers. There are mainly two stages to mak
 
 
 #### Control Flow Graph
-To preform analyses on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statemment x can be followed by statement y, there is an egde from x to y.
+To perform analysis on a program, it is often useful to make a control flow graph. A [control flow graph](https://en.wikipedia.org/wiki/Control_flow_graph) (CFG) in computer science is a representation, using graph notation, of all paths that might be traversed through a program during its execution. Each statement in the program is a node in the flow graph; if statement x can be followed by statement y, there is an edge from x to y.
 
 Following is the flow graph for a simple loop.
 
@@ -68,18 +68,18 @@ Following is the flow graph for a simple loop.
 
 #### Dataflow Analysis
 
-liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
+Liveness of variable "flows" around the edges of the control flow graph; determining the live range of each variable is an example of a dataflow problem. [Dataflow analysis](https://en.wikipedia.org/wiki/Data-flow_analysis) is a technique for gathering information about the possible set of values calculated at various points in a computer program.
 
 A simple way to perform data-flow analysis of programs is to set up dataflow equations for each node of the control flow graph and solve them by repeatedly calculating the output from the input locally at each node until the whole system stabilizes.
 
 - Flow Graph Terminology
 
-A flow graph node has out-edges that lead to sucessor nodes, and in-edges that come from presucessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of sucessors.
+A flow graph node has out-edges that lead to successor nodes, and in-edges that come from predecessor nodes. The set *pred[n]* is all the predecessors of node n, and *succ[n]* is the set of successors.
 In former control flow graph, the out-edges of node 5 are 5 --> 6 and 5 --> 2, and *succ[5]* = {2, 6}. The in-edges of 2 are 5 --> 2 and 1 --> 2, and *pred[2]* = {1, 5}.
 
 - Uses and Defs
 
-An assignmemt to a variable or temporary defines that variable. An occurence of a variable on the right-hand side of an assginment(or in other expressions) uses the variable. We can speak the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and the similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
+An assignment to a variable or temporary defines that variable. An occurrence of a variable on the right-hand side of an assignment (or in other expressions) uses the variable. We can define the *def* of a variable as the set of graph nodes that define it; or the *def* of a graph node as the set of variables that it defines; and similarly for the *use* of a variable or graph node. In former control flow graph, *def(3)* = {c}, *use(3)* = {b, c}.
 
 - Liveness
 
@@ -168,9 +168,9 @@ class ControlFlowGraph(object):
         return self._program
 ```
 
-#### make dataflow analysis
+#### Make dataflow analysis
 
-We follow guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. 
+We follow the guide from compilers and try to solve the dataflow equation to get liveness of every variable. If the live-in of an operator node is different from the live-out, then we can make memory sharing. 
 
 For example:
 
diff --git a/doc/design/operator_kernel_type.md b/doc/design/operator_kernel_type.md
index aa82e96bf79319f1a57e2ad58aa9826e57be6470..f86e6b7a564ed23f2bddbec25da1c110014f941d 100644
--- a/doc/design/operator_kernel_type.md
+++ b/doc/design/operator_kernel_type.md
@@ -1,6 +1,6 @@
 # Design Doc: The Keys of Operator Kernel Type
 ## Problem
-An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique Kernel. Before an operator runs, an certain kernel must be chosen by a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
+An operator can have different kernel implementations, and each operator will have a map to store the related kernels. Fluid uses `OpKernelType` as a key to identify a unique kernel. Before an operator runs, a certain type of kernel must be chosen via a key of `OpKernelType`. Currently, `OpKernelType` is defined as follows:
 
 ```cpp
 struct OpKernelType {
@@ -10,13 +10,13 @@ struct OpKernelType {
 ```
 For more details, please refer to [codes](https://github.com/PaddlePaddle/Paddle/blob/2d5ec16bc8a09fb8e0f62c89b116b0cd1d333907/paddle/framework/operator.h#L348-L374) in github.
 
-It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys are not enough. We need a more complete representation of `OpKernelType`. 
+It contains two keys, `Place` and `DataType`. And these two keys will be hashed to a unique key to represent a certain type of kernel. However, these two keys do not provide enough information. We need a more complete representation of `OpKernelType`.
 
-We often implement a kernel of an operator with some computing library in certain device(place). Please remind that computing library and device are not one-to-one corresponding. A device can have a lot of computing libraries and a computing library can also support several devices. 
+We often implement a kernel of an operator with some computing library on a certain device (place). Please note that computing libraries and devices do not have a one-to-one correspondence. A device can have a lot of computing libraries and a computing library can also support different devices.
 
-For example, Eigen library can support Nvidia GPU/AMD GPU/CPU. And MKLDNN library can support Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
+For example, Eigen library supports Nvidia GPU/AMD GPU/CPU and MKLDNN library supports Intel CPU/Intel FPGA. Both `Place` and `Library` should be a key of `OpKernelType`.
 
-It's obvious that different DataTypes, like fp64/fp32/int8 will have different kernels. But the data layout of a Tensor will also lead to different implementation. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209). Data Layout should also be taken into consideration.
+Different DataTypes, such as fp64/fp32/int8, will obviously have different kernels. But different data layouts of a Tensor will also lead to different implementations. Please refer to the batch norm operator [kernels](https://github.com/PaddlePaddle/Paddle/blob/a948fac4d0ad7e0412d373b8aabeb711c2899563/paddle/operators/batch_norm_op.cc#L180-L209) as an example. Data layout should also be taken into consideration.
 
 ## Solution
 
@@ -31,17 +31,17 @@ struct OpKernelType {
 };
 ```
 
-Following is the details:
+The details are as follows:
 
 ### Place
 
-`Place` is defined as follows:
+`Place` is defined as:
 
 ```cpp
 typedef boost::variant<CUDAPlace, ROCmPlace, FPGAPlace, CPUPlace> Place;
 ```
 
-`Place` is to represent the device memory where data is locating.
+`Place` represents the device memory where data is located.
 
 
 ### Library
@@ -52,10 +52,10 @@ One operator kernel is usually implemented based on one library. `Library` is de
 enum Library { Plain, MKLDNN, CUDNN };
 ```
 
-We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
-A library usually has a corresponding `DeviceContext` which contains some handles needed by computation. Fluid now have two default DeviceContexts in CPU and CUDA, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains a Eigen library handle and `CDUADeviceContext` contains a Eigen library handle and cuBLAS handle.
+We use `Plain` enumerator to represent default library. Since most operators in Fluid are implemented based on the `Eigen` library, we take `Eigen` library as the `Plain` enumerator.
+A library usually has a corresponding `DeviceContext` which contains some handles needed for computation. Fluid now has two default DeviceContexts for CPU and CUDA, namely, `CPUDeviceContext` and `CUDADeviceContext`. `CPUDeviceContext` contains an Eigen library handle and `CUDADeviceContext` contains an Eigen library handle and a cuBLAS handle.
 
-If we want to support new Library, a new enumerator need to be added to `Library` and a new corresponding `LibraryDeviceContext` will be created.
+If we want to support a new library, a new enumerator needs to be added to `Library` and a corresponding new `LibraryDeviceContext` needs to be created.
 
 
 ### DataType
@@ -67,15 +67,15 @@ If we want to support new Library, a new enumerator need to be added to `Library
 
 Actually, a Tensor is a view of a block of memory. Besides a pointer to the memory, we also have to get some other descriptions of this block of memory, such as shape(ddim), stride, and layout.
 
-Different layout leads to different implementation of operator kernel. There are mainly 4 principles we have to follow to support layout in our fluid framework.
+Different layout leads to different implementation of the operator kernel. There are mainly 4 principles we have to follow to support layout in our Fluid framework.
 
-- We take layout as a data member of Tensor. Layout is actually a enum variable. If fluid is built with MKLDNN, then, the memory format in MKLDNN will be added into this enum variable too.
+- We take layout as a data member of Tensor. Layout is actually an enum variable. If Fluid is built with MKLDNN, then the memory format in MKLDNN will also be added into this enum variable.
 
-- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout of generating data. Of course, we can have some default layout, like NCHW.
+- Users have to set layout for input data. And some operators like fill_constant/random, also have to set layout for generating data. Of course, we can have some default layout, like NCHW.
 
-- The inference of Layout is at run-time, not compile-time.
+- The inference of Layout is at run-time, not at compile-time.
 
-- Every operator have to implement different kernels for different layouts. Let's take MKLDNN as an example, if we want to implement a MKLDNN convolution operator, we have to realize all the kernels for different layout, list at [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to do registering kernels for MKLDNN operators.
+- Every operator has to implement different kernels for different layouts. Let's take MKLDNN as an example. If we want to implement an MKLDNN convolution operator, we have to implement all the kernels for different layouts, which are listed [here](http://01org.github.io/mkl-dnn/structmkldnn_1_1memory.html). And we will have a special macro to register kernels for MKLDNN operators.
 
 `Layout` is also defined as a enum variable:
 
diff --git a/doc/design/python_api.md b/doc/design/python_api.md
index cb5fdc765b7126fc66a1c8978d4b96c0dc5a9f2c..73f6d7b90c7dca0d48109cf3d28d5f7cd56b5c0b 100644
--- a/doc/design/python_api.md
+++ b/doc/design/python_api.md
@@ -279,6 +279,26 @@ class LayerHelper(object):
     return tmp
 ```
 
+### Return value of layer functions
+
+A layer function returns a Variable, which is also the output of an operator. However, the output of a layer function carries more attributes than the output of an operator: the layer's parameter variables and their gradient variables also need to be returned. Returning them is useful. For example,
+
+1. Users can debug the network by printing parameter gradients.
+2. Users can append attributes to a parameter. For example, setting `param.stop_gradient=True` will make a parameter stop generating its gradient. We can fix the parameter value during training by using this attribute.
+
+However, it is good to return a Variable for layers, since all layers and operators use Variables as their parameters. We can just append a `param` field and a `grad` field to the return value of a layer function, since Python is dynamically typed.
+
+The sample usage is
+
+```python
+data = fluid.layers.data(...)
+hidden = fluid.layers.fc(data, ...)
+...
+
+executor.run(fetch_list=[hidden.param, hidden.param.grad], ...)
+```
+
+
 ## Optimizer
 
 [Optimizer Design Doc](./optimizer.md)
diff --git a/doc/design/var_desc.md b/doc/design/var_desc.md
index 0b2958c1b10ef6a6ce51aa75f61e15a7f2d94b3f..89fa95326c5c4909137544c6b5fd574e1281abe2 100644
--- a/doc/design/var_desc.md
+++ b/doc/design/var_desc.md
@@ -1,12 +1,12 @@
 ## Background
-PaddlePaddle divides the description of neural network computation graph into two stages: compile time and runtime.
+PaddlePaddle divides the description of neural network computation into two stages: compile time and runtime. At compile time, the neural network computation is described as a `ProgramDesc` whereas at runtime an `Executor` interprets the `ProgramDesc` to compute the operations.
 
-PaddlePaddle use proto message to describe compile time graph because
+PaddlePaddle uses a proto message to describe the compile time program because
 
-1. Computation graph should be able to be saved to a file.
-1. In distributed training, the graph will be serialized and send to multiple workers.
+1. The computation program description must be serializable and saved in a file.
+1. During distributed training, the serialized program will be sent to multiple workers. It should also be possible to break the program into different components, each of which can be executed on different workers.
 
-The computation graph is constructed by Data Node and Operation Node. The concept to represent them is in the table below.
+The computation `Program` consists of nested `Blocks`. Each `Block` will consist of data (i.e. `Variable`) and `Operations`. The concept to represent them is in the table below.
 
 | |compile time|runtime|
 |---|---|---|
diff --git a/doc/getstarted/build_and_install/build_from_source_cn.rst b/doc/getstarted/build_and_install/build_from_source_cn.rst
index 41ac07ca5674d2c121baba77c58226ad328cd681..71904dc41ed0d946867d890cc585e1b88450ca8c 100644
--- a/doc/getstarted/build_and_install/build_from_source_cn.rst
+++ b/doc/getstarted/build_and_install/build_from_source_cn.rst
@@ -32,6 +32,16 @@ PaddlePaddleä¸»è¦ä½¿ç”¨ `CMake <https://cmake.org>`_ ä»¥åŠGCC, G++ä½œä¸ºç¼–è¯‘
 
    pip install build/python/dist/*.whl
 
+å¦‚æžœæœºå™¨ä¸å·²ç»å®‰è£…è¿‡PaddlePaddleï¼Œæœ‰ä¸¤ç§æ–¹æ³•ï¼š
+
+.. code-block:: bash
+
+   # 1. 先卸载之前的版本，再重新安装
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. 直接升级到更新的版本
+   pip install build/python/dist/*.whl -U
 
 .. _run_test:
 
diff --git a/doc/getstarted/build_and_install/build_from_source_en.rst b/doc/getstarted/build_and_install/build_from_source_en.rst
index 92211aee8c3bc0ae6e1a38311d40ddf92117cac7..27f73b2e2c029b41d514e1612912ed1c335605b6 100644
--- a/doc/getstarted/build_and_install/build_from_source_en.rst
+++ b/doc/getstarted/build_and_install/build_from_source_en.rst
@@ -36,6 +36,16 @@ machine or copy it to the target machine.
 
    pip install build/python/dist/*.whl
 
+If the machine has installed PaddlePaddle before, there are two methods:
+
+.. code-block:: bash
+
+   # 1. uninstall and reinstall
+   pip uninstall paddlepaddle
+   pip install build/python/dist/*.whl
+
+   # 2. upgrade directly
+   pip install build/python/dist/*.whl -U
 
 .. _run_test:
 
diff --git a/doc/howto/index_cn.rst b/doc/howto/index_cn.rst
index ccd909770253bb85dbc8a5a2560594076c2f68b0..e0c69f7a6a4043abe999af6c8dd2555178b68424 100644
--- a/doc/howto/index_cn.rst
+++ b/doc/howto/index_cn.rst
@@ -9,6 +9,7 @@
 
   usage/cmd_parameter/index_cn.rst
   usage/cluster/cluster_train_cn.md
+  usage/capi/index_cn.rst
 
 开发标准
 --------
diff --git a/doc/howto/read_source.md b/doc/howto/read_source.md
index e4211abb3be9cace80bc14dbe3db3e0a31221dd0..31987920f32f217ac2db42548874cfe7da57dd72 100644
--- a/doc/howto/read_source.md
+++ b/doc/howto/read_source.md
@@ -26,16 +26,16 @@ sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
 sgd_optimizer.minimize(avg_cost)
 ```
 
-- Variables: `x`,  `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py#L93)
-- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/layers.py)
+- Variables: `x`, `y`, `y_predict`, `cost` and `avg_cost`. [Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/framework.py)
+- Layers: `fluid.layers.data`, `fluid.layers.fc` and `fluid.layers.mean` are layers. [Python](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/v2/fluid/layers)
   - Every Layer has one or more operators and variables/parameters
     - All the operators are defined at [`paddle/operators/`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/operators). Other worth-looking files:
       - Base class: [`paddle/framework/operator.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/operator.h)
       - Operator Registration: [`paddle/framework/op_registry.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_registry.h) 
       - Operator Lookup: [`paddle/framework/op_info.h`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/op_info.h)
 - Optimizer: `fluid.optimizer.SGD`. It does the following
-  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py), [C++](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/framework/backward.cc)]
-  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py), [C++](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/optimizer)]
+  - Add backward operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/backward.py)]
+  - Add optimizer operators. [[Python](https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/v2/fluid/optimizer.py)]
 
 # Run Time
 
diff --git a/doc/howto/usage/capi/compile_paddle_lib_cn.md b/doc/howto/usage/capi/compile_paddle_lib_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..ac5ecffe2ea8ddc3703a32e9a0a8ee83bbe5dd14
--- /dev/null
+++ b/doc/howto/usage/capi/compile_paddle_lib_cn.md
@@ -0,0 +1,122 @@
+## 编译 PaddlePaddle 预测库
+
+### 概述
+
+使用 C-API 进行预测依赖于将 PaddlePaddle 核心代码编译成链接库，只需在编译时配置下面这些编译选项：
+
+必须配置选项：
+- `WITH_C_API`，必须配置为`ON`。
+
+推荐配置选项：
+- `WITH_PYTHON`，推荐配置为`OFF`
+- `WITH_SWIG_PY`，推荐配置为`OFF`
+- `WITH_GOLANG`，推荐设置为`OFF`
+
+可选配置选项：
+- `WITH_GPU`，可配置为`ON/OFF`
+- `WITH_MKL`，可配置为`ON/OFF`
+
+å¯¹æŽ¨èé…ç½®ä¸çš„é€‰é¡¹å»ºè®®æŒ‰ç…§è®¾ç½®ï¼Œä»¥é¿å…é“¾æŽ¥ä¸å¿…è¦çš„åº“ã€‚å…¶å®ƒå¯é€‰ç¼–è¯‘é€‰é¡¹æŒ‰éœ€è¿›è¡Œè®¾å®šã€‚
+
+ä¸‹é¢çš„ä»£ç ç‰‡æ®µä»Žgithubæ‹‰å–æœ€æ–°ä»£ç ï¼Œé…åˆ¶ç¼–è¯‘é€‰é¡¹ï¼ˆéœ€è¦å°†PADDLE_ROOTæ›¿æ¢ä¸ºPaddlePaddleé¢„æµ‹åº“çš„å®‰è£…è·¯å¾„ï¼‰ï¼š
+
+```shell
+PADDLE_ROOT=/path/of/capi
+git clone https://github.com/PaddlePaddle/Paddle.git
+cd Paddle
+mkdir build
+cd build
+cmake -DCMAKE_INSTALL_PREFIX=$PADDLE_ROOT \
+      -DCMAKE_BUILD_TYPE=Release \
+      -DWITH_C_API=ON \
+      -DWITH_SWIG_PY=OFF \
+      -DWITH_GOLANG=OFF \
+      -DWITH_PYTHON=OFF \
+      -DWITH_MKL=OFF \
+      -DWITH_GPU=OFF  \
+      ..
+```
+
+æ‰§è¡Œä¸Šè¿°ä»£ç ç”ŸæˆMakefileæ–‡ä»¶åŽï¼Œæ‰§è¡Œï¼š`make && make install`ã€‚æˆåŠŸç¼–è¯‘åŽï¼Œä½¿ç”¨C-APIæ‰€éœ€çš„ä¾èµ–ï¼ˆåŒ…æ‹¬ï¼šï¼ˆ1ï¼‰ç¼–è¯‘å‡ºçš„PaddlePaddleé¢„æµ‹åº“å’Œå¤´æ–‡ä»¶ï¼›ï¼ˆ2ï¼‰ç¬¬ä¸‰æ–¹é“¾æŽ¥åº“å’Œå¤´æ–‡ä»¶ï¼‰å‡ä¼šå˜æ”¾äºŽ`PADDLE_ROOT`ç›®å½•ä¸ã€‚
+
+ç¼–è¯‘æˆåŠŸåŽåœ¨ `PADDLE_ROOT` ä¸‹ä¼šçœ‹åˆ°å¦‚ä¸‹ç›®å½•ç»“æž„ï¼ˆåŒ…æ‹¬äº†ç¼–è¯‘å‡ºçš„PaddlePaddleå¤´æ–‡ä»¶å’Œé“¾æŽ¥åº“ï¼Œä»¥åŠç¬¬ä¸‰æ–¹ä¾èµ–é“¾æŽ¥åº“å’Œå¤´æ–‡ä»¶ï¼ˆå¦‚æžœéœ€è¦ï¼Œç”±é“¾æŽ¥æ–¹å¼å†³å®šï¼‰ï¼‰ï¼š
+
+```text
+â”œâ”€â”€ include
+â”‚Â Â  â””â”€â”€ paddle
+â”‚Â Â      â”œâ”€â”€ arguments.h
+â”‚Â Â      â”œâ”€â”€ capi.h
+â”‚Â Â      â”œâ”€â”€ capi_private.h
+â”‚Â Â      â”œâ”€â”€ config.h
+â”‚Â Â      â”œâ”€â”€ error.h
+â”‚Â Â      â”œâ”€â”€ gradient_machine.h
+â”‚Â Â      â”œâ”€â”€ main.h
+â”‚Â Â      â”œâ”€â”€ matrix.h
+â”‚Â Â      â”œâ”€â”€ paddle_capi.map
+â”‚Â Â      â””â”€â”€ vector.h
+â”œâ”€â”€ lib
+â”‚Â Â  â”œâ”€â”€ libpaddle_capi_engine.a
+â”‚Â Â  â”œâ”€â”€ libpaddle_capi_layers.a
+â”‚Â Â  â”œâ”€â”€ libpaddle_capi_shared.so
+â”‚Â Â  â””â”€â”€ libpaddle_capi_whole.a
+â””â”€â”€ third_party
+    â”œâ”€â”€ gflags
+    â”‚Â Â  â”œâ”€â”€ include
+    â”‚Â Â  â”‚Â Â  â””â”€â”€ gflags
+    â”‚Â Â  â”‚Â Â      â”œâ”€â”€ gflags_completions.h
+    â”‚Â Â  â”‚Â Â      â”œâ”€â”€ gflags_declare.h
+    â”‚Â Â  â”‚Â Â      ...
+    â”‚Â Â  â””â”€â”€ lib
+    â”‚Â Â      â””â”€â”€ libgflags.a
+    â”œâ”€â”€ glog
+    â”‚Â Â  â”œâ”€â”€ include
+    â”‚Â Â  â”‚Â Â  â””â”€â”€ glog
+    â”‚Â Â  â”‚Â Â      â”œâ”€â”€ config.h
+    â”‚Â Â  â”‚Â Â      ...
+    â”‚Â Â  â””â”€â”€ lib
+    â”‚Â Â      â””â”€â”€ libglog.a
+    â”œâ”€â”€ openblas
+    â”‚Â Â  â”œâ”€â”€ include
+    â”‚Â Â  â”‚Â Â  â”œâ”€â”€ cblas.h
+    â”‚Â Â  â”‚Â Â  ...
+    â”‚Â Â  â””â”€â”€ lib
+    â”‚Â Â      ...
+    â”œâ”€â”€ protobuf
+    â”‚Â Â  â”œâ”€â”€ include
+    â”‚Â Â  â”‚Â Â  â””â”€â”€ google
+    â”‚Â Â  â”‚Â Â      â””â”€â”€ protobuf
+    â”‚Â Â  â”‚Â Â          ...
+    â”‚Â Â  â””â”€â”€ lib
+    â”‚Â Â      â””â”€â”€ libprotobuf-lite.a
+    â””â”€â”€ zlib
+        â”œâ”€â”€ include
+        â”‚Â Â  ...
+        â””â”€â”€ lib
+            ...
+
+```
+
+### 链接说明
+
+目前提供三种链接方式：
+
+1. é“¾æŽ¥`libpaddle_capi_shared.so` åŠ¨æ€åº“
+    - ä½¿ç”¨ PaddlePaddle C-API å¼€å‘é¢„æµ‹ç¨‹åºé“¾æŽ¥`libpaddle_capi_shared.so`æ—¶ï¼Œéœ€æ³¨æ„ï¼š
+        1. å¦‚æžœç¼–è¯‘æ—¶æŒ‡å®šç¼–è¯‘CPUç‰ˆæœ¬ï¼Œä¸”ä½¿ç”¨`OpenBLAS`æ•°å¦åº“ï¼Œåœ¨ä½¿ç”¨C-APIå¼€å‘é¢„æµ‹ç¨‹åºæ—¶ï¼Œåªéœ€è¦é“¾æŽ¥`libpaddle_capi_shared.so`è¿™ä¸€ä¸ªåº“ã€‚
+        1. å¦‚æžœæ˜¯ç”¨ç¼–è¯‘æ—¶æŒ‡å®šCPUç‰ˆæœ¬ï¼Œä¸”ä½¿ç”¨`MKL`æ•°å¦åº“ï¼Œç”±äºŽ`MKL`åº“æœ‰è‡ªå·±ç‹¬ç«‹çš„åŠ¨æ€åº“æ–‡ä»¶ï¼Œåœ¨ä½¿ç”¨PaddlePaddle C-APIå¼€å‘é¢„æµ‹ç¨‹åºæ—¶ï¼Œéœ€è¦è‡ªå·±é“¾æŽ¥MKLé“¾æŽ¥åº“ã€‚
+        1. å¦‚æžœç¼–è¯‘æ—¶æŒ‡å®šç¼–è¯‘GPUç‰ˆæœ¬ï¼ŒCUDAç›¸å…³åº“ä¼šåœ¨é¢„æµ‹ç¨‹åºè¿è¡Œæ—¶åŠ¨æ€è£…è½½ï¼Œéœ€è¦å°†CUDAç›¸å…³çš„åº“è®¾ç½®åˆ°`LD_LIBRARY_PATH`çŽ¯å¢ƒå˜é‡ä¸ã€‚
+    - è¿™ç§æ–¹å¼æœ€ä¸ºç®€ä¾¿ï¼Œé“¾æŽ¥ç›¸å¯¹å®¹æ˜“ï¼Œ**åœ¨æ— ç‰¹æ®Šéœ€æ±‚æƒ…å†µä¸‹ï¼ŒæŽ¨èä½¿ç”¨æ¤æ–¹å¼**ã€‚
+
+2. é“¾æŽ¥é™æ€åº“ `libpaddle_capi_whole.a`
+    - ä½¿ç”¨PaddlePaddle C-API å¼€å‘é¢„æµ‹ç¨‹åºé“¾æŽ¥`libpaddle_capi_whole.a`æ—¶ï¼Œéœ€æ³¨æ„ï¼š
+        1. éœ€è¦æŒ‡å®š`-Wl,--whole-archive`é“¾æŽ¥é€‰é¡¹ã€‚
+        1. éœ€è¦æ˜¾å¼åœ°é“¾æŽ¥ `gflags`ã€`glog`ã€`libz`ã€`protobuf` ç‰ç¬¬ä¸‰æ–¹åº“ï¼Œå¯åœ¨`PADDLE_ROOT/third_party`ä¸‹æ‰¾åˆ°ã€‚
+        1. å¦‚æžœåœ¨ç¼–è¯‘ C-API æ—¶ä½¿ç”¨OpenBLASæ•°å¦åº“ï¼Œéœ€è¦æ˜¾ç¤ºåœ°é“¾æŽ¥`libopenblas.a`ã€‚
+        1. å¦‚æžœåœ¨ç¼–è¯‘ C-API æ˜¯ä½¿ç”¨MKLæ•°å¦åº“ï¼Œéœ€è¦æ˜¾ç¤ºåœ°é“¾æŽ¥MKLçš„åŠ¨æ€åº“ã€‚
+
+3. é“¾æŽ¥é™æ€åº“ `libpaddle_capi_layers.a`å’Œ`libpaddle_capi_engine.a`
+    - ä½¿ç”¨PaddlePaddle C-API å¼€å‘é¢„æµ‹ç¨‹åºé“¾æŽ¥`libpaddle_capi_whole.a`æ—¶ï¼Œéœ€æ³¨æ„ï¼š
+        1. è¿™ç§é“¾æŽ¥æ–¹å¼ä¸»è¦ç”¨äºŽç§»åŠ¨ç«¯é¢„æµ‹ã€‚
+        1. ä¸ºäº†å‡å°‘ç”Ÿæˆé“¾æŽ¥åº“çš„å¤§å°æŠŠ`libpaddle_capi_whole.a`æ‹†æˆä»¥ä¸Šä¸¤ä¸ªé™æ€é“¾æŽ¥åº“ã€‚
+        1. éœ€æŒ‡å®š`-Wl,--whole-archive -lpaddle_capi_layers`Â å’Œ `-Wl,--no-whole-archive -lpaddle_capi_engine` è¿›è¡Œé“¾æŽ¥ã€‚
+        1. ç¬¬ä¸‰æ–¹ä¾èµ–åº“éœ€è¦æŒ‰ç…§ä¸Žæ–¹å¼2åŒæ ·æ–¹æ³•æ˜¾ç¤ºåœ°è¿›è¡Œé“¾æŽ¥ã€‚
diff --git a/doc/howto/usage/capi/images/csr.png b/doc/howto/usage/capi/images/csr.png
new file mode 100644
index 0000000000000000000000000000000000000000..3dc10b8de4f6d3f517624956b1694b689405a031
Binary files /dev/null and b/doc/howto/usage/capi/images/csr.png differ
diff --git a/doc/howto/usage/capi/images/sequence_data.png b/doc/howto/usage/capi/images/sequence_data.png
new file mode 100644
index 0000000000000000000000000000000000000000..6e47a46b8955dfe977e85898fe3c9f33ed28de7e
Binary files /dev/null and b/doc/howto/usage/capi/images/sequence_data.png differ
diff --git a/doc/howto/usage/capi/images/workflow_of_CAPI.png b/doc/howto/usage/capi/images/workflow_of_CAPI.png
new file mode 100644
index 0000000000000000000000000000000000000000..a4399ade048b3fe10d2d9c714bc34333ca068edb
Binary files /dev/null and b/doc/howto/usage/capi/images/workflow_of_CAPI.png differ
diff --git a/doc/howto/usage/capi/index_cn.rst b/doc/howto/usage/capi/index_cn.rst
new file mode 100644
index 0000000000000000000000000000000000000000..fd774fbc742671c5a8009cb742f2c9d06a525199
--- /dev/null
+++ b/doc/howto/usage/capi/index_cn.rst
@@ -0,0 +1,9 @@
+PaddlePaddle C-API
+==================
+
+..  toctree::
+  :maxdepth: 1
+
+  compile_paddle_lib_cn.md
+  organization_of_the_inputs_cn.md
+  workflow_of_capi_cn.md
diff --git a/doc/howto/usage/capi/organization_of_the_inputs_cn.md b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..563ec5ca21ec5d75800fa201943d65e6d6fe51ea
--- /dev/null
+++ b/doc/howto/usage/capi/organization_of_the_inputs_cn.md
@@ -0,0 +1,285 @@
+## 输入/输出数据组织
+
+è¿™ç¯‡æ–‡æ¡£ä»‹ç»åœ¨ä½¿ç”¨ PaddlePaddle C-API æ—¶å¦‚ä½•ç»„ç»‡è¾“å…¥æ•°æ®ï¼Œä»¥åŠå¦‚ä½•è§£æžç¥žç»ç½‘ç»œå‰å‘è®¡ç®—çš„è¾“å‡ºç»“æžœã€‚
+
+### 输入/输出数据类型
+åœ¨C-APIä¸ï¼ŒæŒ‰ç…§åŸºæœ¬æ•°æ®ç±»åž‹åœ¨PaddlePaddleå†…éƒ¨çš„å®šä¹‰å’Œå®žçŽ°ï¼Œè¾“å…¥æ•°æ®å¯åˆ†ä¸ºï¼š
+1. ä¸€ç»´æ•´åž‹æ•°ç»„
+1. äºŒç»´æµ®ç‚¹åž‹çŸ©é˜µ
+    - ç¨ å¯†çŸ©é˜µ
+    - ç¨€ç–çŸ©é˜µ
+
+è¯´æ˜Žï¼š
+1. ä¸€ç»´æ•°ç»„**ä»…æ”¯æŒæ•´åž‹å€¼**ï¼›
+    - å¸¸ç”¨äºŽè‡ªç„¶è¯è¨€å¤„ç†ä»»åŠ¡ï¼Œä¾‹å¦‚ï¼šè¡¨ç¤ºè¯è¯åœ¨è¯å…¸ä¸çš„åºå·ï¼›
+    - åˆ†ç±»ä»»åŠ¡ä¸ç±»åˆ«æ ‡ç¾ï¼›
+1. é€»è¾‘ä¸Šé«˜äºŽäºŒç»´çš„æ•°æ®ï¼ˆä¾‹å¦‚å«æœ‰å¤šä¸ªé€šé“çš„å›¾ç‰‡ï¼Œè§†é¢‘ç‰ï¼‰åœ¨ç¨‹åºå®žçŽ°ä¸éƒ½ä¼šè½¬åŒ–ä¸ºäºŒç»´çŸ©é˜µï¼Œè½¬åŒ–æ–¹æ³•åœ¨ç›¸åº”çš„é¢†åŸŸéƒ½æœ‰é€šç”¨è§£å†³æ–¹æ¡ˆï¼Œéœ€è¦ä½¿ç”¨è€…è‡ªå·±äº†è§£å¹¶å®Œæˆè½¬åŒ–ï¼›
+1. äºŒç»´çŸ©é˜µå¯ä»¥è¡¨ç¤ºè¡Œå‘é‡å’Œåˆ—å‘é‡ï¼Œä»»ä½•æ—¶å€™å¦‚æžœéœ€è¦æµ®ç‚¹åž‹æ•°ç»„ï¼ˆå‘é‡ï¼‰ï¼Œéƒ½åº”ä½¿ç”¨C-APIä¸çš„çŸ©é˜µæ¥è¡¨ç¤ºï¼Œè€Œä¸æ˜¯C-APIä¸çš„ä¸€ç»´æ•°ç»„ã€‚
+1. ä¸è®ºæ˜¯ä¸€ç»´æ•´åž‹æ•°ç»„è¿˜æ˜¯äºŒç»´æµ®ç‚¹æ•°çŸ©é˜µï¼Œ**ä¸ºå®ƒä»¬é™„åŠ ä¸Šåºåˆ—ä¿¡æ¯å°†å˜æˆåºåˆ—è¾“å…¥ã€‚PaddlePaddle ä¼šé€šè¿‡åˆ¤æ•°æ®æ˜¯å¦é™„å¸¦æœ‰åºåˆ—ä¿¡æ¯æ¥åˆ¤æ–ä¸€ä¸ªå‘é‡/çŸ©é˜µæ˜¯å¦æ˜¯ä¸€ä¸ªåºåˆ—**ã€‚å½“éžåºåˆ—è¾“å…¥æ—¶ï¼Œæ— éœ€å…³å¿ƒå’Œå¤„ç†åºåˆ—ä¿¡æ¯ã€‚å…³äºŽä»€ä¹ˆæ˜¯â€œåºåˆ—ä¿¡æ¯â€ï¼Œä¸‹æ–‡ä¼šè¯¦ç»†è¿›è¡Œä»‹ç»ã€‚
+
+### 基本使用概念
+
+- åœ¨PaddlePaddleå†…éƒ¨ï¼Œç¥žç»ç½‘ç»œä¸ä¸€ä¸ªè®¡ç®—å±‚çš„è¾“å…¥/è¾“å‡ºè¢«ç»„ç»‡ä¸ºä¸€ä¸ª `Argument` ç»“æž„ä½“ï¼Œå¦‚æžœç¥žç»ç½‘ç»œæœ‰å¤šä¸ªè¾“å…¥æˆ–è€…å¤šä¸ªè¾“å…¥ï¼Œæ¯ä¸€ä¸ªè¾“å…¥/è¾“å…¥éƒ½ä¼šå¯¹åº”æœ‰è‡ªå·±çš„`Argument`ã€‚
+- `Argument` å¹¶ä¸çœŸæ£â€œå˜å‚¨â€æ•°æ®ï¼Œè€Œæ˜¯å°†è¾“å…¥/è¾“å‡ºä¿¡æ¯æœ‰æœºåœ°ç»„ç»‡åœ¨ä¸€èµ·ã€‚
+- åœ¨`Argument`å†…éƒ¨ç”±`IVector`ï¼ˆå¯¹åº”ç€ä¸Šæ–‡æåˆ°çš„ä¸€ç»´æ•´åž‹æ•°ç»„ï¼‰å’Œ`Matrix`ï¼ˆå¯¹åº”ç€ä¸Šæ–‡æåˆ°çš„äºŒç»´æµ®ç‚¹åž‹çŸ©é˜µï¼‰æ¥å®žé™…å˜å‚¨æ•°æ®ï¼›ç”± `Sequence Start Positions` (ä¸‹æ–‡è¯¦ç»†è§£é‡Š) æ¥æè¿°è¾“å…¥/è¾“å‡ºçš„åºåˆ—ä¿¡æ¯ã€‚
+
+- **æ³¨**ï¼š
+    1. è¿™ç¯‡æ–‡æ¡£ä¹‹åŽéƒ¨åˆ†å°†ä¼šç»Ÿä¸€ä½¿ç”¨`argument`æ¥ç‰¹æŒ‡PaddlePaddleä¸ç¥žç»ç½‘ç»œè®¡ç®—å±‚ä¸€ä¸ªè¾“å…¥/è¾“å‡ºæ•°æ®ã€‚
+    1. ä½¿ç”¨`paddle_ivector`æ¥ç‰¹æŒ‡PaddlePaddleä¸çš„ä¸€ç»´æ•´åž‹æ•°ç»„ã€‚
+    1. ä½¿ç”¨`paddle_matrix`æ¥ç‰¹æŒ‡PaddlePaddleä¸çš„äºŒç»´æµ®ç‚¹åž‹çŸ©é˜µã€‚
+
+### 组织输入数据
+- ä¸€ç»´æ•´åž‹æ•°ç»„
+
+    æ¦‚å¿µä¸Šå¯ä»¥å°†`paddle_ivector`ç†è§£ä¸ºä¸€ä¸ªä¸€ç»´çš„æ•´åž‹æ•°ç»„ï¼Œé€šå¸¸ç”¨äºŽè¡¨ç¤ºç¦»æ•£çš„ç±»åˆ«æ ‡ç¾ï¼Œæˆ–æ˜¯åœ¨è‡ªç„¶è¯è¨€å¤„ç†ä»»åŠ¡ä¸è¡¨ç¤ºè¯è¯åœ¨å—å…¸ä¸çš„åºå·ã€‚ä¸‹é¢çš„ä»£ç ç‰‡æ®µåˆ›å»ºäº†å«æœ‰ä¸‰ä¸ªå…ƒç´ `1`ã€`2`ã€`3`çš„`paddle_ivector`ã€‚
+    ```c
+    int ids[] = {1, 2, 3};
+     paddle_ivector ids_array =
+         paddle_ivector_create(ids, sizeof(ids) / sizeof(int), false, false);
+     CHECK(paddle_arguments_set_ids(in_args, 0, ids_array));
+    ```
+
+- **ç¨ å¯†çŸ©é˜µ**
+    - ä¸€ä¸ª`mÃ—n`çš„ç¨ å¯†çŸ©é˜µæ˜¯ä¸€ä¸ªç”±`m`è¡Œ`n`åˆ—å…ƒç´ æŽ’åˆ—æˆçš„çŸ©å½¢é˜µåˆ—ï¼ŒçŸ©é˜µé‡Œçš„å…ƒç´ æ˜¯æµ®ç‚¹æ•°ã€‚å¯¹ç¥žç»ç½‘ç»œæ¥è¯´ï¼ŒçŸ©é˜µçš„é«˜åº¦`m`æ˜¯ä¸€æ¬¡é¢„æµ‹æŽ¥å—çš„æ ·æœ¬æ•°ç›®ï¼Œå®½åº¦$n$æ˜¯ç¥žç»ç½‘ç»œå®šä¹‰æ—¶ï¼Œ`paddle.layer.data`çš„`size`ã€‚
+    - ä¸‹é¢çš„ä»£ç ç‰‡æ®µåˆ›å»ºäº†ä¸€ä¸ªé«˜åº¦ä¸º1ï¼Œå®½åº¦ä¸º`layer_size`çš„ç¨ å¯†çŸ©é˜µï¼ŒçŸ©é˜µä¸æ¯ä¸ªå…ƒç´ çš„å€¼éšæœºç”Ÿæˆã€‚
+
+    ```c
+    paddle_matrix mat = paddle_matrix_create(
+                            /* height = batch size */ 1,
+                            /* width = dimensionality of the data layer */ layer_size,
+                            /* whether to use GPU */ false);
+
+    paddle_real* array;
+    // Get the pointer pointing to the start address of the first row of the
+    // created matrix.
+    CHECK(paddle_matrix_get_row(mat, 0, &array));
+
+    // Fill the matrix with a randomly generated test sample.
+    srand(time(0));
+    for (int i = 0; i < layer_size; ++i) {
+      array[i] = rand() / ((float)RAND_MAX);
+    }
+
+    // Assign the matrix to the argument.
+    CHECK(paddle_arguments_set_value(in_args, 0, mat));
+    ```
+
+- **ç¨€ç–çŸ©é˜µ**
+
+  PaddlePaddle C-API ä¸ ç¨€ç–çŸ©é˜µä½¿ç”¨[CSRï¼ˆCompressed Sparse Row Formatï¼‰](https://en.wikipedia.org/wiki/Sparse_matrix#Compressed_sparse_row_(CSR,_CRS_or_Yale_format))æ ¼å¼å˜å‚¨ã€‚ä¸‹å›¾æ˜¯CSRå˜å‚¨ç¨€ç–çŸ©é˜µçš„ç¤ºæ„å›¾ã€‚
+  <p align="center">
+  <img src="https://user-images.githubusercontent.com/5842774/34159369-009fd328-e504-11e7-9e08-36bc6dc5e505.png" width=700><br> å›¾1. ç¨€ç–çŸ©é˜µå˜å‚¨ç¤ºæ„å›¾
+  </p>
+
+  CSRå˜å‚¨æ ¼å¼é€šè¿‡ï¼šï¼ˆ1ï¼‰éžé›¶å…ƒç´ çš„å€¼ï¼ˆä¸Šå›¾ä¸çš„`values`ï¼‰ï¼›ï¼ˆ2ï¼‰è¡Œåç§»(ä¸Šå›¾ä¸çš„`row offsets`)ï¼šæ¯ä¸€è¡Œå…ƒç´ åœ¨`values`ä¸çš„èµ·å§‹åç§»ï¼Œ`row offsets`ä¸å…ƒç´ ä¸ªæ•°æ€»æ˜¯ç‰äºŽè¡Œæ•° + 1ï¼›ï¼ˆ3ï¼‰éžé›¶å…ƒç´ çš„åˆ—å·ï¼ˆä¸Šå›¾ä¸çš„`column indices`ï¼‰æ¥ç¡®å®šç¨€ç–çŸ©é˜µçš„å†…å®¹ã€‚
+
+  åœ¨PaddlePaddle C-APIä¸ï¼Œé€šè¿‡è°ƒç”¨ä»¥ä¸‹æŽ¥å£åˆ›å»ºç¨€ç–çŸ©é˜µï¼š
+
+  ```c
+  PD_API paddle_matrix paddle_matrix_create_sparse(
+      uint64_t height, uint64_t width, uint64_t nnz, bool isBinary, bool useGpu);
+  ```
+
+  1. åˆ›å»ºç¨€ç–çŸ©é˜µæ—¶éœ€è¦æ˜¾ç¤ºåœ°æŒ‡å®šçŸ©é˜µçš„ï¼ˆ1ï¼‰é«˜åº¦ï¼ˆ`height`ï¼Œåœ¨ç¥žç»ç½‘ç»œä¸ç‰äºŽä¸€æ¬¡é¢„æµ‹å¤„ç†çš„æ ·æœ¬æ•°ï¼‰ï¼ˆ2ï¼‰å®½åº¦ï¼ˆ`width`ï¼Œ`paddle.layer.data`çš„`size`ï¼‰ä»¥åŠï¼ˆ3ï¼‰éžé›¶å…ƒä¸ªæ•°ï¼ˆ`nnz`ï¼‰ã€‚
+  1. 当上述接口第4个参数`isBinary`指定为`true`时，**只需要设置行偏移（`row_offset`）和列号(`column indices`)，不需要提供元素值（`values`）**，这时行偏移和列号指定的元素默认其值为1。
+
+  ä¸‹é¢çš„ä»£ç ç‰‡æ®µåˆ›å»ºäº†ä¸€ä¸ªCPUä¸Šçš„äºŒå€¼ç¨€ç–çŸ©é˜µï¼š
+
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, true, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                 rowOffset,
+                                 sizeof(rowOffset) / sizeof(int),
+                                 colIndices,
+                                 sizeof(colIndices) / sizeof(int),
+                                 NULL /*values array is NULL.*/,
+                                 0 /*size of the value array is 0.*/));
+  CHECK(paddle_arguments_set_value(in_args, 0, mat));
+  ```
+  ä¸‹é¢çš„ä»£ç ç‰‡æ®µåœ¨åˆ›å»ºäº†ä¸€ä¸ªCPUä¸Šçš„å¸¦å…ƒç´ å€¼çš„ç¨€ç–çŸ©é˜µï¼š
+  ```c
+  paddle_matrix mat = paddle_matrix_create_sparse(1, layer_size, nnz, false, false);
+  int colIndices[] = {9, 93, 109};  // layer_size here is greater than 109.
+  int rowOffset[] = {0, sizeof(colIndices) / sizeof(int)};
+  float values[] = {0.5, 0.5, 0.5};
+
+  CHECK(paddle_matrix_sparse_copy_from(mat,
+                                 rowOffset,
+                                 sizeof(rowOffset) / sizeof(int),
+                                 colIndices,
+                                 sizeof(colIndices) / sizeof(int),
+                                 values,
+                                 sizeof(values) / sizeof(float)));
+  ```
+  注意事项：
+  1. 移动端预测**不支持**稀疏矩阵及相关的接口。
+
+### 组织序列信息
+
+å¤šä¸ªæŽ’æˆä¸€åˆ—çš„å…ƒç´ ï¼ˆå¯ä»¥æ˜¯æ•´åž‹ã€æµ®ç‚¹æ•°ã€æµ®ç‚¹æ•°å‘é‡ç‰ï¼‰æž„æˆä¸€ä¸ªåºåˆ—ï¼Œå…ƒç´ ä¹‹é—´çš„é¡ºåºæ˜¯åºåˆ—æ‰€æºå¸¦çš„é‡è¦ä¿¡æ¯ã€‚ä¸åŒåºåˆ—å¯èƒ½ä¼šå«æœ‰ä¸åŒæ•°ç›®ä¸ªå…ƒç´ ã€‚åœ¨ PaddlePaddle ä¸ï¼Œåºåˆ—è¾“å…¥/è¾“å‡ºæ•°æ®æ˜¯åœ¨ä¸Šæ–‡ä»‹ç»çš„**æ•°æ®è¾“å…¥ï¼ˆä¸€ç»´æ•´åž‹æ•°ç»„ï¼ŒäºŒç»´æµ®ç‚¹æ•°çŸ©é˜µï¼‰åŸºç¡€ä¸Šï¼Œé™„åŠ ä¸Šåºåˆ—ä¿¡æ¯**ã€‚ä¸‹é¢è¯¦ç»†è§£é‡Šä»€ä¹ˆæ˜¯â€œåºåˆ—ä¿¡æ¯â€ã€‚
+
+æˆ‘ä»¬å°†ç¥žç»ç½‘ç»œä¸€æ¬¡è®¡ç®—æŽ¥å—çš„æ‰€æœ‰è¾“å…¥æ ·æœ¬ç§°ä¹‹ä¸ºä¸€ä¸ª`batch`ï¼ˆå¯ä»¥å«æœ‰ä¸€æ¡æˆ–å¤šæ¡æ ·æœ¬ï¼‰ï¼Œæ¯ä¸€ä¸ªåºåˆ—åœ¨æ•´ä¸ª`batch`ä¸çš„åç§»ï¼Œå°±æ˜¯PaddlePaddleä¸æ‰€æŒ‡çš„**åºåˆ—ä¿¡æ¯**ï¼Œç§°ä¹‹ä¸ºâ€œsequence start positionsâ€ã€‚PaddlePaddle æ”¯æŒä¸¤ç§åºåˆ—ç±»åž‹ï¼š
+
+1. å•å±‚åºåˆ—
+    - åºåˆ—ä¸çš„æ¯ä¸€ä¸ªå…ƒç´ æ˜¯éžåºåˆ—ï¼Œæ˜¯è¿›è¡Œè®¡ç®—çš„åŸºæœ¬å•ä½ï¼Œä¸å¯å†è¿›è¡Œæ‹†åˆ†ã€‚
+    - ä¾‹å¦‚ï¼šè‡ªç„¶è¯è¨€ä¸çš„å¥åæ˜¯ä¸€ä¸ªåºåˆ—ï¼Œåºåˆ—ä¸çš„å…ƒç´ æ˜¯è¯è¯ï¼›
+1. åŒå±‚åºåˆ—
+    - åºåˆ—ä¸çš„æ¯ä¸€ä¸ªå…ƒç´ åˆæ˜¯ä¸€ä¸ªåºåˆ—ã€‚
+    - ä¾‹å¦‚ï¼šè‡ªç„¶è¯è¨€ä¸çš„æ®µè½æ˜¯ä¸€ä¸ªåŒå±‚åºåˆ—ï¼›æ®µè½æ˜¯ç”±å¥åæž„æˆçš„åºåˆ—ï¼›å¥åæ˜¯ç”±è¯è¯æž„æˆçš„åºåˆ—ã€‚
+    - åŒå±‚åºåˆ—åœ¨å¤„ç†é•¿åºåˆ—çš„ä»»åŠ¡æˆ–æ˜¯æž„å»ºå±‚çº§æ¨¡åž‹æ—¶ä¼šå‘æŒ¥ä½œç”¨ã€‚
+
+è¿™ç¯‡æ–‡æ¡£ä¹‹åŽéƒ¨åˆ†ä¼šç»Ÿä¸€ä½¿ç”¨`sequence_start_positions`æ¥ç‰¹æŒ‡ï¼šPaddlePaddleä¸ç¥žç»ç½‘ç»œè®¡ç®—å±‚è¾“å…¥/è¾“å‡ºæ‰€æºå¸¦çš„åºåˆ—ä¿¡æ¯ã€‚
+
+å¯¹åŒå±‚åºåˆ—æ¥è®²ï¼Œä¸ä»…è¦æä¾›æ¯ä¸€ä¸ªå¤–å±‚åºåˆ—åœ¨æ•´ä¸ª`batch`ä¸çš„åç§»ï¼Œæ¯ä¸€ä¸ªå¤–å±‚åºåˆ—åˆå«æœ‰è‹¥å¹²ä¸ªå†…å±‚åºåˆ—ï¼Œéœ€è¦åŒæ—¶æä¾›æ¯ä¸€ä¸ªå†…å±‚åºåˆ—åœ¨æ•´ä¸ª`batch`ä¸çš„åç§»ã€‚ä¹Ÿå°±æ˜¯è¯´ï¼š**åŒå±‚åºåˆ—éœ€è¦è®¾ç½®åˆ†åˆ«ä¸ºå¤–å±‚åºåˆ—å’Œå†…å±‚åºåˆ—åˆ†åˆ«è®¾ç½®`sequence_start_positions`ä¿¡æ¯**ã€‚
+
+**æ³¨ï¼š**
+1. ä¸è®ºåºåˆ—ä¸çš„å…ƒç´ åœ¨å†…å˜ä¸å ç”¨å¤šå°‘å®žé™…å˜å‚¨ç©ºé—´ï¼Œ`sequence_start_positions`è¡¨ç¤ºçš„åç§»æ˜¯ä»¥â€œåºåˆ—ä¸çš„ä¸€ä¸ªå…ƒç´ â€ä½œä¸ºç»Ÿè®¡çš„åŸºæœ¬å•ä½ï¼Œè€Œä¸æ˜¯ç›¸å¯¹`batch`èµ·å§‹å˜å‚¨åœ°å€ä»¥æ•°æ®çš„å˜å‚¨å¤§å°ä¸ºå•ä½çš„åç§»ã€‚
+1. éžåºåˆ—è¾“å…¥ä¸æºå¸¦`sequence_start_positions`ï¼Œéžåºåˆ—è¾“å…¥æ— éœ€æž„é€ `sequence_start_positions`ã€‚
+1. **ä¸è®ºæ˜¯å•å±‚åºåˆ—è¿˜æ˜¯åŒå±‚åºåˆ—çš„åºåˆ—ä¿¡æ¯ï¼Œéƒ½ä½¿ç”¨`paddle_ivector`ï¼ˆä¹Ÿå°±æ˜¯PaddlePaddleä¸çš„ä¸€ç»´æ•´åž‹æ•°ç»„ï¼‰æ¥å˜å‚¨ã€‚**
+
+å›¾2 æ˜¯PaddlePaddleä¸å•å±‚åºåˆ—å’ŒåŒå±‚åºåˆ—å˜å‚¨ç¤ºæ„å›¾ã€‚
+<p align="center">
+<img src="https://user-images.githubusercontent.com/5842774/34159714-1f81a9be-e505-11e7-8a8a-4902146ec899.png" width=800><br>å›¾2. åºåˆ—è¾“å…¥ç¤ºæ„å›¾
+</p>
+
+- å•å±‚åºåˆ—
+
+    å›¾2 (a) å±•ç¤ºäº†ä¸€ä¸ªå«æœ‰4ä¸ªåºåˆ—çš„`batch`è¾“å…¥ï¼š
+    1. 4ä¸ªåºåˆ—çš„é•¿åº¦åˆ†åˆ«ä¸ºï¼š5ã€3ã€2ã€4ï¼›
+    1. è¿™æ—¶çš„`sequence_start_positions`ä¸ºï¼š`[0, 5, 8, 10, 14]`ï¼›
+    1. æœ¬åœ°è®ç»ƒ. ä¸è®ºæ•°æ®åŸŸæ˜¯`paddle_ivector`ç±»åž‹è¿˜æ˜¯`paddle_matrix`ç±»åž‹ï¼Œéƒ½å¯ä»¥é€šè¿‡è°ƒç”¨ä¸‹é¢çš„æŽ¥å£ä¸ºåŽŸæœ‰çš„æ•°æ®è¾“å…¥é™„åŠ ä¸Šåºåˆ—ä¿¡æ¯ï¼Œä½¿ä¹‹å˜ä¸ºä¸€ä¸ªå•å±‚åºåˆ—è¾“å…¥ï¼Œä»£ç ç‰‡æ®µå¦‚ä¸‹ï¼š
+
+    ```c
+    int seq_pos_array[] = {0, 5, 8, 10, 14};
+    paddle_ivector seq_pos = paddle_ivector_create(
+        seq_pos_array, sizeof(seq_pos_array) / sizeof(int), false, false);
+    // Suppose the network only has one input data layer.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+    ```
+
+- åŒå±‚åºåˆ—
+
+    å›¾2 (b) å±•ç¤ºäº†ä¸€ä¸ªå«æœ‰4ä¸ªåºåˆ—çš„`batch`è¾“å…¥ï¼›
+    1. 4ä¸ªåºåˆ—çš„é•¿åº¦åˆ†åˆ«ä¸ºï¼š5ã€3ã€2ã€4ï¼›è¿™å››ä¸ªåºåˆ—åˆåˆ†åˆ«å«æœ‰3ã€2ã€1ã€2ä¸ªååºåˆ—ï¼›
+    1. 这时需要同时提供：
+        - 外层序列在`batch`中的起始偏移：`[0, 5, 8, 10, 14]`；
+        - 内层序列在`batch`中的起始偏移：`[0, 2, 3, 5, 7, 8, 10, 13, 14]`；
+    1. ä¸è®ºæ•°æ®åŸŸæ˜¯`paddle_ivector`ç±»åž‹è¿˜æ˜¯`paddle_matrix`ç±»åž‹ï¼Œè¿™æ—¶éœ€è¦è°ƒç”¨åˆ›å»ºåºåˆ—ä¿¡æ¯å’Œä¸º`argument`è®¾ç½®åºåˆ—ä¿¡æ¯çš„æŽ¥å£**ä¸¤æ¬¡**ï¼Œåˆ†åˆ«ä¸ºæ•°æ®è¾“å…¥æ·»åŠ å¤–å±‚åºåˆ—å’Œå†…å±‚åºåˆ—çš„åºåˆ—ä¿¡æ¯ï¼Œä½¿ä¹‹å˜ä¸ºä¸€ä¸ªåŒå±‚åºåˆ—è¾“å…¥ï¼Œä»£ç ç‰‡æ®µå¦‚ä¸‹ï¼š
+    ```c
+    // set the sequence start positions for the outter sequences.
+    int outter_seq_pos_array[] = {0, 5, 8, 10, 14};
+    paddle_ivector seq_pos =
+        paddle_ivector_create(outter_seq_pos_array,
+                              sizeof(outter_seq_pos_array) / sizeof(int),
+                              false,
+                              false);
+    // The third parameter of this API indicates the sequence level.
+    // 0 for the outter sequence. 1 for the inner sequence.
+    // If the input is a sequence not the nested sequence, the third parameter is
+    // fixed to be 0.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 0, seq_pos));
+
+    // set the sequence start positions for the inner sequences.
+    int inner_seq_pos_array[] = {0, 2, 3, 5, 7, 8, 10, 13, 14};
+    paddle_ivector inner_seq_pos = paddle_ivector_create(inner_seq_pos_array,
+        sizeof(inner_seq_pos_array) / sizeof(int), false, false);
+    // The third parameter of this API indicates the sequence level.
+    // 0 for the outer sequence. 1 for the inner sequence.
+    CHECK(paddle_arguments_set_sequence_start_pos(in_args, 0, 1, inner_seq_pos));
+    ```
+
+æ³¨æ„äº‹é¡¹ï¼š
+1. å½“ä¸€ä¸ª`batch`ä¸å«æœ‰å¤šä¸ªåºåˆ—ï¼Œ**ä¸æ”¯æŒåºåˆ—é•¿åº¦ä¸º`0`çš„åºåˆ—ï¼ˆä¹Ÿå°±æ˜¯ç©ºè¾“å…¥ï¼‰** ä½œä¸ºè¾“å…¥ã€‚ä¸åŒè®¡ç®—å±‚å¯¹ç©ºè¾“å…¥çš„å¤„ç†ç–ç•¥æœ‰å¯èƒ½ä¸åŒï¼Œæ½œåœ¨ä¼šå¼•èµ·æœªå®šä¹‰è¡Œä¸ºï¼Œæˆ–è€…å¼•èµ·è¡Œæ—¶é”™è¯¯ï¼Œè¯·åœ¨è¾“å…¥æ—¶è¿›è¡Œåˆæ³•æ€§æ£€æŸ¥ã€‚
+
+### Python 端数据类型说明
+
+ä¸‹è¡¨åˆ—å‡ºäº†Pythonç«¯è®ç»ƒæŽ¥å£æš´éœ²çš„æ•°æ®ç±»åž‹ï¼ˆ`paddle.layer.data`å‡½æ•°`type`å—æ®µçš„å–å€¼ï¼‰å¯¹åº”äºŽè°ƒç”¨C-APIéœ€è¦åˆ›å»ºçš„æ•°æ®ç±»åž‹ï¼š
+
+<html>
+<table border="2" frame="border">
+<table>
+<thead>
+<tr>
+<th style="text-align:left">Python ç«¯æ•°æ®ç±»åž‹</th>
+<th style="text-align:left">C-API è¾“å…¥æ•°æ®ç±»åž‹</th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value</td>
+<td style="text-align:left">æ•´åž‹æ•°ç»„ï¼Œæ— éœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨ å¯†çŸ©é˜µï¼Œæ— éœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨€ç–çŸ©é˜µï¼Œæ— éœ€æä¾›éžé›¶å…ƒçš„å€¼ï¼Œé»˜è®¤ä¸º1ï¼Œæ— éœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨€ç–çŸ©é˜µï¼Œéœ€æä¾›éžé›¶å…ƒçš„å€¼ï¼Œæ— éœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value_sequence</td>
+<td style="text-align:left">æ•´åž‹æ•°ç»„ï¼Œéœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector_sequence</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨ å¯†çŸ©é˜µï¼Œéœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector_sequence</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨€ç–çŸ©é˜µï¼Œæ— éœ€æä¾›éžé›¶å…ƒçš„å€¼ï¼Œé»˜è®¤ä¸º1ï¼Œéœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector_sequence</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨€ç–çŸ©é˜µï¼Œéœ€æä¾›éžé›¶å…ƒçš„å€¼ï¼Œéœ€é™„åŠ åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.integer_value_sub_sequence</td>
+<td style="text-align:left">æ•´åž‹æ•°ç»„ï¼Œéœ€é™„åŠ åŒå±‚åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.dense_vector_sub_sequence</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨ å¯†çŸ©é˜µï¼Œéœ€é™„åŠ åŒå±‚åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_binary_vector_sub_sequence</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨€ç–çŸ©é˜µï¼Œæ— éœ€æä¾›éžé›¶å…ƒçš„å€¼ï¼Œé»˜è®¤ä¸º1ï¼Œéœ€é™„åŠ åŒå±‚åºåˆ—ä¿¡æ¯</td>
+</tr>
+<tr>
+<td style="text-align:left">paddle.data_type.sparse_vector_sub_sequence</td>
+<td style="text-align:left">æµ®ç‚¹åž‹ç¨€ç–çŸ©é˜µï¼Œéœ€æä¾›éžé›¶å…ƒçš„å€¼ï¼Œéœ€é™„åŠ åŒå±‚åºåˆ—ä¿¡æ¯</td>
+</tr>
+</tbody>
+</table>
+</html>
+<br>
+
+
+### 输出数据
+
+PaddlePaddleä¸ä¸€ä¸ªè®¡ç®—å±‚çš„è¾“å‡ºæ•°æ®ç»„ç»‡æ–¹å¼å’Œè¾“å…¥æ•°æ®ç»„ç»‡æ–¹å¼å®Œå…¨ç›¸åŒã€‚ä¸€ä¸ªè¾“å‡ºæ•°æ®åŒæ ·è¢«ç»„ç»‡ä¸ºä¸€ä¸ª`argument`ï¼Œ`argument`é€šè¿‡`paddle_matrix`æˆ–`paddle_ivector`å˜æ•°æ•°æ®ï¼Œå¦‚æžœè¾“å‡ºæ˜¯ä¸€ä¸ªåºåˆ—ï¼Œé‚£ä¹ˆä¼šæºå¸¦æœ‰`sequence_start_positions`ä¿¡æ¯ã€‚è°ƒç”¨C-APIç›¸å…³æŽ¥å£ï¼Œè¯»å–éœ€è¦çš„ç»“æžœå³å¯ã€‚
+
+### 总结
+
+- åœ¨PaddlePaddleå†…éƒ¨ï¼Œç¥žç»ç½‘ç»œä¸ä¸€ä¸ªè®¡ç®—å±‚çš„è¾“å…¥/è¾“å‡ºè¢«ç»„ç»‡ä¸º`argument`ã€‚
+- `argument`å¹¶ä¸çœŸæ£â€œå˜å‚¨â€æ•°æ®ï¼Œè€Œæ˜¯å°†è¾“å…¥/è¾“å‡ºä¿¡æ¯æœ‰æœºåœ°ç»„ç»‡åœ¨ä¸€èµ·ã€‚
+- åœ¨`argument`å†…éƒ¨ç”±`paddle_ivector`ï¼ˆä¸€ç»´æ•´åž‹æ•°ç»„ï¼‰å’Œ`paddle_matrix`ï¼ˆäºŒç»´æµ®ç‚¹åž‹çŸ©é˜µï¼‰æ¥å®žé™…å˜å‚¨æ•°æ®ã€‚
+å¦‚æžœæ˜¯ä¸€ä¸ªåºåˆ—è¾“å…¥/è¾“å‡ºç”± `sequence start positions` æ¥è®°å½•è¾“å…¥/è¾“å‡ºçš„åºåˆ—ä¿¡æ¯ã€‚
+
+äºŽæ˜¯ï¼Œåœ¨ç»„ç»‡ç¥žç»ç½‘ç»œè¾“å…¥æ—¶ï¼Œéœ€è¦æ€è€ƒå®Œæˆä»¥ä¸‹å·¥ä½œï¼š
+1. ä¸ºæ¯ä¸€ä¸ªè¾“å…¥/è¾“å‡ºåˆ›å»º`argument`ã€‚
+    - C-API 中操作`argument`的接口请查看[arguments.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h)。
+1. ä¸ºæ¯ä¸€ä¸ª`argument`åˆ›å»º`paddle_matrix`æˆ–è€…`paddle_ivector`æ¥å˜å‚¨æ•°æ®ã€‚
+    - C-API ä¸æ“ä½œ`paddle_ivector`çš„æŽ¥å£è¯·æŸ¥çœ‹ [vector.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/vector.h)ã€‚
+    - C-API ä¸æ“ä½œ`paddle_matrix`çš„æŽ¥å£è¯·æŸ¥çœ‹[matrix.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/matrix.h)ã€‚
+1. å¦‚æžœè¾“å…¥æ˜¯åºåˆ—æ•°æ®ï¼Œéœ€è¦åˆ›å»ºå¹¶å¡«å†™`sequence_start_positions`ä¿¡æ¯ã€‚
+    - é€šè¿‡è°ƒç”¨ [`paddle_arguments_set_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L137) æ¥ä¸ºä¸€ä¸ª`argument`æ·»åŠ åºåˆ—ä¿¡æ¯ã€‚
+    - 通过调用 [`paddle_arguments_get_sequence_start_pos`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h#L150) 来读取一个`argument`的序列信息。
+    - 接口说明请查看 [arguments.h](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/arguments.h) 文件。
diff --git a/doc/howto/usage/capi/workflow_of_capi_cn.md b/doc/howto/usage/capi/workflow_of_capi_cn.md
new file mode 100644
index 0000000000000000000000000000000000000000..e0a42fff12cf0f53dee18165e059150861524f74
--- /dev/null
+++ b/doc/howto/usage/capi/workflow_of_capi_cn.md
@@ -0,0 +1,119 @@
+## C-API 使用流程
+
+è¿™ç¯‡æ–‡æ¡£ä»‹ç» PaddlePaddle C-API æ•´ä½“ä½¿ç”¨æµç¨‹ã€‚
+
+### 使用流程
+
+ä½¿ç”¨ C-API çš„å·¥ä½œæµç¨‹å¦‚å›¾1æ‰€ç¤ºï¼Œåˆ†ä¸ºï¼ˆ1ï¼‰å‡†å¤‡é¢„æµ‹æ¨¡åž‹å’Œï¼ˆ2ï¼‰é¢„æµ‹ç¨‹åºå¼€å‘ä¸¤å¤§éƒ¨åˆ†ã€‚
+
+<p align="center">
+<img src="https://user-images.githubusercontent.com/5842774/34658453-365f73ea-f46a-11e7-9b3f-0fd112b27bae.png" width=500><br> å›¾1. C-APIä½¿ç”¨æµç¨‹ç¤ºæ„å›¾
+</p>
+
+- å‡†å¤‡é¢„æµ‹æ¨¡åž‹
+    1. åªå°†ç¥žç»ç½‘ç»œç»“æž„è¿›è¡Œåºåˆ—åŒ–ã€‚
+        - åªå¯¹ç¥žç»ç½‘ç»œç»“æž„è¿›è¡Œåºåˆ—åŒ–ï¼ŒåŠ è½½æ¨¡åž‹éœ€åŒæ—¶æŒ‡å®šï¼šç½‘ç»œç»“æž„çš„åºåˆ—åŒ–ç»“æžœå’Œæ¨¡åž‹å‚æ•°å˜å‚¨ç›®å½•ã€‚
+    1. å°†ç½‘ç»œç»“æž„å®šä¹‰å’Œè®ç»ƒç»“æŸå˜å‚¨ä¸‹æ¥çš„æ¨¡åž‹å‚æ•°æ–‡ä»¶ï¼ˆå¤šä¸ªï¼‰åˆå¹¶å…¥ä¸€ä¸ªæ–‡ä»¶ã€‚
+        - ç¥žç»ç½‘ç»œæ¨¡åž‹ç»“æž„å’Œè®ç»ƒå¥½çš„æ¨¡åž‹å°†è¢«åºåˆ—åŒ–åˆå¹¶å…¥ä¸€ä¸ªæ–‡ä»¶ã€‚
+        - é¢„æµ‹æ—¶åªéœ€åŠ è½½ä¸€ä¸ªæ–‡ä»¶ä¾¿äºŽå‘å¸ƒã€‚
+    - **æ³¨æ„**ï¼šä»¥ä¸Šä¸¤ç§æ–¹å¼åªéœ€é€‰æ‹©å…¶ä¸€å³å¯ã€‚
+- 调用 C-API 开发预测程序
+    1. åˆå§‹åŒ–PaddlePaddleè¿è¡ŒçŽ¯å¢ƒã€‚
+    1. åŠ è½½é¢„æµ‹æ¨¡åž‹ã€‚
+    1. åˆ›å»ºç¥žç»ç½‘ç»œè¾“å…¥ï¼Œç»„ç»‡è¾“å…¥æ•°æ®ã€‚
+    1. è¿›è¡Œå‰å‘è®¡ç®—ï¼ŒèŽ·å¾—è®¡ç®—ç»“æžœã€‚
+    1. æ¸…ç†å’Œç»“æŸã€‚
+
+### 准备预测模型
+
+å‡†å¤‡é¢„æµ‹æ¨¡åž‹éƒ¨åˆ†ï¼Œæˆ‘ä»¬ä»¥æ‰‹å†™æ•°å—è¯†åˆ«ä»»åŠ¡ä¸ºä¾‹è¿›è¡Œä»‹ç»ã€‚æ‰‹å†™æ•°å—è¯†åˆ«ä»»åŠ¡å®šä¹‰äº†ä¸€ä¸ªå«æœ‰[ä¸¤ä¸ªéšå±‚çš„ç®€å•å…¨è¿žæŽ¥ç½‘ç»œ](https://github.com/PaddlePaddle/book/blob/develop/02.recognize_digits/README.cn.md#softmaxå›žå½’softmax-regression)ï¼Œç½‘ç»œæŽ¥å—ä¸€å¹…å›¾ç‰‡ä½œä¸ºè¾“å…¥ï¼Œå°†å›¾ç‰‡åˆ†ç±»åˆ° 0 ~ 9 ç±»åˆ«æ ‡ç¾ä¹‹ä¸€ã€‚å®Œæ•´ä»£ç å¯ä»¥æŸ¥çœ‹[æ¤ç›®å½•](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense) ä¸çš„ç›¸å…³è„šæœ¬ã€‚
+
+è°ƒç”¨C-APIå¼€å‘é¢„æµ‹ç¨‹åºéœ€è¦ä¸€ä¸ªè®ç»ƒå¥½çš„æ¨¡åž‹ï¼Œè¿è¡Œ[MNISTæ‰‹å†™æ•°å—è¯†åˆ«ç›®å½•](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)ä¸‹çš„[mnist_v2.py](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)è„šæœ¬ï¼Œåœ¨ç»ˆç«¯æ‰§è¡Œ`python mnist_v2.py`ï¼Œä¼šä½¿ç”¨ PaddlePaddle å†…ç½®çš„ [MNIST æ•°æ®é›†](http://yann.lecun.com/exdb/mnist/)è¿›è¡Œè®ç»ƒã€‚è®ç»ƒå¥½çš„æ¨¡åž‹é»˜è®¤ä¿å˜åœ¨å½“å‰è¿è¡Œç›®å½•ä¸‹çš„`models`ç›®å½•ä¸ã€‚
+
+ä¸‹é¢ï¼Œæˆ‘ä»¬å°†è®ç»ƒç»“æŸåŽå˜å‚¨ä¸‹æ¥çš„æ¨¡åž‹è½¬æ¢æˆé¢„æµ‹æ¨¡åž‹ã€‚
+
+1. åºåˆ—åŒ–ç¥žç»ç½‘ç»œæ¨¡åž‹é…ç½®
+
+    PaddlePaddle ä½¿ç”¨ protobuf æ¥ä¼ è¾“ç½‘ç»œé…ç½®æ–‡ä»¶ä¸å®šä¹‰çš„ç½‘ç»œç»“æž„å’Œç›¸å…³å‚æ•°ï¼Œä½¿ç”¨ C-API è¿›è¡Œé¢„æµ‹æ—¶ï¼Œéœ€è¦å°†ç½‘ç»œç»“æž„ä½¿ç”¨ protobuf è¿›è¡Œåºåˆ—åŒ–ï¼Œå†™å…¥æ–‡ä»¶ä¸ã€‚
+
+    è°ƒç”¨[`paddle.utils.dump_v2_config`](https://github.com/PaddlePaddle/Paddle/tree/develop/python/paddle/utils/dump_v2_config.py)ä¸çš„`dump_v2_config`å‡½æ•°èƒ½å¤Ÿå°†ä½¿ç”¨ PaddlePaddle V2 API å®šä¹‰çš„ç¥žç»ç½‘ç»œç»“æž„ dump åˆ°æŒ‡å®šæ–‡ä»¶ä¸ï¼Œç¤ºä¾‹ä»£ç å¦‚ä¸‹ï¼š
+
+    ```python
+    from paddle.utils.dump_v2_config import dump_v2_config
+    from mnist_v2 import network
+
+    predict = network(is_infer=True)
+    dump_v2_config(predict, "trainer_config.bin", True)
+    ```
+
+    å¯¹[æ‰‹å†™æ•°å—è¯†åˆ«](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)è¿™ä¸ªç¤ºä¾‹ï¼Œ[`mnist_v2.py`](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/mnist_v2.py)è„šæœ¬é›†æˆäº†åºåˆ—åŒ–ç¥žç»ç½‘ç»œç»“æž„çš„è¿‡ç¨‹ï¼Œå¯ä»¥ç›´æŽ¥è¿è¡Œ `python mnist_v2.py --task dump_config` å¯¹ç¥žç»ç½‘ç»œç»“æž„è¿›è¡Œåºåˆ—åŒ–ï¼Œç»“æžœä¼šå†™å…¥å½“å‰è¿è¡Œç›®å½•ä¸‹çš„`trainer_config.bin`æ–‡ä»¶ä¸ã€‚
+
+    ä½¿ç”¨è¿™ç§æ–¹å¼ï¼Œéœ€è¦**åœ¨è¿è¡Œæ—¶å°†ç¥žç»ç½‘ç»œçš„å¤šä¸ªå¯å¦ä¹ å‚æ•°æ”¾åœ¨åŒä¸€ä¸ªç›®å½•ä¸**ï¼ŒC-APIå¯ä»¥é€šè¿‡åˆ†åˆ«æŒ‡å®šåºåˆ—åŒ–åŽçš„ç½‘ç»œç»“æž„æ–‡ä»¶å’Œå‚æ•°ç›®å½•æ¥åŠ è½½è®ç»ƒå¥½çš„æ¨¡åž‹ã€‚
+
+2. åˆå¹¶æ¨¡åž‹æ–‡ä»¶(å¯é€‰)
+
+    ä¸€äº›æƒ…å†µä¸ºäº†ä¾¿äºŽå‘å¸ƒï¼Œå¸Œæœ›èƒ½å¤Ÿå°†åºåˆ—åŒ–åŽçš„ç¥žç»ç½‘ç»œç»“æž„å’Œè®ç»ƒå¥½çš„æ¨¡åž‹å‚æ•°æ‰“åŒ…è¿›ä¸€ä¸ªæ–‡ä»¶ã€‚å¯¹äºŽè¿™æ ·çš„éœ€æ±‚ï¼Œå¯ä»¥ä½¿ç”¨`paddle.utils.merge_model`ä¸çš„`merge_v2_model`æŽ¥å£å¯¹ç¥žç»ç½‘ç»œç»“æž„å’Œè®ç»ƒå¥½çš„å‚æ•°è¿›è¡Œåºåˆ—åŒ–ï¼Œå°†åºåˆ—åŒ–ç»“æžœå†™å…¥ä¸€ä¸ªæ–‡ä»¶å†…ã€‚
+
+    ä»£ç ç¤ºä¾‹å¦‚ä¸‹ï¼š
+
+    ```python
+    from paddle.utils.merge_model import merge_v2_model
+    from mnist_v2 import network
+
+    net = network(is_infer=True)
+    param_file = "models/params_pass_4.tar"
+    output_file = "output.paddle.model"
+    merge_v2_model(net, param_file, output_file)
+    ```
+    å¯¹[æ‰‹å†™æ•°å—è¯†åˆ«](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense)è¿™ä¸ªç¤ºä¾‹ï¼Œå¯ç›´æŽ¥è¿è¡Œ `python` [merge_v2_model.py](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference/dense/merge_v2_model.py)ã€‚åºåˆ—åŒ–ç»“æžœä¼šå†™å…¥å½“å‰è¿è¡Œç›®å½•ä¸‹çš„`output.paddle.model`æ–‡ä»¶ä¸ã€‚ä½¿ç”¨è¿™ç§æ–¹å¼ï¼Œè¿è¡Œæ—¶C-APIå¯ä»¥é€šè¿‡æŒ‡å®š`output.paddle.model`æ–‡ä»¶çš„è·¯å¾„æ¥åŠ è½½é¢„æµ‹æ¨¡åž‹ã€‚
+
+#### 注意事项
+1. ä¸ºä½¿ç”¨C-APIï¼Œåœ¨è°ƒç”¨`dump_v2_config`åºåˆ—åŒ–ç¥žç»ç½‘ç»œç»“æž„æ—¶ï¼Œå‚æ•°`binary`å¿…é¡»æŒ‡å®šä¸º`True`ã€‚
+1. **é¢„æµ‹ä½¿ç”¨çš„ç½‘ç»œç»“æž„å¾€å¾€ä¸åŒäºŽè®ç»ƒ**ï¼Œé€šå¸¸éœ€è¦åŽ»æŽ‰ç½‘ç»œä¸çš„ï¼šï¼ˆ1ï¼‰ç±»åˆ«æ ‡ç¾å±‚ï¼›ï¼ˆ2ï¼‰æŸå¤±å‡½æ•°å±‚ï¼›ï¼ˆ3ï¼‰`evaluator`ç‰ï¼Œåªç•™ä¸‹æ ¸å¿ƒè®¡ç®—å±‚ï¼Œè¯·æ³¨æ„æ˜¯å¦éœ€è¦ä¿®æ”¹ç½‘ç»œç»“æž„ã€‚
+1. é¢„æµ‹æ—¶ï¼Œå¯ä»¥èŽ·å–ç½‘ç»œä¸å®šä¹‰çš„ä»»æ„å¤šä¸ªï¼ˆå¤§äºŽç‰äºŽä¸€ä¸ªï¼‰å±‚å‰å‘è®¡ç®—çš„ç»“æžœï¼Œéœ€è¦å“ªäº›å±‚çš„è®¡ç®—ç»“æžœä½œä¸ºè¾“å‡ºï¼Œå°±å°†è¿™äº›å±‚åŠ å…¥ä¸€ä¸ªPython listä¸ï¼Œä½œä¸ºè°ƒç”¨`dump_v2_config`çš„ç¬¬ä¸€ä¸ªå‚æ•°ã€‚
+
+### 编写预测代码
+
+é¢„æµ‹ä»£ç æ›´å¤šè¯¦ç»†ç¤ºä¾‹ä»£ç è¯·å‚è€ƒ[C-APIä½¿ç”¨ç¤ºä¾‹](https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/capi/examples/model_inference) ç›®å½•ä¸‹çš„ä»£ç ç¤ºä¾‹ã€‚è¿™ä¸€èŠ‚å¯¹å›¾1ä¸é¢„æµ‹ä»£ç ç¼–å†™çš„5ä¸ªæ¥éª¤è¿›è¡Œä»‹ç»å’Œè¯´æ˜Žã€‚
+
+#### step 1. åˆå§‹åŒ–PaddlePaddleè¿è¡ŒçŽ¯å¢ƒ
+ç¬¬ä¸€æ¥éœ€è°ƒç”¨[`paddle_init`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/main.h#L27) åˆå§‹åŒ–PaddlePaddleè¿è¡ŒçŽ¯å¢ƒï¼Œè¯¥æŽ¥å£æŽ¥å—ä¸¤ä¸ªå‚æ•°ï¼šå‚æ•°çš„ä¸ªæ•°å’Œå‚æ•°åˆ—è¡¨ã€‚
+
+#### step 2. 加载模型
+
+è¿™é‡Œä»‹ç»C-APIä½¿ç”¨ä¸çš„ä¸€ä¸ªé‡è¦æ¦‚å¿µï¼šGradient Machineã€‚
+
+æ¦‚å¿µä¸Šï¼Œåœ¨ PaddlePaddle å†…éƒ¨ï¼Œä¸€ä¸ªGradientMachineç±»çš„å¯¹è±¡ç®¡ç†ç€ä¸€ç»„è®¡ç®—å±‚ï¼ˆPaddlePaddle Layersï¼‰æ¥å®Œæˆå‰å‘å’Œåå‘è®¡ç®—ï¼Œå¹¶å¤„ç†ä¸Žä¹‹ç›¸å…³çš„æ‰€æœ‰ç»†èŠ‚ã€‚åœ¨è°ƒç”¨C-APIé¢„æµ‹æ—¶ï¼Œåªéœ€è¿›è¡Œå‰å‘è®¡ç®—è€Œæ— éœ€è°ƒç”¨åå‘è®¡ç®—ã€‚è¿™ç¯‡æ–‡æ¡£ä¹‹åŽéƒ¨åˆ†ä¼šä½¿ç”¨`gradient machine`æ¥ç‰¹æŒ‡è°ƒç”¨PaddlePaddle C-APIåˆ›å»ºçš„GradientMachineç±»çš„å¯¹è±¡ã€‚æ¯ä¸€ä¸ª `gradient machine` éƒ½ä¼šç®¡ç†ç»´æŠ¤ä¸€ä»½è®ç»ƒå¥½çš„æ¨¡åž‹ï¼Œä¸‹é¢æ˜¯C-APIæä¾›çš„ï¼Œä¸¤ç§å¸¸ç”¨çš„æ¨¡åž‹åŠ è½½æ–¹å¼ï¼š
+
+1. è°ƒç”¨[`paddle_gradient_machine_load_parameter_from_disk`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L61)æŽ¥å£ï¼Œä»Žç£ç›˜åŠ è½½é¢„æµ‹æ¨¡åž‹ã€‚è¿™æ—¶`gradient machine`ä¼šç‹¬ç«‹æ‹¥æœ‰ä¸€ä»½è®ç»ƒå¥½çš„æ¨¡åž‹ï¼›
+1. 调用[`paddle_gradient_machine_create_shared_param`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L88)接口，与其它`gradient machine`共享已经加载的预测模型。这种情况多出现在使用多线程预测时，通过多个线程共享同一个模型来减少内存开销。可参考[此示例](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/examples/model_inference/multi_thread/main.c)。
+
+- æ³¨æ„äº‹é¡¹
+    1. ä½¿ç”¨PaddlePaddle V2 APIè®ç»ƒï¼Œæ¨¡åž‹ä¸æ‰€æœ‰å¯å¦ä¹ å‚æ•°ä¼šè¢«å˜ä¸ºä¸€ä¸ªåŽ‹ç¼©æ–‡ä»¶ï¼Œéœ€è¦æ‰‹åŠ¨è¿›è¡Œè§£åŽ‹ï¼Œå°†å®ƒä»¬æ”¾åœ¨åŒä¸€ç›®å½•ä¸ï¼ŒC-APIä¸ä¼šç›´æŽ¥åŠ è½½ V2 API å˜å‚¨çš„åŽ‹ç¼©æ–‡ä»¶ã€‚
+    1. å¦‚æžœä½¿ç”¨`merge model`æ–¹å¼å°†ç¥žç»ç½‘ç»œç»“æž„å’Œè®ç»ƒå¥½çš„å‚æ•°åºåˆ—åŒ–åˆ°ä¸€ä¸ªæ–‡ä»¶ï¼Œè¯·å‚è€ƒæ¤[ç¤ºä¾‹](https://github.com/PaddlePaddle/Mobile/blob/develop/Demo/linux/paddle_image_recognizer.cpp#L59)ã€‚
+    1. 通过灵活使用以上两个接口，加载模型可有其它多种方式，例如也可在程序运行过程中再加载另外一个模型。
+
+#### step 3. 创建神经网络输入，组织输入数据
+
+åŸºæœ¬ä½¿ç”¨æ¦‚å¿µï¼š
+- åœ¨PaddlePaddleå†…éƒ¨ï¼Œç¥žç»ç½‘ç»œä¸ä¸€ä¸ªè®¡ç®—å±‚çš„è¾“å…¥è¾“å‡ºè¢«ç»„ç»‡ä¸ºä¸€ä¸ª `Argument` ç»“æž„ä½“ï¼Œå¦‚æžœç¥žç»ç½‘ç»œæœ‰å¤šä¸ªè¾“å…¥æˆ–è€…å¤šä¸ªè¾“å‡ºï¼Œæ¯ä¸€ä¸ªè¾“å…¥/è¾“å‡ºéƒ½ä¼šå¯¹åº”æœ‰è‡ªå·±çš„`Argument`ã€‚
+- `Argument` å¹¶ä¸çœŸæ£â€œå˜å‚¨â€æ•°æ®ï¼Œè€Œæ˜¯å°†è¾“å…¥/è¾“å‡ºæ•°æ®æœ‰æœºåœ°ç»„ç»‡åœ¨ä¸€èµ·ã€‚
+- åœ¨`Argument`å†…éƒ¨ç”±ï¼š1. `Matrix`ï¼ˆäºŒç»´çŸ©é˜µï¼Œå˜å‚¨æµ®ç‚¹ç±»åž‹è¾“å…¥/è¾“å‡ºï¼‰ï¼›2. `IVector`ï¼ˆä¸€ç»´æ•°ç»„ï¼Œ**ä»…ç”¨äºŽå˜å‚¨æ•´åž‹å€¼**ï¼Œå¤šç”¨äºŽè‡ªç„¶è¯è¨€å¤„ç†ä»»åŠ¡ï¼‰æ¥å®žé™…å˜å‚¨æ•°æ®ã€‚
+
+C-APIæ”¯æŒçš„æ‰€æœ‰è¾“å…¥æ•°æ®ç±»åž‹å’Œä»–ä»¬çš„ç»„ç»‡æ–¹å¼ï¼Œè¯·å‚è€ƒâ€œè¾“å…¥/è¾“å‡ºæ•°æ®ç»„ç»‡â€ä¸€èŠ‚ã€‚
+
+è¿™ç¯‡æ–‡æ¡£çš„ä¹‹åŽéƒ¨åˆ†ä¼šä½¿ç”¨`argument`æ¥ç‰¹æŒ‡PaddlePaddle C-APIä¸ç¥žç»ç½‘ç»œçš„ä¸€ä¸ªè¾“å…¥/è¾“å‡ºï¼Œä½¿ç”¨`paddle_matrix`**ç‰¹æŒ‡**`argument`ä¸ç”¨äºŽå˜å‚¨æ•°æ®çš„`Matrix`ç±»çš„å¯¹è±¡ã€‚
+
+åœ¨ç»„ç»‡ç¥žç»ç½‘ç»œè¾“å…¥ï¼ŒèŽ·å–è¾“å‡ºæ—¶ï¼Œéœ€è¦æ€è€ƒå®Œæˆä»¥ä¸‹å·¥ä½œï¼š
+1. ä¸ºæ¯ä¸€ä¸ªè¾“å…¥/è¾“å‡ºåˆ›å»º`argument`ï¼›
+1. ä¸ºæ¯ä¸€ä¸ª`argument`åˆ›å»º`paddle_matrix`æ¥å˜å‚¨æ•°æ®ï¼›
+
+ä¸Žè¾“å…¥ä¸åŒçš„æ˜¯ï¼Œä¸éœ€åœ¨ä½¿ç”¨C-APIæ—¶ä¸ºè¾“å‡º`argument`çš„`paddle_matrix`å¯¹è±¡åˆ†é…ç©ºé—´ã€‚å‰å‘è®¡ç®—ä¹‹åŽPaddlePaddleå†…éƒ¨å·²ç»åˆ†é…/ç®¡ç†äº†æ¯ä¸ªè®¡ç®—å±‚è¾“å‡ºçš„å˜å‚¨ç©ºé—´ã€‚
+
+#### step 4. 前向计算
+
+å®Œæˆä¸Šè¿°å‡†å¤‡ä¹‹åŽï¼Œé€šè¿‡è°ƒç”¨ [`paddle_gradient_machine_forward`](https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/capi/gradient_machine.h#L73) æŽ¥å£å®Œæˆç¥žç»ç½‘ç»œçš„å‰å‘è®¡ç®—ã€‚
+
+#### step 5. 清理
+
+ç»“æŸé¢„æµ‹ä¹‹åŽï¼Œå¯¹ä½¿ç”¨çš„ä¸é—´å˜é‡å’Œèµ„æºè¿›è¡Œæ¸…ç†å’Œé‡Šæ”¾ã€‚
diff --git a/paddle/capi/examples/model_inference/dense/main.c b/paddle/capi/examples/model_inference/dense/main.c
index 5eeaf7e31fac7c9ed0b9269e74a7e467bde155ef..376cd46fb09a156d426453986c87dcff6e2f71dd 100644
--- a/paddle/capi/examples/model_inference/dense/main.c
+++ b/paddle/capi/examples/model_inference/dense/main.c
@@ -3,59 +3,82 @@
 
 #include "../common/common.h"
 
+// Modify this path as needed.
 #define CONFIG_BIN "./trainer_config.bin"
+// Modify this path as needed.
+// This demo assumes that merged model is not used, then this path is the
+// directory storing all the trained parameters.
+// If the model is trained by PaddlePaddle V2 API, the model is saved as
+// a compressed file. You need to uncompress the compressed file first.
+#define MODEL_PATH "models/pass_4"
 
 int main() {
-  // Initalize Paddle
+  // Initialize the PaddlePaddle runtime environment.
   char* argv[] = {"--use_gpu=False"};
   CHECK(paddle_init(1, (char**)argv));
 
-  // Reading config binary file. It is generated by `convert_protobin.sh`
+  // Read the binary configuration file generated by `convert_protobin.sh`
   long size;
   void* buf = read_config(CONFIG_BIN, &size);
 
-  // Create a gradient machine for inference.
+  // Create the gradient machine for inference.
   paddle_gradient_machine machine;
   CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
-  CHECK(paddle_gradient_machine_randomize_param(machine));
 
-  // Loading parameter. Uncomment the following line and change the directory.
-  // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
-  //                                                "./some_where_to_params"));
+  // Load the trained model. Modify the parameter MODEL_PATH to set the correct
+  // path of the trained model.
+  CHECK(paddle_gradient_machine_load_parameter_from_disk(machine, MODEL_PATH));
+
+  // Inputs and outputs of the network are organized as paddle_arguments object
+  // in C-API. In the comments below, "argument" specifically means one input of
+  // the neural network in PaddlePaddle C-API.
   paddle_arguments in_args = paddle_arguments_create_none();
 
-  // There is only one input of this network.
+  // There is only one data layer in this demo MNIST network, invoke this
+  // function to create one argument.
   CHECK(paddle_arguments_resize(in_args, 1));
 
-  // Create input matrix.
-  paddle_matrix mat = paddle_matrix_create(/* sample_num */ 1,
-                                           /* size */ 784,
-                                           /* useGPU */ false);
-  srand(time(0));
+  // Each argument needs one matrix or one ivector (integer vector, for sparse
+  // index input, usually used in NLP task) to hold the real input data.
+  // In the comments below, "matrix" specifically means the object needed by
+  // argument to hold the data. Here we create the matrix for the above created
+  // argument to store the testing samples.
+  paddle_matrix mat =
+      paddle_matrix_create(/* height = batch size */ 1,
+                           /* width = dimensionality of the data layer */ 784,
+                           /* whether to use GPU */ false);
 
   paddle_real* array;
-
-  // Get First row.
+  // Get the pointer pointing to the start address of the first row of the
+  // created matrix.
   CHECK(paddle_matrix_get_row(mat, 0, &array));
 
+  // Fill the matrix with a randomly generated test sample.
+  srand(time(0));
   for (int i = 0; i < 784; ++i) {
     array[i] = rand() / ((float)RAND_MAX);
   }
 
+  // Assign the matrix to the argument.
   CHECK(paddle_arguments_set_value(in_args, 0, mat));
 
+  // Create the output argument.
   paddle_arguments out_args = paddle_arguments_create_none();
+
+  // Invoke the forward computation.
   CHECK(paddle_gradient_machine_forward(machine,
                                         in_args,
                                         out_args,
-                                        /* isTrain */ false));
-  paddle_matrix prob = paddle_matrix_create_none();
+                                        /* is training task or not */ false));
 
+  // Create the matrix to hold the forward result of the neural network.
+  paddle_matrix prob = paddle_matrix_create_none();
+  // Access the matrix of the output argument, in which the predicted result is
+  // stored.
   CHECK(paddle_arguments_get_value(out_args, 0, prob));
 
   uint64_t height;
   uint64_t width;
-
   CHECK(paddle_matrix_get_shape(prob, &height, &width));
   CHECK(paddle_matrix_get_row(prob, 0, &array));
 
@@ -68,6 +91,7 @@ int main() {
   }
   printf("\n");
 
+  // The cleaning up.
   CHECK(paddle_matrix_destroy(prob));
   CHECK(paddle_arguments_destroy(out_args));
   CHECK(paddle_matrix_destroy(mat));
diff --git a/paddle/capi/examples/model_inference/dense/merge_v2_model.py b/paddle/capi/examples/model_inference/dense/merge_v2_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..c030d572cbdb15cb5e90f2685723a81efb230f81
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/merge_v2_model.py
@@ -0,0 +1,8 @@
+from paddle.utils.merge_model import merge_v2_model
+
+from mnist_v2 import network
+
+net = network(is_infer=True)
+param_file = "models/params_pass_4.tar"
+output_file = "output.paddle.model"
+merge_v2_model(net, param_file, output_file)
diff --git a/paddle/capi/examples/model_inference/dense/mnist_v2.py b/paddle/capi/examples/model_inference/dense/mnist_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee28111153ca2cf24b9789452c65a0f4c7b64538
--- /dev/null
+++ b/paddle/capi/examples/model_inference/dense/mnist_v2.py
@@ -0,0 +1,117 @@
+import os
+import sys
+import gzip
+import logging
+import argparse
+from PIL import Image
+import numpy as np
+
+import paddle.v2 as paddle
+from paddle.utils.dump_v2_config import dump_v2_config
+
+logger = logging.getLogger("paddle")
+logger.setLevel(logging.INFO)
+
+
+def multilayer_perceptron(img, layer_size, lbl_dim):
+    for idx, size in enumerate(layer_size):
+        hidden = paddle.layer.fc(input=(img if not idx else hidden),
+                                 size=size,
+                                 act=paddle.activation.Relu())
+    return paddle.layer.fc(input=hidden,
+                           size=lbl_dim,
+                           act=paddle.activation.Softmax())
+
+
+def network(input_dim=784, lbl_dim=10, is_infer=False):
+    images = paddle.layer.data(
+        name='pixel', type=paddle.data_type.dense_vector(input_dim))
+
+    predict = multilayer_perceptron(
+        images, layer_size=[128, 64], lbl_dim=lbl_dim)
+
+    if is_infer:
+        return predict
+    else:
+        label = paddle.layer.data(
+            name='label', type=paddle.data_type.integer_value(lbl_dim))
+        return paddle.layer.classification_cost(input=predict, label=label)
+
+
+def main(task="train", use_gpu=False, trainer_count=1, save_dir="models"):
+    if task == "train":
+        if not os.path.exists(save_dir):
+            os.mkdir(save_dir)
+
+        paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)
+        cost = network()
+        parameters = paddle.parameters.create(cost)
+        optimizer = paddle.optimizer.Momentum(
+            learning_rate=0.1 / 128.0,
+            momentum=0.9,
+            regularization=paddle.optimizer.L2Regularization(rate=0.0005 * 128))
+
+        trainer = paddle.trainer.SGD(cost=cost,
+                                     parameters=parameters,
+                                     update_equation=optimizer)
+
+        def event_handler(event):
+            if isinstance(event, paddle.event.EndIteration):
+                if event.batch_id % 100 == 0:
+                    logger.info("Pass %d, Batch %d, Cost %f, %s" %
+                                (event.pass_id, event.batch_id, event.cost,
+                                 event.metrics))
+            if isinstance(event, paddle.event.EndPass):
+                with gzip.open(
+                        os.path.join(save_dir, "params_pass_%d.tar" %
+                                     event.pass_id), "w") as f:
+                    trainer.save_parameter_to_tar(f)
+
+        trainer.train(
+            reader=paddle.batch(
+                paddle.reader.shuffle(
+                    paddle.dataset.mnist.train(), buf_size=8192),
+                batch_size=128),
+            event_handler=event_handler,
+            num_passes=5)
+    elif task == "dump_config":
+        predict = network(is_infer=True)
+        dump_v2_config(predict, "trainer_config.bin", True)
+    else:
+        raise RuntimeError(("Error value for parameter task. "
+                            "Available options are: train and dump_config."))
+
+
+def parse_cmd():
+    parser = argparse.ArgumentParser(
+        description="PaddlePaddle MNIST demo for CAPI.")
+    parser.add_argument(
+        "--task",
+        type=str,
+        required=False,
+        help=("A string indicating the task type. "
+              "Available options are: \"train\", \"dump_config\"."),
+        default="train")
+    parser.add_argument(
+        "--use_gpu",
+        type=lambda v: str(v).lower() in ("1", "true", "yes"),
+        help=("A bool flag indicating whether to use GPU device or not."),
+        default=False)
+    parser.add_argument(
+        "--trainer_count",
+        type=int,
+        help=("This parameter is only used in training task. It indicates "
+              "how many computing threads are created in training."),
+        default=1)
+    parser.add_argument(
+        "--save_dir",
+        type=str,
+        help=("This parameter is only used in training task. It indicates "
+              "path of the directory to save the trained models."),
+        default="models")
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_cmd()
+    main(args.task, args.use_gpu, args.trainer_count, args.save_dir)
diff --git a/paddle/capi/examples/model_inference/sparse_binary/main.c b/paddle/capi/examples/model_inference/sparse_binary/main.c
index 8ba67aee560239d3050c7f40198d20df99ec370e..029b94ee63ba282aa48193ffd4f625657ddc3a60 100644
--- a/paddle/capi/examples/model_inference/sparse_binary/main.c
+++ b/paddle/capi/examples/model_inference/sparse_binary/main.c
@@ -1,5 +1,6 @@
 #include <paddle/capi.h>
 #include <time.h>
+
 #include "../common/common.h"
 
 #define CONFIG_BIN "./trainer_config.bin"
@@ -9,16 +10,18 @@ int main() {
   char* argv[] = {"--use_gpu=False"};
   CHECK(paddle_init(1, (char**)argv));
 
-  // Reading config binary file. It is generated by `convert_protobin.sh`
+  // Read the binary configuration file which is generated by
+  // `convert_protobin.sh`
   long size;
   void* buf = read_config(CONFIG_BIN, &size);
 
-  // Create a gradient machine for inference.
+  // Create the gradient machine for inference.
   paddle_gradient_machine machine;
   CHECK(paddle_gradient_machine_create_for_inference(&machine, buf, (int)size));
   CHECK(paddle_gradient_machine_randomize_param(machine));
 
-  // Loading parameter. Uncomment the following line and change the directory.
+  // Load the trained parameters. Uncomment the following line and change the
+  // directory as needed.
   // CHECK(paddle_gradient_machine_load_parameter_from_disk(machine,
   //                                                "./some_where_to_params"));
   paddle_arguments in_args = paddle_arguments_create_none();
@@ -26,7 +29,7 @@ int main() {
   // There is only one input of this network.
   CHECK(paddle_arguments_resize(in_args, 1));
 
-  // Create input matrix.
+  // Create the input matrix.
   paddle_matrix mat = paddle_matrix_create_sparse(1, 784, 3, true, false);
   srand(time(0));
   paddle_real* array;
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index ed5f6310f4e1212844948dc8c2555e527b4d10e8..597ea959f230d88350796cef05b7d6f2a42e594a 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -47,7 +47,7 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker)
 cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto)
 cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context)
 cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog
-    shape_inference data_transform)
+    shape_inference data_transform lod_tensor)
 cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry init)
 cc_library(proto_desc SRCS var_desc.cc op_desc.cc block_desc.cc program_desc.cc DEPS shape_inference op_info operator glog)
 
diff --git a/paddle/framework/init.cc b/paddle/framework/init.cc
index e12bac1d78e3f6bbc46849c06b53e3b93e147cfc..4ef82a541efaa35bcf831d5122570154f2fa2423 100644
--- a/paddle/framework/init.cc
+++ b/paddle/framework/init.cc
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <string.h>  // for strdup
 #include <algorithm>
 #include <string>
 
@@ -60,7 +61,9 @@ void InitDevices() {
 }
 
 void InitGLOG(const std::string &prog_name) {
-  google::InitGoogleLogging(prog_name.c_str());
+  // glog will not hold the ARGV[0] inside.
+  // Use strdup to alloc a new string.
+  google::InitGoogleLogging(strdup(prog_name.c_str()));
   google::InstallFailureSignalHandler();
 }
 
diff --git a/paddle/framework/lod_tensor.cc b/paddle/framework/lod_tensor.cc
index 7ae94c646537e0d7c4687b949a1b06cd3a7f3404..87a57d095141cc456af2cbabbc227715a02375e9 100644
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
@@ -69,6 +69,12 @@ std::ostream &operator<<(std::ostream &os, const LoDTensor &t) {
   return os;
 }
 
+std::string LoDToString(const LoD &lod) {
+  std::ostringstream stream;
+  stream << lod;
+  return stream.str();
+}
+
 LoD SliceInLevel(const LoD &in, size_t level, size_t elem_begin,
                  size_t elem_end) {
   PADDLE_ENFORCE_LT(level, in.size());
diff --git a/paddle/framework/lod_tensor.h b/paddle/framework/lod_tensor.h
index 37753f5f4ddea4755ad6211007c367de00aad754..88ea78f2682b2ffc962c9663f6b3c636dedb931d 100644
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
@@ -60,6 +60,8 @@ using LoD = std::vector<Vector<size_t>>;
 std::ostream& operator<<(std::ostream& os, const LoD& lod);
 std::ostream& operator<<(std::ostream& os, const LoDTensor& t);
 
+std::string LoDToString(const LoD& lod);
+
 LoD SliceInLevel(const LoD& in, size_t level, size_t elem_begin,
                  size_t elem_end);
 /*
diff --git a/paddle/framework/operator.cc b/paddle/framework/operator.cc
index ef2c55cc3799ba2fac54f3c9370505b63ef22ad3..be1373dc2a86b18f780422da9528a376f59a5837 100644
--- a/paddle/framework/operator.cc
+++ b/paddle/framework/operator.cc
@@ -11,6 +11,7 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
+#include <gflags/gflags.h>
 #include <glog/logging.h>
 
 #include <algorithm>
@@ -21,6 +22,10 @@ limitations under the License. */
 #include "paddle/framework/shape_inference.h"
 #include "paddle/framework/var_type.h"
 
+DEFINE_bool(op_sync, false,
+            "Default cuda is asynchronous device, set to True will "
+            "force op run in synchronous mode.");
+
 namespace paddle {
 namespace framework {
 
@@ -75,7 +80,9 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   Variable* var = scope.FindVar(name);
   if (var == nullptr) {
     return DDim({-1});
-  } else if (var->IsType<LoDTensor>()) {
+  }
+
+  if (var->IsType<LoDTensor>()) {
     return var->Get<LoDTensor>().dims();
   } else if (var->IsType<SelectedRows>()) {
     return var->Get<SelectedRows>().GetCompleteDims();
@@ -84,6 +91,21 @@ static DDim GetDims(const Scope& scope, const std::string& name) {
   }
 }
 
+static LoD GetLoD(const Scope& scope, const std::string& name) {
+  Variable* var = scope.FindVar(name);
+  auto default_lod = LoD({{}});
+
+  if (var == nullptr) {
+    return default_lod;
+  }
+
+  if (var->IsType<LoDTensor>()) {
+    return var->Get<LoDTensor>().lod();
+  } else {
+    return default_lod;
+  }
+}
+
 std::string OperatorBase::Input(const std::string& name) const {
   auto& ins = Inputs(name);
   PADDLE_ENFORCE_LE(ins.size(), 1UL,
@@ -125,7 +147,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < input.second.size(); ++i) {
       ss << input.second[i];
       if (scope) {
-        ss << "(" << GetDims(*scope, input.second[i]) << ")";
+        ss << "[" << GetDims(*scope, input.second[i]) << "]";
+        ss << "(" << GetLoD(*scope, input.second[i]) << ")";
       }
       if (i != input.second.size() - 1) {
         ss << ", ";
@@ -144,7 +167,8 @@ std::string OperatorBase::DebugStringEx(const Scope* scope) const {
     for (size_t i = 0; i < output.second.size(); ++i) {
       ss << output.second[i];
       if (scope) {
-        ss << "(" << GetDims(*scope, output.second[i]) << ")";
+        ss << "[" << GetDims(*scope, output.second[i]) << "]";
+        ss << "(" << GetLoD(*scope, output.second[i]) << ")";
       }
       if (i != output.second.size() - 1) {
         ss << ", ";
@@ -542,8 +566,14 @@ void OperatorWithKernel::Run(const Scope& scope,
 
   auto kernel_iter = kernels.find(expected_kernel_key);
 
-  kernel_iter->second->Compute(ExecutionContext(
-      *this, new_scope, *pool.Get(expected_kernel_key.place_)));
+  auto* new_dev_ctx = pool.Get(expected_kernel_key.place_);
+  kernel_iter->second->Compute(
+      ExecutionContext(*this, new_scope, *new_dev_ctx));
+
+  /*For profiling/benchmark only*/
+  if (FLAGS_op_sync) {
+    new_dev_ctx->Wait();
+  }
 }
 
 proto::DataType OperatorWithKernel::IndicateDataType(
diff --git a/paddle/framework/tensor_util.h b/paddle/framework/tensor_util.h
index f541d2ba693a169d074c070dd794a2dd4e52aabf..091b63bf0f907a5449f08f0e36abb6577fa5e43e 100644
--- a/paddle/framework/tensor_util.h
+++ b/paddle/framework/tensor_util.h
@@ -116,8 +116,8 @@ inline void Copy(const Tensor& src, const platform::Place& dst_place,
  * @param[in] src        The external tensor.
  * @param[in] ctx        The device context contains device resources.
  *
- * * @note    CopyFromVector assumes that the tensor has been resized
- *            before invoking.
+ * * @note    CopyFromVector will resize dst to a 1D tensor with the same
+ *            size as src.
  */
 template <typename T>
 inline void CopyFromVector(const std::vector<T>& src,
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index 2d9055a06a467be8094ff01330f750a4088decc1..9ad02aa9cf4b4c45a4107d82636120a89349b143 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -135,6 +135,7 @@ op_library(detection_output_op DEPS softmax)
 op_library(sequence_softmax_op DEPS softmax)
 op_library(sum_op DEPS selected_rows_functor)
 op_library(sgd_op DEPS selected_rows_functor)
+op_library(print_op DEPS lod_tensor)
 op_library(adagrad_op DEPS selected_rows_functor)
 op_library(conv_op DEPS vol2col)
 op_library(pool_op DEPS pooling)
diff --git a/paddle/operators/assign_value_op.cc b/paddle/operators/assign_value_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d5671c1183a0f58d2aedb0723bd462684ac5636e
--- /dev/null
+++ b/paddle/operators/assign_value_op.cc
@@ -0,0 +1,73 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/assign_value_op.h"
+
+namespace paddle {
+namespace operators {
+
+class AssignValueOp : public framework::OperatorWithKernel {
+ public:
+  AssignValueOp(const std::string &type,
+                const framework::VariableNameMap &inputs,
+                const framework::VariableNameMap &outputs,
+                const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of AssignValueOp should not be null.");
+    auto shape = ctx->Attrs().Get<std::vector<int>>("shape");
+    ctx->SetOutputDim("Out", framework::make_ddim(shape));
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    return framework::OpKernelType(
+        framework::proto::DataType(ctx.Attr<int>("dtype")), ctx.GetPlace());
+  }
+};
+
+class AssignValueOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  AssignValueOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "(Tensor) Output tensor of assign_value operator.");
+    AddAttr<std::vector<int>>("shape",
+                              "(vector<int>) "
+                              "Shape of values.");
+    AddAttr<int>("dtype", "data type of values")
+        .InEnum({framework::proto::DataType::INT32,
+                 framework::proto::DataType::FP32});
+    AddAttr<std::vector<float>>("fp32_values", "store the float values")
+        .SetDefault({});
+    AddAttr<std::vector<int>>("int32_values", "store the int values")
+        .SetDefault({});
+    AddComment(R"DOC(
+AssignValue operator
+
+$$Out = values$$
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+REGISTER_OPERATOR(assign_value, ops::AssignValueOp, ops::AssignValueOpMaker);
+REGISTER_OP_CPU_KERNEL(assign_value, ops::AssignValueKernel<int>,
+                       ops::AssignValueKernel<float>);
diff --git a/paddle/operators/assign_value_op.cu.cc b/paddle/operators/assign_value_op.cu.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b17e20150053cea4c6b9ed6a5f222f77f4a4bd36
--- /dev/null
+++ b/paddle/operators/assign_value_op.cu.cc
@@ -0,0 +1,19 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/assign_value_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
+                        ops::AssignValueKernel<float>);
diff --git a/paddle/operators/assign_value_op.h b/paddle/operators/assign_value_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..db2e43077999fa0f9aaada74026dd701ab2bf464
--- /dev/null
+++ b/paddle/operators/assign_value_op.h
@@ -0,0 +1,50 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class AssignValueKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto shape = ctx.Attr<std::vector<int>>("shape");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    int dtype = ctx.Attr<int>("dtype");
+    const char* value_name = nullptr;
+    switch (dtype) {
+      case framework::proto::DataType::INT32:
+        value_name = "int32_values";
+        break;
+      case framework::proto::DataType::FP32:
+        value_name = "fp32_values";
+        break;
+      default:
+        PADDLE_THROW("Unsupported dtype for assign_value_op: %d", dtype);
+        break;
+    }
+    auto values = ctx.Attr<std::vector<T>>(value_name);
+    framework::CopyFromVector(values, ctx.device_context(), out);
+    out->Resize(framework::make_ddim(shape));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/CMakeLists.txt b/paddle/operators/detail/CMakeLists.txt
index f6bdc63cc2cfae526fe911ee4d989675452d5c5d..571a75c9dcd903672d460f192bf28ddbeaea7c78 100644
--- a/paddle/operators/detail/CMakeLists.txt
+++ b/paddle/operators/detail/CMakeLists.txt
@@ -1 +1 @@
-grpc_library(sendrecvop_grpc SRCS recv_impl.cc send_impl.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
+grpc_library(sendrecvop_grpc SRCS sendrecvop_utils.cc grpc_client.cc grpc_server.cc PROTO send_recv.proto DEPS lod_tensor selected_rows)
diff --git a/paddle/operators/detail/grpc_client.cc b/paddle/operators/detail/grpc_client.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5a4db2d7e686ce84abef620f890be8f3aa82cb73
--- /dev/null
+++ b/paddle/operators/detail/grpc_client.cc
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "grpc_client.h"
+namespace paddle {
+namespace operators {
+namespace detail {
+
+bool RPCClient::AsyncSendVariable(const std::string& ep,
+                                  const platform::DeviceContext& ctx,
+                                  const framework::Scope& scope,
+                                  const std::string& var_name,
+                                  int64_t time_out) {
+  sendrecv::VariableMessage req;
+  auto* var = scope.FindVar(var_name);
+  SerializeToMessage(var_name, var, ctx, &req);
+
+  // varhandle
+  VarHandle var_h;
+  var_h.ep = ep;
+  var_h.scope = &scope;
+  var_h.name = var_name;
+  var_h.ctx = &ctx;
+
+  // stub context
+  auto ch = GetChannel(ep);
+  SendProcessor* s = new SendProcessor(ch);
+  s->Prepare(var_h, time_out);
+  s->response_call_back_ = NULL;
+
+  auto rpc = s->stub_->AsyncSendVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+
+  req_count_++;
+
+  return true;
+}
+
+void ProcGetResponse(const VarHandle& var_h,
+                     const sendrecv::VariableMessage& ret_msg) {
+  // Look up the destination variable in the scope recorded in the handle and
+  // hand the reply message to DeserializeFromMessage to fill it in.
+  auto* outvar = var_h.scope->FindVar(var_h.name);
+  DeserializeFromMessage(ret_msg, *var_h.ctx, outvar);
+}
+
+bool RPCClient::AsyncGetVariable(const std::string& ep,
+                                 const platform::DeviceContext& ctx,
+                                 const framework::Scope& scope,
+                                 const std::string& var_name,
+                                 int64_t time_out) {
+  sendrecv::VariableMessage req;
+  req.set_varname(var_name);
+
+  auto* var = scope.FindVar(var_name);
+  SerializeToMessage(var_name, var, ctx, &req);
+
+  // varhandle
+  VarHandle var_h;
+  var_h.ep = ep;
+  var_h.scope = &scope;
+  var_h.name = var_name;
+  var_h.ctx = &ctx;
+
+  // stub context
+  auto ch = GetChannel(ep);
+  GetProcessor* s = new GetProcessor(ch);
+  s->Prepare(var_h, time_out);
+  s->response_call_back_ = ProcGetResponse;
+
+  auto rpc = s->stub_->AsyncGetVariable(s->context_.get(), req, &cq_);
+  rpc->Finish(&s->reply_, &s->status_, (void*)s);
+
+  req_count_++;
+
+  return true;
+}
+
+bool RPCClient::wait() {
+  // Drain the completion queue until no issued request is still pending.
+  // Every successful Proceed() call retires exactly one request.
+  while (req_count_ > 0) {
+    if (Proceed()) {
+      continue;
+    }
+
+    // Proceed() only reports failure when CompletionQueue::Next() returns
+    // false, so stop waiting and tell the caller.
+    LOG(ERROR) << "Get meets CompletionQueue error";
+    return false;
+  }
+
+  return true;
+}
+
+bool RPCClient::Proceed() {
+  void* tag = NULL;
+  bool ok = false;
+
+  // Block for the next completed request; false means the queue is shut down.
+  if (!cq_.Next(&tag, &ok)) {
+    return false;
+  }
+  req_count_--;
+
+  GPR_ASSERT(ok);
+  PADDLE_ENFORCE(tag);
+
+  // TODO(gongwb): add more retries. Until then, at least report the failure.
+  ClientBase* c = static_cast<ClientBase*>(tag);
+  if (c->status_.ok()) {
+    c->Process();
+  } else {
+    LOG(ERROR) << c->var_h_.String() << " rpc failed: "
+               << c->status_.error_message();
+  }
+  delete c;
+  return true;
+}
+
+std::shared_ptr<grpc::Channel> RPCClient::GetChannel(const std::string& ep) {
+  auto it = channels_.find(ep);
+  if (it != channels_.end()) {
+    return it->second;
+  }
+
+  auto ch = std::shared_ptr<grpc::Channel>(
+      grpc::CreateChannel(ep, grpc::InsecureChannelCredentials()));
+
+  channels_[ep] = ch;
+  return ch;
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/grpc_client.h b/paddle/operators/detail/grpc_client.h
new file mode 100644
index 0000000000000000000000000000000000000000..d27b5ced9ece67f9b9da3b7f87ec231477603580
--- /dev/null
+++ b/paddle/operators/detail/grpc_client.h
@@ -0,0 +1,147 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <grpc++/grpc++.h>
+#include <grpc/support/log.h>
+#include <time.h>
+#include <chrono>
+#include <ctime>
+#include <functional>
+#include <iostream>
+#include <map>
+#include <string>
+#include <vector>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/operators/detail/sendrecvop_utils.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+struct VarHandle {
+  std::string ep;
+  const platform::DeviceContext* ctx;
+  const framework::Scope* scope;
+  std::string name;
+
+  std::string String() const {
+    std::ostringstream s;
+    s << "name:[" << name << "] ep:[" << ep << "]";
+    return s.str();
+  }
+};
+
+void ProcGetResponse(const VarHandle& var_h,
+                     const sendrecv::VariableMessage& msg);
+
+class ClientBase {
+ public:
+  explicit ClientBase(std::shared_ptr<grpc::Channel> ch) {
+    stub_ = sendrecv::SendRecvService::NewStub(ch);
+    context_ = NULL;
+  }
+
+  virtual ~ClientBase() {}
+
+  virtual void Prepare(const VarHandle& var_info, int64_t time_out) {
+    context_.reset(new grpc::ClientContext());
+    var_h_ = var_info;
+
+    std::chrono::system_clock::time_point deadline =
+        std::chrono::system_clock::now() + std::chrono::milliseconds(time_out);
+
+    context_->set_deadline(deadline);
+  }
+
+  virtual void Process() = 0;
+
+  std::unique_ptr<sendrecv::SendRecvService::Stub> stub_;
+  std::unique_ptr<grpc::ClientContext> context_;
+  grpc::Status status_;
+  VarHandle var_h_;
+};
+
+typedef std::function<void(const VarHandle&, const sendrecv::VoidMessage&)>
+    RequestSendCallBack;
+
+class SendProcessor : public ClientBase {
+ public:
+  explicit SendProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {}
+
+  virtual ~SendProcessor() {}
+
+  virtual void Process() {
+    if (response_call_back_) {
+      response_call_back_(var_h_, reply_);
+    }
+  }
+
+  sendrecv::VoidMessage reply_;
+  RequestSendCallBack response_call_back_ = NULL;
+};
+
+typedef std::function<void(const VarHandle&, const sendrecv::VariableMessage&)>
+    RequestGetCallBack;
+
+class GetProcessor : public ClientBase {
+ public:
+  explicit GetProcessor(std::shared_ptr<grpc::Channel> ch) : ClientBase(ch) {}
+
+  virtual ~GetProcessor() {}
+
+  virtual void Process() {
+    if (response_call_back_) {
+      response_call_back_(var_h_, reply_);
+    }
+  }
+
+  sendrecv::VariableMessage reply_;
+  RequestGetCallBack response_call_back_ = ProcGetResponse;
+};
+
+class RPCClient {
+ public:
+  bool AsyncSendVariable(const std::string& ep,
+                         const platform::DeviceContext& ctx,
+                         const framework::Scope& scope,
+                         const std::string& var_name,
+                         int64_t time_out = 600 * 1000);
+
+  bool AsyncGetVariable(const std::string& ep,
+                        const platform::DeviceContext& ctx,
+                        const framework::Scope& scope,
+                        const std::string& var_name,
+                        int64_t time_out = 600 * 1000);
+  bool wait();
+
+ private:
+  bool Proceed();
+  std::shared_ptr<grpc::Channel> GetChannel(const std::string& ep);
+
+ private:
+  grpc::CompletionQueue cq_;
+  std::map<std::string, std::shared_ptr<grpc::Channel>> channels_;
+  int64_t req_count_ = 0;
+};
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/grpc_server.cc b/paddle/operators/detail/grpc_server.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e8d561a57ff59e9221400241f881cb26fb6c6f06
--- /dev/null
+++ b/paddle/operators/detail/grpc_server.cc
@@ -0,0 +1,237 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detail/grpc_server.h"
+
+using grpc::ServerAsyncResponseWriter;
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+enum CallStatus { PROCESS = 0, FINISH };
+
+// reference:
+// https://stackoverflow.com/questions/41732884/grpc-multiple-services-in-cpp-async-server
+class RequestBase {
+ public:
+  explicit RequestBase(sendrecv::SendRecvService::AsyncService* service,
+                       grpc::ServerCompletionQueue* cq)
+      : service_(service), cq_(cq), status_(PROCESS) {}
+  virtual ~RequestBase() {}
+  virtual void Process() { assert(false); }
+
+  CallStatus Status() { return status_; }
+  void SetStatus(CallStatus status) { status_ = status; }
+
+ protected:
+  grpc::ServerContext ctx_;
+  sendrecv::SendRecvService::AsyncService* service_;
+  grpc::ServerCompletionQueue* cq_;
+  CallStatus status_;
+};
+
+typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
+
+class RequestSend final : public RequestBase {
+ public:
+  explicit RequestSend(sendrecv::SendRecvService::AsyncService* service,
+                       grpc::ServerCompletionQueue* cq,
+                       SimpleBlockQueue<MessageWithName>* queue)
+      : RequestBase(service, cq), queue_(queue), responder_(&ctx_) {
+    service_->RequestSendVariable(&ctx_, &request_, &responder_, cq_, cq_,
+                                  this);
+  }
+
+  virtual ~RequestSend() {}
+
+  virtual void Process() {
+    MessageWithName msg_with_name =
+        std::make_pair(request_.varname(), std::move(request_));
+    queue_->Push(std::move(msg_with_name));
+    // TODO(gongwb): check var's info.
+    responder_.Finish(reply_, grpc::Status::OK, this);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  sendrecv::VoidMessage reply_;
+  SimpleBlockQueue<MessageWithName>* queue_;
+  ServerAsyncResponseWriter<sendrecv::VoidMessage> responder_;
+};
+
+class RequestGet final : public RequestBase {
+ public:
+  explicit RequestGet(sendrecv::SendRecvService::AsyncService* service,
+                      grpc::ServerCompletionQueue* cq, framework::Scope* scope)
+      : RequestBase(service, cq), responder_(&ctx_), scope_(scope) {
+    service_->RequestGetVariable(&ctx_, &request_, &responder_, cq_, cq_, this);
+  }
+
+  virtual ~RequestGet() {}
+
+  virtual void Process() {
+    // proc request.
+    std::string var_name = request_.varname();
+    auto* var = scope_->FindVar(var_name);
+    SerializeToMessage(var_name, var, platform::CPUDeviceContext(), &reply_);
+    // TODO(gongwb): check var's info.
+    responder_.Finish(reply_, grpc::Status::OK, this);
+  }
+
+ protected:
+  sendrecv::VariableMessage request_;
+  sendrecv::VariableMessage reply_;
+  ServerAsyncResponseWriter<sendrecv::VariableMessage> responder_;
+  framework::Scope* scope_;
+};
+
+void AsyncGRPCServer::RunSyncUpdate() {
+  grpc::ServerBuilder builder;
+  builder.AddListeningPort(address_, grpc::InsecureServerCredentials());
+  builder.RegisterService(&service_);
+
+  cq_send_ = builder.AddCompletionQueue();
+  cq_get_ = builder.AddCompletionQueue();
+  server_ = builder.BuildAndStart();
+  LOG(INFO) << "Server listening on " << address_ << std::endl;
+
+  std::function<void()> send_register =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewSendOne, this);
+  std::function<void()> get_register =
+      std::bind(&AsyncGRPCServer::TryToRegisterNewGetOne, this);
+
+  t_send_.reset(
+      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, false,
+                                cq_send_.get(), "cq_send", send_register)));
+
+  t_get_.reset(
+      new std::thread(std::bind(&AsyncGRPCServer::HandleRequest, this, true,
+                                cq_get_.get(), "cq_get", get_register)));
+
+  // wait server
+  server_->Wait();
+  t_send_->join();
+  t_get_->join();
+}
+
+void AsyncGRPCServer::ShutdownQueue() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  cq_send_->Shutdown();
+  cq_get_->Shutdown();
+  is_shut_down_ = true;
+}
+
+// This URL explains why shutdown is complicated:
+// https://stackoverflow.com/questions/35708348/grpc-what-is-the-recommended-way-to-shut-down-an-asynchronous-server-in-c
+void AsyncGRPCServer::ShutDown() {
+  server_->Shutdown();
+  ShutdownQueue();
+}
+
+void AsyncGRPCServer::TryToRegisterNewSendOne() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    return;
+  }
+  RequestSend* send =
+      new RequestSend(&service_, cq_send_.get(), &var_recv_queue_);
+  VLOG(4) << "create RequestSend status:" << send->Status();
+}
+
+void AsyncGRPCServer::TryToRegisterNewGetOne() {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    return;
+  }
+  RequestGet* get = new RequestGet(&service_, cq_get_.get(), scope_);
+  VLOG(4) << "create Requestget status:" << get->Status();
+}
+
+void AsyncGRPCServer::SetFinishOrDelete(RequestBase*& last) {
+  std::unique_lock<std::mutex> lock(cq_mutex_);
+  if (is_shut_down_) {
+    delete last;
+    last = NULL;
+    return;
+  }
+
+  last->SetStatus(FINISH);
+  return;
+}
+
+void AsyncGRPCServer::HandleRequest(bool wait, grpc::ServerCompletionQueue* cq,
+                                    std::string cq_name,
+                                    std::function<void()> TryToRegisterNewOne) {
+  TryToRegisterNewOne();
+
+  void* tag = NULL;
+  bool ok = false;
+  while (true) {
+    if (!cq->Next(&tag, &ok)) {
+      LOG(INFO) << cq_name << " get CompletionQueue shutdown!";
+      break;
+    }
+
+    if (wait && !done_) {
+      Wait();
+    }
+
+    RequestBase* base = (RequestBase*)tag;
+    if (!ok) {
+      VLOG(4) << cq_name << " recv no regular event";
+      TryToRegisterNewOne();
+      delete base;
+      continue;
+    }
+
+    switch (base->Status()) {
+      case PROCESS: {
+        VLOG(4) << cq_name << " status:" << base->Status();
+        TryToRegisterNewOne();
+        base->Process();
+        SetFinishOrDelete(base);
+        break;
+      }
+      case FINISH: {
+        VLOG(4) << cq_name << " status:" << base->Status();
+        delete base;
+        break;
+      }
+      default: { assert(false); }
+    }
+  }
+}
+
+void AsyncGRPCServer::Wait() {
+  std::unique_lock<std::mutex> lock(this->mutex_);
+  condition_.wait(lock, [=] { return this->done_ == true; });
+}
+
+void AsyncGRPCServer::Reset() {
+  std::lock_guard<std::mutex> lock(this->mutex_);
+  done_ = false;
+}
+
+void AsyncGRPCServer::Done() {
+  {
+    std::lock_guard<std::mutex> lock(this->mutex_);
+    done_ = true;
+  }
+  condition_.notify_all();
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/grpc_server.h b/paddle/operators/detail/grpc_server.h
new file mode 100644
index 0000000000000000000000000000000000000000..041fe05b2e9c37e8a91669b8f523c47b56e14cba
--- /dev/null
+++ b/paddle/operators/detail/grpc_server.h
@@ -0,0 +1,91 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/var_type.h"
+#include "paddle/operators/detail/simple_block_queue.h"
+
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+#include <grpc++/grpc++.h>
+#include <grpc/support/log.h>
+#include <thread>
+#include "paddle/operators/detail/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
+class RequestBase;
+
+class AsyncGRPCServer final : public sendrecv::SendRecvService::Service {
+ public:
+  explicit AsyncGRPCServer(std::string address) { address_ = address; }
+
+  void RunSyncUpdate();
+
+  void Reset();
+
+  void Done();
+
+  void SetScope(framework::Scope *scope) { scope_ = scope; }
+
+  const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
+
+  void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
+
+  void ShutDown();
+
+ protected:
+  void Wait();
+  void HandleRequest(bool wait, grpc::ServerCompletionQueue *cq,
+                     std::string cq_name,
+                     std::function<void()> TryToRegisterNewOne);
+  void TryToRegisterNewSendOne();
+  void TryToRegisterNewGetOne();
+  void SetFinishOrDelete(RequestBase *&last);
+  void ShutdownQueue();
+
+ private:
+  std::mutex cq_mutex_;
+  volatile bool is_shut_down_ = false;
+  std::unique_ptr<grpc::ServerCompletionQueue> cq_send_;
+  std::unique_ptr<grpc::ServerCompletionQueue> cq_get_;
+
+  sendrecv::SendRecvService::AsyncService service_;
+  std::unique_ptr<grpc::Server> server_;
+
+  std::string address_;
+  framework::Scope *scope_ = nullptr;
+  // received variable from RPC, operators fetch variable from this queue.
+  SimpleBlockQueue<MessageWithName> var_recv_queue_;
+
+  // Starts false (same state as Reset()); Wait() blocks until Done() sets it.
+  std::mutex mutex_;
+  volatile mutable bool done_ = false;
+  std::condition_variable condition_;
+
+  std::unique_ptr<std::thread> t_send_;
+  std::unique_ptr<std::thread> t_get_;
+};
+
+};  // namespace detail
+};  // namespace operators
+};  // namespace paddle
diff --git a/paddle/operators/detail/recv_impl.cc b/paddle/operators/detail/recv_impl.cc
deleted file mode 100644
index 319404e56a5f3c407f313991240bbbb85fd39a2a..0000000000000000000000000000000000000000
--- a/paddle/operators/detail/recv_impl.cc
+++ /dev/null
@@ -1,65 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "send_recv_impl.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-Status SendRecvServerImpl::SendVariable(ServerContext *context,
-                                        const VariableMessage *in_var,
-                                        VoidMessage *out_var) {
-  MessageWithName msg_with_name =
-      std::make_pair(in_var->varname(), std::move(*in_var));
-  var_recv_queue_.Push(std::move(msg_with_name));
-  return Status::OK;
-}
-
-Status SendRecvServerImpl::GetVariable(ServerContext *context,
-                                       const VariableMessage *in_var,
-                                       VariableMessage *out_var) {
-  std::string get_var_name = in_var->varname();
-  auto *var = scope_->FindVar(get_var_name);
-
-  SerializeToMessage(get_var_name, var, platform::CPUDeviceContext(), out_var);
-  return Status::OK;
-}
-
-Status SendRecvServerImpl::Wait(ServerContext *context,
-                                const VoidMessage *in_var,
-                                VoidMessage *out_var) {
-  {
-    std::unique_lock<std::mutex> lock(this->mutex_);
-    condition_.wait(lock, [=] { return this->done_ == true; });
-  }
-  return Status::OK;
-}
-
-void SendRecvServerImpl::Reset() {
-  std::lock_guard<std::mutex> lock(this->mutex_);
-  done_ = false;
-}
-
-void SendRecvServerImpl::Done() {
-  {
-    std::lock_guard<std::mutex> lock(this->mutex_);
-    done_ = true;
-  }
-  condition_.notify_all();
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/detail/send_impl.cc b/paddle/operators/detail/send_impl.cc
deleted file mode 100644
index ae85cf2cec2cd8e046c0c7fd3408f2212f225819..0000000000000000000000000000000000000000
--- a/paddle/operators/detail/send_impl.cc
+++ /dev/null
@@ -1,67 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "send_recv_impl.h"
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-bool RPCClient::SendVariable(const framework::Scope& scope,
-                             const std::string& inname) {
-  ClientContext context;
-  VariableMessage msg;
-  VoidMessage out_msg;
-  // FIXME(typhoonzero): pass device context to here.
-  auto ctx = platform::CPUDeviceContext();
-  auto* var = scope.FindVar(inname);
-  PADDLE_ENFORCE(var);
-  SerializeToMessage(inname, var, ctx, &msg);
-
-  Status status = stub_->SendVariable(&context, msg, &out_msg);
-  if (!status.ok()) {
-    LOG(ERROR) << "gRPC error: " << status.error_message();
-    return false;
-  }
-  return true;
-}
-
-bool RPCClient::GetVariable(const framework::Scope& scope,
-                            const std::string& outname) {
-  ClientContext context;
-  VariableMessage call_msg, ret_msg;
-  call_msg.set_varname(outname);
-  auto ctx = platform::CPUDeviceContext();
-  Status status = stub_->GetVariable(&context, call_msg, &ret_msg);
-  auto* outvar = scope.FindVar(outname);
-  if (!status.ok()) {
-    LOG(ERROR) << "gRPC error: " << status.error_message();
-    return false;
-  }
-
-  std::istringstream iss(ret_msg.serialized());
-  DeserializeFromMessage(ret_msg, ctx, outvar);
-
-  return true;
-}
-
-void RPCClient::Wait() {
-  ClientContext context;
-  VoidMessage call_msg, ret_msg;
-  stub_->Wait(&context, call_msg, &ret_msg);
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/detail/send_recv.proto b/paddle/operators/detail/send_recv.proto
index f141c755ce14ef540aeab32c11c289179aff3f8c..8f962b4c69cc83dc2ab98b7dc27e18bc4b42bf18 100644
--- a/paddle/operators/detail/send_recv.proto
+++ b/paddle/operators/detail/send_recv.proto
@@ -21,8 +21,6 @@ service SendRecvService {
   rpc SendVariable(VariableMessage) returns (VoidMessage) {}
   // Argument VariableMessage for GetVariable should only contain varname.
   rpc GetVariable(VariableMessage) returns (VariableMessage) {}
-  // wait for one execution of the program
-  rpc Wait(VoidMessage) returns (VoidMessage) {}
 }
 
 // VariableMessage is serialized paddle variable message.
diff --git a/paddle/operators/detail/send_recv_impl.h b/paddle/operators/detail/send_recv_impl.h
deleted file mode 100644
index 1fe54f1f0536aed7d41bbdeeca076534abafe98d..0000000000000000000000000000000000000000
--- a/paddle/operators/detail/send_recv_impl.h
+++ /dev/null
@@ -1,141 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-
-#include "paddle/framework/lod_tensor.h"
-#include "paddle/framework/scope.h"
-#include "paddle/framework/selected_rows.h"
-#include "paddle/framework/var_type.h"
-#include "paddle/operators/detail/simple_block_queue.h"
-
-#include "paddle/operators/detail/send_recv.grpc.pb.h"
-#include "paddle/operators/detail/send_recv.pb.h"
-
-#include <grpc++/grpc++.h>
-
-using grpc::Channel;
-using grpc::Server;
-using grpc::ServerContext;
-using grpc::ServerReader;
-using grpc::ServerBuilder;
-
-using grpc::ClientContext;
-using grpc::ClientReader;
-using grpc::ClientReaderWriter;
-using grpc::ClientWriter;
-using grpc::Status;
-using sendrecv::SendRecvService;
-using sendrecv::VariableMessage;
-using sendrecv::VoidMessage;
-
-namespace paddle {
-namespace operators {
-namespace detail {
-
-typedef std::pair<std::string, sendrecv::VariableMessage> MessageWithName;
-
-class SendRecvServerImpl final : public SendRecvService::Service {
- public:
-  explicit SendRecvServerImpl() {}
-
-  Status SendVariable(ServerContext *context, const VariableMessage *in_var,
-                      VoidMessage *out_var) override;
-  Status GetVariable(ServerContext *context, const VariableMessage *in_var,
-                     VariableMessage *out_var) override;
-  Status Wait(ServerContext *context, const VoidMessage *in_var,
-              VoidMessage *out_var) override;
-  void Reset();
-  void Done();
-  void SetScope(framework::Scope *scope) { scope_ = scope; };
-
-  const MessageWithName Get() { return this->var_recv_queue_.Pop(); }
-
-  void Push(const MessageWithName &msg) { this->var_recv_queue_.Push(msg); }
-
- private:
-  // received variable from RPC, operators fetch variable from this queue.
-  SimpleBlockQueue<MessageWithName> var_recv_queue_;
-  framework::Scope *scope_;
-  // condition of the sub program
-  std::mutex mutex_;
-  bool done_;
-  std::condition_variable condition_;
-};
-
-// RPCClient is a class to send tensors to pserver sub-network
-// using different hashing methods.
-class RPCClient {
- public:
-  RPCClient(std::shared_ptr<Channel> channel)
-      : stub_(SendRecvService::NewStub(channel)) {}
-
-  bool SendVariable(const framework::Scope &scope, const std::string &inname);
-  bool GetVariable(const framework::Scope &scope, const std::string &outname);
-  void Wait();
-
- private:
-  std::unique_ptr<SendRecvService::Stub> stub_;
-};
-
-inline void SerializeToMessage(const std::string &name,
-                               const framework::Variable *var,
-                               const platform::DeviceContext &ctx,
-                               VariableMessage *msg) {
-  msg->set_varname(name);
-  std::ostringstream oss;
-  switch (framework::ToVarType(var->Type())) {
-    case framework::proto::VarDesc_VarType_LOD_TENSOR:
-      msg->set_type(sendrecv::VarType::LOD_TENSOR);
-      framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
-      break;
-    case framework::proto::VarDesc_VarType_SELECTED_ROWS:
-      msg->set_type(sendrecv::VarType::SELECTED_ROWS);
-      framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
-                                   ctx);
-      break;
-    default: {
-      PADDLE_THROW("Serialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-  msg->set_serialized(oss.str());
-}
-
-inline void DeserializeFromMessage(const VariableMessage &msg,
-                                   const platform::DeviceContext &ctx,
-                                   framework::Variable *var) {
-  using namespace paddle::framework::proto;
-  std::istringstream iss(msg.serialized());
-  switch (msg.type()) {
-    case sendrecv::VarType::LOD_TENSOR:
-      DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
-      break;
-    case sendrecv::VarType::SELECTED_ROWS: {
-      DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
-                            ctx);
-      break;
-    }
-    default: {
-      PADDLE_THROW("Deserialize does not support type: %s",
-                   typeid(var->Type()).name());
-      break;
-    }
-  }
-}
-
-}  // namespace detail
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/operators/detail/sendrecvop_utils.cc b/paddle/operators/detail/sendrecvop_utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7635b9e8dbdff624bb42a9de346b8d05a980f9b6
--- /dev/null
+++ b/paddle/operators/detail/sendrecvop_utils.cc
@@ -0,0 +1,68 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/detail/sendrecvop_utils.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+
+// Serializes `var` (LoDTensor or SelectedRows) under `name` into `msg`.
+void SerializeToMessage(const std::string& name, const framework::Variable* var,
+                        const platform::DeviceContext& ctx,
+                        sendrecv::VariableMessage* msg) {
+  msg->set_varname(name);
+  std::ostringstream oss;
+  switch (framework::ToVarType(var->Type())) {
+    case framework::proto::VarDesc_VarType_LOD_TENSOR:
+      msg->set_type(sendrecv::VarType::LOD_TENSOR);
+      framework::SerializeToStream(oss, var->Get<framework::LoDTensor>(), ctx);
+      break;
+    case framework::proto::VarDesc_VarType_SELECTED_ROWS:
+      msg->set_type(sendrecv::VarType::SELECTED_ROWS);
+      framework::SerializeToStream(oss, var->Get<framework::SelectedRows>(),
+                                   ctx);
+      break;
+    default: {
+      // Name the held type; typeid(var->Type()) would name the wrapper type.
+      PADDLE_THROW("Serialize does not support type: %s", var->Type().name());
+    }
+  }
+  msg->set_serialized(oss.str());
+}
+
+// Deserializes `msg` into `var` as the kind selected by msg.type().
+void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
+                            const platform::DeviceContext& ctx,
+                            framework::Variable* var) {
+  std::istringstream iss(msg.serialized());
+  switch (msg.type()) {
+    case sendrecv::VarType::LOD_TENSOR:
+      DeserializeFromStream(iss, var->GetMutable<framework::LoDTensor>(), ctx);
+      break;
+    case sendrecv::VarType::SELECTED_ROWS:
+      DeserializeFromStream(iss, var->GetMutable<framework::SelectedRows>(),
+                            ctx);
+      break;
+    default: {
+      // The rejected value is the message's type tag, not what `var` holds.
+      PADDLE_THROW("Deserialize does not support type: %d",
+                   static_cast<int>(msg.type()));
+    }
+  }
+}
+
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/detail/sendrecvop_utils.h b/paddle/operators/detail/sendrecvop_utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..bc6581afab93c626c7c2439d699c6c2d858df9fa
--- /dev/null
+++ b/paddle/operators/detail/sendrecvop_utils.h
@@ -0,0 +1,42 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <iostream>
+#include <string>
+#include <vector>
+
+#include "paddle/framework/data_type.h"
+#include "paddle/framework/lod_tensor.h"
+#include "paddle/framework/scope.h"
+#include "paddle/framework/selected_rows.h"
+#include "paddle/framework/var_type.h"
+
+#include "paddle/operators/detail/send_recv.grpc.pb.h"
+#include "paddle/operators/detail/send_recv.pb.h"
+
+namespace paddle {
+namespace operators {
+namespace detail {
+// Serializes `var` under the name `name` into `msg`, using `ctx`.
+void SerializeToMessage(const std::string& name, const framework::Variable* var,
+                        const platform::DeviceContext& ctx,
+                        sendrecv::VariableMessage* msg);
+// Deserializes the payload of `msg` into `var`, using `ctx`.
+void DeserializeFromMessage(const sendrecv::VariableMessage& msg,
+                            const platform::DeviceContext& ctx,
+                            framework::Variable* var);
+}  // namespace detail
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/elementwise_add_op.cc b/paddle/operators/elementwise_add_op.cc
index 70b7c9f2ec11bf8ad56a24324a53792955edc77d..37951fa7587c8a200f4733e5a46575461f5026cd 100644
--- a/paddle/operators/elementwise_add_op.cc
+++ b/paddle/operators/elementwise_add_op.cc
@@ -21,7 +21,7 @@ class ElementwiseAddOpMaker : public ElementwiseOpMaker {
  public:
   ElementwiseAddOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Add", "$Out = X + Y$");
+    SetComment("Add", "Out = X + Y");
     AddComment(comment_);
   }
 };
diff --git a/paddle/operators/elementwise_div_op.cc b/paddle/operators/elementwise_div_op.cc
index 1fa960866fa2066a351ef2e65a3c77cf8b6595f7..6ebd58b1b3dd79a465e70a24f7aab56261290bf6 100644
--- a/paddle/operators/elementwise_div_op.cc
+++ b/paddle/operators/elementwise_div_op.cc
@@ -21,7 +21,7 @@ class ElementwiseDivOpMaker : public ElementwiseOpMaker {
  public:
   ElementwiseDivOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Div", "$Out = X / Y$");
+    SetComment("Div", "Out = X / Y");
     AddComment(comment_);
   }
 };
diff --git a/paddle/operators/elementwise_mul_op.cc b/paddle/operators/elementwise_mul_op.cc
index a6d11736194cb79bdc247c721acf8bda9c81dbe5..450dd05c796e22794315274b73398e85c8145940 100644
--- a/paddle/operators/elementwise_mul_op.cc
+++ b/paddle/operators/elementwise_mul_op.cc
@@ -22,7 +22,7 @@ class ElementwiseMulOpMaker : public ElementwiseOpMaker {
  public:
   ElementwiseMulOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Mul", "$Out = X \\odot\\ Y$");
+    SetComment("Mul", "Out = X \\odot\\ Y");
     AddComment(comment_);
   }
 };
diff --git a/paddle/operators/elementwise_op.h b/paddle/operators/elementwise_op.h
index f308ee05e11210540e41cda4b9a896f9f96c4730..a342595b546bfca1a344cf8a549597df6a29adec 100644
--- a/paddle/operators/elementwise_op.h
+++ b/paddle/operators/elementwise_op.h
@@ -58,7 +58,8 @@ Limited Elementwise {name} Operator.
 
 The equation is:
 
-{equation}
+.. math::
+  {equation}
 
 X is a tensor of any dimension and the dimensions of tensor Y must be smaller than
 or equal to the dimensions of X. 
@@ -71,15 +72,16 @@ For case 2:
 Y will be broadcasted to match the shape of X and axis should be 
 the starting dimension index for broadcasting Y onto X.
 
-example:
-  shape(X) = (2, 3, 4, 5), shape(Y) = (,)
-  shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
-  shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
-  shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
-  shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+For example
+  .. code-block:: python
 
-Both the input X and Y can carry the LoD (Level of Details) information,
-or not. But the output only shares the LoD information with input X.
+    shape(X) = (2, 3, 4, 5), shape(Y) = (,)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (5,)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (4, 5)
+    shape(X) = (2, 3, 4, 5), shape(Y) = (3, 4), with axis=1
+    shape(X) = (2, 3, 4, 5), shape(Y) = (2), with axis=0
+
+Either of the inputs X and Y or none can carry the LoD (Level of Details) information. However, the output only shares the LoD information with input X.
 
 )DOC";
     AddComment(comment_);
diff --git a/paddle/operators/elementwise_sub_op.cc b/paddle/operators/elementwise_sub_op.cc
index 2a8d0845b1800277a7d3cd6ff6c5c984e92197ee..d3c51f0a697b7cb07a46871cdba2e84e902fd0f2 100644
--- a/paddle/operators/elementwise_sub_op.cc
+++ b/paddle/operators/elementwise_sub_op.cc
@@ -21,7 +21,7 @@ class ElementwiseSubOpMaker : public ElementwiseOpMaker {
  public:
   ElementwiseSubOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : ElementwiseOpMaker(proto, op_checker) {
-    SetComment("Sub", "$Out = X - Y$");
+    SetComment("Sub", "Out = X - Y");
     AddComment(comment_);
   }
 };
diff --git a/paddle/operators/print_op.cc b/paddle/operators/print_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..89e41d806c7661a3e61e0a944a2a980704297dd9
--- /dev/null
+++ b/paddle/operators/print_op.cc
@@ -0,0 +1,206 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <algorithm>
+#include <ctime>
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+#define CLOG std::cout
+
+// Collects what to show about one tensor and writes it to CLOG (std::cout).
+// An empty name/dims/lod and an unset dtype are skipped when printing.
+struct Formater {
+  std::string message;                  // prefix printed after the timestamp
+  std::string name;                     // tensor name; empty = not printed
+  std::vector<int> dims;                // shape; empty = not printed
+  std::type_index dtype{typeid(char)};  // typeid(char) = "dtype not set"
+  framework::LoD lod;                   // LoD levels; empty = not printed
+  int summarize{-1};                    // max elements to print; -1 = all
+  void* data{nullptr};                  // raw element buffer, must be non-null
+
+  // Prints every configured section; `size` is the element count of `data`.
+  void operator()(size_t size) {
+    PrintMessage();
+    PrintName();
+    PrintDims();
+    PrintDtype();
+    PrintLod();
+    PrintData(size);
+  }
+
+ private:
+  void PrintMessage() { CLOG << std::time(nullptr) << "\t" << message; }
+  void PrintName() {
+    if (!name.empty()) {
+      CLOG << "Tensor[" << name << "]" << std::endl;
+    }
+  }
+  void PrintDims() {
+    if (!dims.empty()) {
+      CLOG << "\tshape: [";
+      for (auto i : dims) {
+        CLOG << i << ",";
+      }
+      CLOG << "]" << std::endl;
+    }
+  }
+  void PrintDtype() {
+    // typeid(char) is the sentinel for "dtype not set".
+    if (dtype != std::type_index(typeid(char))) {
+      CLOG << "\tdtype: " << dtype.name() << std::endl;
+    }
+  }
+  void PrintLod() {
+    if (!lod.empty()) {
+      CLOG << "\tLoD: [";
+      for (const auto& level : lod) {  // by reference: avoid copying a level
+        CLOG << "[ ";
+        for (auto i : level) {
+          CLOG << i << ",";
+        }
+        CLOG << " ]";
+      }
+      CLOG << "]" << std::endl;
+    }
+  }
+
+  // Dispatches on `dtype` to print the buffer; other dtypes print no data.
+  // NOTE(review): `dtype` doubles as the "print dtype" switch, so data is only
+  // printed when the caller has set it -- confirm this coupling is intended.
+  void PrintData(size_t size) {
+    PADDLE_ENFORCE_NOT_NULL(data);
+    // Compare type_index values directly; equal hash codes do not guarantee
+    // equal types.
+    if (dtype == std::type_index(typeid(float))) {
+      Display<float>(size);
+    } else if (dtype == std::type_index(typeid(double))) {
+      Display<double>(size);
+    } else if (dtype == std::type_index(typeid(int))) {
+      Display<int>(size);
+    } else if (dtype == std::type_index(typeid(int64_t))) {
+      Display<int64_t>(size);
+    }
+  }
+
+  // Prints up to `summarize` elements of `data` as T (all when it is -1).
+  template <typename T>
+  void Display(size_t size) {
+    const T* d = static_cast<const T*>(data);
+    // Use a local bound so the `summarize` setting is not overwritten.
+    size_t limit = size;
+    if (summarize != -1) limit = std::min(size, (size_t)summarize);
+    CLOG << "\tdata: ";
+    for (size_t i = 0; i < limit; ++i) CLOG << d[i] << ",";
+    CLOG << std::endl;
+  }
+};
+
+// TODO(ChunweiYan) there should be some other printers for TensorArray
+// Debug operator that prints an input LoDTensor (CPU only) to stdout.
+class TensorPrintOp : public framework::OperatorBase {
+ public:
+  TensorPrintOp(const std::string& type,
+                const framework::VariableNameMap& inputs,
+                const framework::VariableNameMap& outputs,
+                const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
+
+  TensorPrintOp(const TensorPrintOp& o)
+      : framework::OperatorBase(
+            static_cast<const framework::OperatorBase&>(o)) {
+    PADDLE_THROW("Not implemented");
+  }
+
+  // Prints the "input" tensor; which parts are shown is driven by attributes.
+  void Run(const framework::Scope& scope,
+           const platform::Place& place) const override {
+    // Only run the `first_n` times.
+    int first_n = Attr<int>("first_n");
+    if (first_n > 0 && ++times_ > first_n) return;
+
+    PADDLE_ENFORCE(!Inputs("input").empty(), "input should be set");
+    auto* input_var = scope.FindVar(Input("input"));
+    PADDLE_ENFORCE_NOT_NULL(input_var);
+    auto& tensor = input_var->Get<framework::LoDTensor>();
+
+    // TODO(ChunweiYan) support GPU
+    PADDLE_ENFORCE(platform::is_cpu_place(tensor.place()));
+
+    Formater formater;
+    // Forward the user-supplied prefix so the formatter can print it.
+    formater.message = Attr<std::string>("message");
+    if (Attr<bool>("print_tensor_name")) {
+      formater.name = Inputs("input").front();
+    }
+    if (Attr<bool>("print_tensor_type")) formater.dtype = tensor.type();
+    if (Attr<bool>("print_tensor_shape")) {
+      const auto& dims = tensor.dims();  // copy each extent of the shape
+      for (int i = 0; i < dims.size(); ++i) formater.dims.push_back(dims[i]);
+    }
+    if (Attr<bool>("print_tensor_lod")) formater.lod = tensor.lod();
+    formater.summarize = Attr<int>("summarize");
+    formater.data = (void*)tensor.data<void>();
+    formater(tensor.numel());
+  }
+
+ private:
+  mutable int times_{0};
+};
+// Declares the input, attributes and documentation of the `print` operator.
+class PrintOpProtoAndCheckMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  PrintOpProtoAndCheckMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("input", "the tensor that will be displayed.");
+    AddAttr<int>("first_n", "Only log `first_n` number of times.");
+    AddAttr<std::string>("message", "A string message to print as a prefix.");
+    AddAttr<int>("summarize", "Print this number of elements in the tensor.");
+    AddAttr<bool>("print_tensor_name", "Whether to print the tensor name.");
+    AddAttr<bool>("print_tensor_type", "Whether to print the tensor's dtype.");
+    AddAttr<bool>("print_tensor_shape", "Whether to print the tensor's shape.");
+    AddAttr<bool>("print_tensor_lod", "Whether to print the tensor's lod.");
+    AddComment(R"DOC(
+    Creates a print op that will print when a tensor is accessed.
+
+    Wraps the tensor passed in so that whenever that a tensor is accessed,
+    the message `message` is printed, along with the current value of the
+    tensor `t`.)DOC");
+  }
+};
+// Shape inference for the print op: only checks that "input" is present.
+class InferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext* context) const override {
+    PADDLE_ENFORCE(context->HasInput("input"), "input should be set");
+  }
+};
+// Var-type inference for the print op: does nothing (empty body).
+class InferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {}
+};
+
+}  // namespace operators
+}  // namespace paddle
+// Registers the `print` op; EmptyGradOpMaker is supplied as its grad maker.
+REGISTER_OPERATOR(print, paddle::operators::TensorPrintOp,
+                  paddle::operators::PrintOpProtoAndCheckMaker,
+                  paddle::operators::InferShape,
+                  paddle::operators::InferVarType,
+                  paddle::framework::EmptyGradOpMaker);
diff --git a/paddle/operators/recv_op.cc b/paddle/operators/recv_op.cc
index 9331c7b563491902b2824898766cacb9bfdee2d9..55b33343af43802e1b6b95a32603bfee806c9764 100644
--- a/paddle/operators/recv_op.cc
+++ b/paddle/operators/recv_op.cc
@@ -24,7 +24,8 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 #include "paddle/framework/proto_desc.h"
-#include "paddle/operators/detail/send_recv_impl.h"
+#include "paddle/operators/detail/grpc_server.h"
+#include "paddle/operators/detail/sendrecvop_utils.h"
 #include "paddle/operators/detail/simple_block_queue.h"
 
 #define LISTEN_TERMINATE_MESSAGE "TERMINATE@RECV"
@@ -32,6 +33,11 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
 
+void RunServer(std::shared_ptr<detail::AsyncGRPCServer> service) {
+  service->RunSyncUpdate();  // presumably blocks until shutdown -- confirm
+  VLOG(4) << "RunServer thread end";
+}
+
 static void CreateTensorFromMessageType(framework::Variable *var,
                                         sendrecv::VarType var_type) {
   if (var_type == sendrecv::VarType::LOD_TENSOR) {
@@ -46,18 +52,6 @@ static void CreateTensorFromMessageType(framework::Variable *var,
   }
 }
 
-void RunServer(Server **rpc_server,
-               std::shared_ptr<detail::SendRecvServerImpl> service,
-               const std::string &server_address) {
-  ServerBuilder builder;
-  builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
-  builder.RegisterService(service.get());
-  std::unique_ptr<Server> server(builder.BuildAndStart());
-  *rpc_server = server.get();
-  LOG(INFO) << "Server listening on " << server_address;
-  server->Wait();
-}
-
 class RecvOp : public framework::OperatorBase {
  public:
   RecvOp(const std::string &type, const framework::VariableNameMap &inputs,
@@ -65,10 +59,9 @@ class RecvOp : public framework::OperatorBase {
          const framework::AttributeMap &attrs)
       : OperatorBase(type, inputs, outputs, attrs) {
     if (!rpc_service_) {
-      rpc_service_.reset(new detail::SendRecvServerImpl());
       std::string endpoint = Attr<std::string>("endpoint");
-      server_thread_.reset(
-          new std::thread(RunServer, &rpc_server_, rpc_service_, endpoint));
+      rpc_service_.reset(new detail::AsyncGRPCServer(endpoint));
+      server_thread_.reset(new std::thread(RunServer, rpc_service_));
     }
   }
 
@@ -76,7 +69,7 @@ class RecvOp : public framework::OperatorBase {
     detail::MessageWithName term_msg;
     term_msg.first = LISTEN_TERMINATE_MESSAGE;
     rpc_service_->Push(term_msg);
-    rpc_server_->Shutdown();
+    rpc_service_->ShutDown();
     server_thread_->join();
   }
 
@@ -99,10 +92,12 @@ class RecvOp : public framework::OperatorBase {
     auto grad_list = Attr<std::vector<std::string>>("GradList");
     auto trainer_count = Attr<int>("Trainers");
     size_t param_count = param_list.size();
+
     rpc_service_->Reset();
     // TODO(typhoonzero): change this to a while_op for every cluster-batch.
     bool exit_flag = false;
     while (!exit_flag) {
+      // TODO(gognwb): simplify this loop.
       // Get from multiple trainers, we don't care about order in which
       // the gradient arrives, just add suffix 0~n then average the gradient.
       for (size_t i = 0; i < param_count * trainer_count; ++i) {
@@ -110,6 +105,7 @@ class RecvOp : public framework::OperatorBase {
         const detail::MessageWithName &v = rpc_service_->Get();
         auto grad_var_name = v.first;
         if (grad_var_name == LISTEN_TERMINATE_MESSAGE) {
+          VLOG(4) << "received LISTEN_TERMINATE_MESSAGE and RunOp.Run() exit";
           exit_flag = true;
           break;
         }
@@ -118,10 +114,12 @@ class RecvOp : public framework::OperatorBase {
         if (it != grad_list.end()) {
           param_var_name = param_list[it - grad_list.begin()];
         } else {
-          LOG(ERROR) << "grad have no paired param found!";
+          LOG(ERROR) << "grad have no paired param found!\"" << grad_var_name
+                     << "\"";
         }
         VLOG(3) << "recved grad: " << grad_var_name
                 << " updating param: " << param_var_name;
+
         auto *merged_grad = recv_scope.FindVar(grad_var_name);
         if (merged_grad == nullptr) {
           auto *ptr = recv_scope.Var(grad_var_name);
@@ -141,9 +139,11 @@ class RecvOp : public framework::OperatorBase {
         auto &dev_ctx = *pool.Get(dev_place);
         detail::DeserializeFromMessage(v.second, dev_ctx, var);
       }
+
       if (exit_flag) {
         break;
       }
+
       rpc_service_->Reset();
 
       std::string program_str = Attr<std::string>("OptimizeProgram");
@@ -158,17 +158,14 @@ class RecvOp : public framework::OperatorBase {
       } catch (std::exception &e) {
         LOG(ERROR) << "run sub program error " << e.what();
       }
+
       rpc_service_->Done();
       grads_counter_.clear();
     }  // while(true)
   }
 
  protected:
-  // grpc server instance to track status and gracefully shutdown.
-  // borrow an pointer from server thread.
-  Server *rpc_server_{nullptr};
-  // grpc send/recv service implement to register.
-  std::shared_ptr<detail::SendRecvServerImpl> rpc_service_;
+  std::shared_ptr<detail::AsyncGRPCServer> rpc_service_;
   std::shared_ptr<std::thread> server_thread_;
   mutable std::unordered_map<std::string, int> grads_counter_;
 };
diff --git a/paddle/operators/reorder_lod_tensor_by_rank_op.cc b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
index a055cdf7e8952995e57c28b3520c427caa75a4c1..3c30447949421da516213b47178828453671c693 100644
--- a/paddle/operators/reorder_lod_tensor_by_rank_op.cc
+++ b/paddle/operators/reorder_lod_tensor_by_rank_op.cc
@@ -26,22 +26,44 @@ class ReorderLoDTensorByRankTableOpProtoMaker
   ReorderLoDTensorByRankTableOpProtoMaker(OpProto *proto,
                                           OpAttrChecker *op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "(LoDTensor) the input lod tensor need to be reordered.");
+    AddInput("X",
+             "(LoDTensor), the input lod tensor to be reordered according to "
+             "Input(RankTable).");
     AddInput("RankTable",
-             "(LoDRankTable) the rank table that input need follow");
-    AddOutput("Out", "(LoDTensor) reordered lod tensor");
-    AddComment(R"DOC(ReorderLoDTensorByRankTable
+             "(LoDRankTable), the rank table according to which Input(X) is "
+             "reordered.");
+    AddOutput("Out", "(LoDTensor), the reordered lod tensor.");
+    AddComment(R"DOC(ReorderLoDTensorByRankTable operator.
 
-Reorder the input X by the rank of `RankTable`. If `RankTable` is ordered by
-index [3, 0, 2, 1]. Input X will reorder its sequence, the third sequence of
-X will be the first sequence of Output.
-
-NOTE: The RankTable does not need to be calculated by X.
+Input(X) is a batch of sequences. Input(RankTable) stores new orders of the
+input sequence batch. The reorder_lod_tensor_by_rank operator reorders the
+Input(X) according to the information provided by Input(RankTable).
 
 For example:
-The X = [Seq0, Seq1, Seq2, Seq3]. The indices of RankTable are [3, 0, 2, 1].
 
-The Out =  [Seq3, Seq0, Seq2, Seq1] with correct LoD information.
+If the indices stored in the Input(RankTable) are [3, 0, 2, 1], the
+Input(X) will be reordered that the fourth sequence in Input(X) will become the
+first one, and then followed by the original first, third, and the second one.
+
+This is:
+X = [Seq0, Seq1, Seq2, Seq3]. The indices in RankTable are [3, 0, 2, 1].
+Out =  [Seq3, Seq0, Seq2, Seq1] with a new LoD information.
+
+If the LoD information of Input(X) is empty, this means Input(X) is not sequence
+data. This is also identical to a batch of sequences where each sequence has a
+fixed length 1. In this case, the reorder_lod_tensor_by_rank operator reorders
+each slice of Input(X) along the first axis according to Input(RankTable).
+
+This is:
+X = [Slice0, Slice1, Slice2, Slice3] and its LoD information is empty. The
+indices in RankTable are [3, 0, 2, 1].
+Out = [Slice3, Slice0, Slice2, Slice1] with no LoD information is appended.
+
+NOTE: This operator sorts Input(X) according to a given LoDRankTable which does
+not need to be calculated according to Input(X). It can be calculated according
+to another different sequence, and then this operator sorts Input(X) according
+to the given LoDRankTable.
+
 )DOC");
   }
 };
diff --git a/paddle/operators/send_op.cc b/paddle/operators/send_op.cc
index 95c207221a7b34732eca4cfd07fed0a8f1671981..4d145250bdc73607c8817e20fdb753f4c96e2391 100644
--- a/paddle/operators/send_op.cc
+++ b/paddle/operators/send_op.cc
@@ -19,59 +19,45 @@ limitations under the License. */
 #include "paddle/framework/lod_tensor.h"
 #include "paddle/framework/op_registry.h"
 
-#include "paddle/operators/detail/send_recv_impl.h"
-#include "paddle/operators/detail/simple_block_queue.h"
+#include <future>
+#include "paddle/operators/detail/grpc_client.h"
 
 namespace paddle {
 namespace operators {
 
-// TODO(typhoonzero): this is a simple implementation which only send
-// one tensor
 class SendOp : public framework::OperatorBase {
  public:
-  SendOp(const std::string &type, const framework::VariableNameMap &inputs,
-         const framework::VariableNameMap &outputs,
-         const framework::AttributeMap &attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {
-    // init client when the operator is created at runtime.
-    std::vector<std::string> endpoints =
-        Attr<std::vector<std::string>>("endpoints");
-    for (auto ep : endpoints) {
-      client_map_[ep].reset(new detail::RPCClient(
-          grpc::CreateChannel(ep, grpc::InsecureChannelCredentials())));
-    }
-  }
+  SendOp(const std::string& type, const framework::VariableNameMap& inputs,
+         const framework::VariableNameMap& outputs,
+         const framework::AttributeMap& attrs)
+      : OperatorBase(type, inputs, outputs, attrs) {}
 
-  void Run(const framework::Scope &scope,
-           const platform::Place &dev_place) const override {
+  void Run(const framework::Scope& scope,
+           const platform::Place& dev_place) const override {
     auto ins = Inputs("X");
     auto outs = Outputs("Out");
     std::vector<std::string> epmap = Attr<std::vector<std::string>>("epmap");
-    // TODO(typhoonzero): use async calls to send multiple variable asyncly.
-    for (size_t i = 0; i < ins.size(); ++i) {
-      bool ret = client_map_[epmap[i]]->SendVariable(scope, ins[i]);
-      if (!ret) {
-        LOG(ERROR) << "send variable error: " << ins[i];
-      }
+
+    // FIXME(gongwb): DeviceContext?
+    auto ctx = platform::CPUDeviceContext();
+    for (size_t i = 0; i < ins.size(); i++) {
+      client_.AsyncSendVariable(epmap[i], ctx, scope, ins[i]);
     }
-    // TODO(typhoonzero): support async optimization
-    client_map_[epmap[0]]->Wait();
-    for (size_t i = 0; i < outs.size(); ++i) {
-      bool ret = client_map_[epmap[i]]->GetVariable(scope, outs[i]);
-      if (!ret) {
-        LOG(ERROR) << "GetVariable error: " << outs[i];
-      }
+
+    for (size_t i = 0; i < outs.size(); i++) {
+      client_.AsyncGetVariable(epmap[i], ctx, scope, outs[i]);
     }
+
+    client_.wait();
   }
 
- protected:
-  mutable std::unordered_map<std::string, std::shared_ptr<detail::RPCClient>>
-      client_map_;
+ private:
+  mutable detail::RPCClient client_;
 };
 
 class SendOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
-  SendOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SendOpMaker(OpProto* proto, OpAttrChecker* op_checker)
       : OpProtoAndCheckerMaker(proto, op_checker) {
     AddInput("X", "(Tensor) Input tensor to be send").AsDuplicable();
     AddOutput("Out", "(Tensor) Output tensor to get from server")
diff --git a/paddle/operators/send_recv_op_test.cc b/paddle/operators/send_recv_op_test.cc
index fa94424bf9e8e719ec0822268685b0806a109d21..ea091694798475dfd9631910a750405be950c20c 100644
--- a/paddle/operators/send_recv_op_test.cc
+++ b/paddle/operators/send_recv_op_test.cc
@@ -140,7 +140,7 @@ void StartServerNet(bool is_sparse) {
 
 TEST(SendRecvOp, CPUDense) {
   std::thread server_thread(StartServerNet, false);
-  sleep(3);  // wait server to start
+  sleep(10);  // wait server to start
   // local net
   f::Scope scope;
   p::CPUPlace place;
diff --git a/paddle/operators/sequence_erase_op.cc b/paddle/operators/sequence_erase_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d17b2686238b2d2f872331edfdbb095fb8693b87
--- /dev/null
+++ b/paddle/operators/sequence_erase_op.cc
@@ -0,0 +1,89 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/operators/sequence_erase_op.h"
+
+namespace paddle {
+namespace operators {
+
+class SequenceEraseOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SequenceEraseOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SequenceEraseOp should not be null.");
+    auto x_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(x_dims.size() == 2 && x_dims[1] == 1,
+                   "Input(X) of SequenceEraseOp should be a 2-D LoDTensor "
+                   "with the 2nd dimension equal to 1.");
+    ctx->SetOutputDim("Out", x_dims);
+  }
+};
+
+class SequenceEraseOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  SequenceEraseOpMaker(OpProto* proto, OpAttrChecker* op_checker)
+      : OpProtoAndCheckerMaker(proto, op_checker) {
+    AddInput("X",
+             "(2-D LoDTensor with the 2nd dim. equal to 1) "
+             "Input LoDTensor of SequenceEraseOp.");
+    AddOutput("Out",
+              "(2-D LoDTensor with the 2nd dim. equal to 1) "
+              "Output LoDTensor of SequenceEraseOp.");
+    AddAttr<std::vector<int>>("tokens",
+                              "(vector<int>) Tokens need to be erased from "
+                              "input sequences.");
+    AddComment(R"DOC(
+Sequence Erase Operator.
+
+Sequence erase operator erases tokens specified by Attr(tokens) from the input 
+sequences Input(X), and outputs the remaining data and modifies the LoD 
+information at the same time. For example, given a 2-D LoDTensor
+
+    X = [[2, 2, 6, 1, 3, 9, 6, 1, 0, 1]]^T
+
+with lod = [[0, 3, 6, 10]], there are three sequences in the input:
+   
+     X1 = [[2, 2, 6]]^T, X2 = [[1, 3, 9]]^T and X3 = [[6, 1, 0, 1]]^T.
+
+If the tokens to be erased are Attr(tokens) = [2, 3, 5], after the erasing 
+operation, the three sequences become
+
+    X1' = [[6]]^T, X2' = [[1, 9]]^T and X3' = [[6, 1, 0, 1]]^T.
+
+Hence the LoDTensor Output(Out) should be
+
+    Out = [[6, 1, 9, 6, 1, 0, 1]]^T,
+
+with lod = [[0, 1, 3, 7]].
+
+An example usage for this operator is to remove the special tokens when 
+computing the edit distance between two strings, such as blank, start token, 
+and end token.
+)DOC");
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(sequence_erase, ops::SequenceEraseOp,
+                             ops::SequenceEraseOpMaker);
+REGISTER_OP_CPU_KERNEL(
+    sequence_erase,
+    ops::SequenceEraseKernel<paddle::platform::CPUDeviceContext, int32_t>);
diff --git a/paddle/operators/sequence_erase_op.cu b/paddle/operators/sequence_erase_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..5da8eba3e1ac1fb85dfc65c2fd801574599e02d9
--- /dev/null
+++ b/paddle/operators/sequence_erase_op.cu
@@ -0,0 +1,133 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#include "paddle/operators/sequence_erase_op.h"
+#include "paddle/platform/cuda_helper.h"
+
+namespace paddle {
+namespace operators {
+using platform::PADDLE_CUDA_NUM_THREADS;
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+__global__ void LabelErasedIdx(const T* in_dat, const int in_len,
+                               const T* tokens, const int tokens_len,
+                               int* num_erased) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < in_len) {
+    int erased = 0;
+    for (int i = 0; i < tokens_len; ++i) {
+      if (in_dat[index] == tokens[i]) {
+        erased = 1;
+      }
+    }
+    num_erased[index + 1] = erased;
+    if (index == 0) {
+      num_erased[0] = 0;
+    }
+  }
+}
+
+template <typename T>
+__global__ void GetOutLod(const T* num_erased, const int* in_lod,
+                          const int lod_len, int* out_lod0) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  if (index < lod_len) {
+    out_lod0[index] = in_lod[index] - num_erased[in_lod[index]];
+  }
+}
+
+// Copies each non-erased element of in_dat to its compacted slot in out_dat.
+template <typename T>
+__global__ void SetOutput(const T* in_dat, const int in_len,
+                          const int* num_erased, T* out_dat) {
+  int index = blockIdx.x * blockDim.x + threadIdx.x;
+  // Kept iff the running erased count does not grow across this element.
+  if (index < in_len && num_erased[index] == num_erased[index + 1]) {
+    out_dat[index - num_erased[index]] = in_dat[index];
+  }
+}
+
+// CUDA kernel of sequence_erase: drops Attr(tokens) from X, rebuilds LoD.
+template <typename T>
+class SequenceEraseOpCUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<LoDTensor>("X");
+    auto* out = ctx.Output<LoDTensor>("Out");
+
+    auto lod = in->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+                      "The actual size mismatches with the LoD information.");
+    // "tokens" is registered as vector<int>; it is converted to T on upload.
+    auto tokens = ctx.Attr<std::vector<int>>("tokens");
+    auto tokens_len = tokens.size();
+    auto in_len = in->numel();
+    auto in_dat = in->data<T>();
+    auto lod0 = lod[0];
+
+    thrust::host_vector<T> host_tokens(tokens_len);
+    for (size_t i = 0; i < tokens.size(); ++i) {
+      host_tokens[i] = tokens[i];
+    }
+    thrust::device_vector<T> dev_tokens = host_tokens;
+    thrust::device_vector<int> num_erased(in_len + 1);
+    T* dev_tokens_ptr = thrust::raw_pointer_cast(dev_tokens.data());
+    int* num_erased_ptr = thrust::raw_pointer_cast(num_erased.data());
+    // Flag erased elements, then prefix-sum the flags into erased counts.
+    auto stream = ctx.cuda_device_context().stream();
+    LabelErasedIdx<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                     PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        in_dat, in_len, dev_tokens_ptr, tokens_len, num_erased_ptr);
+    thrust::inclusive_scan(num_erased.begin() + 1, num_erased.end(),
+                           num_erased.begin() + 1);
+    // Calc LoD: out offset = in offset - number of elements erased before it.
+    auto lod_len = lod0.size();
+    thrust::host_vector<int> host_lod(lod_len);
+    for (size_t i = 0; i < lod_len; ++i) {
+      host_lod[i] = lod0[i];
+    }
+    thrust::device_vector<int> dev_in_lod = host_lod;
+    thrust::device_vector<int> dev_out_lod(lod_len);
+    int* dev_in_lod_ptr = thrust::raw_pointer_cast(dev_in_lod.data());
+    int* dev_out_lod_ptr = thrust::raw_pointer_cast(dev_out_lod.data());
+    GetOutLod<<<(lod_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(
+        num_erased_ptr, dev_in_lod_ptr, lod_len, dev_out_lod_ptr);
+    thrust::host_vector<int> host_out_lod = dev_out_lod;
+    std::vector<int> out_lod0(lod_len, 0);
+    for (size_t i = 0; i < lod_len; i++) {
+      out_lod0[i] = host_out_lod[i];
+    }
+    framework::LoD out_lod;
+    out_lod.push_back(out_lod0);
+    out->set_lod(out_lod);
+
+    // Set output: compact the kept elements into the resized tensor.
+    out->Resize({out_lod0.back(), 1});
+    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
+    SetOutput<<<(in_len - 1) / PADDLE_CUDA_NUM_THREADS + 1,
+                PADDLE_CUDA_NUM_THREADS, 0, stream>>>(in_dat, in_len,
+                                                      num_erased_ptr, out_dat);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_CUDA_KERNEL(sequence_erase,
+                        paddle::operators::SequenceEraseOpCUDAKernel<int32_t>);
diff --git a/paddle/operators/sequence_erase_op.h b/paddle/operators/sequence_erase_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb2d7be009dcbe0138818457249e95fbdd27fc0a
--- /dev/null
+++ b/paddle/operators/sequence_erase_op.h
@@ -0,0 +1,70 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename DeviceContext, typename T>
+class SequenceEraseKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto* in = ctx.Input<framework::LoDTensor>("X");
+    auto* out = ctx.Output<framework::LoDTensor>("Out");
+
+    auto lod = in->lod();
+    PADDLE_ENFORCE_EQ(lod.size(), 1UL, "Only support one level sequence now.");
+    PADDLE_ENFORCE_EQ(lod[0].back(), (size_t)in->numel(),
+                      "The actual size mismatches with the LoD information.");
+    auto tokens = ctx.Attr<std::vector<int>>("tokens");
+    auto in_len = in->numel();
+    auto in_dat = in->data<T>();
+    auto lod0 = lod[0];
+
+    std::vector<size_t> num_erased(in_len + 1, 0);
+    std::vector<size_t> out_lod0(1, 0);
+    for (size_t i = 0; i < lod0.size() - 1; ++i) {
+      size_t num_out = 0;
+      for (auto j = lod0[i] + 1; j <= lod0[i + 1]; ++j) {
+        num_erased[j] = num_erased[j - 1];
+        if (std::find(tokens.begin(), tokens.end(), in_dat[j - 1]) !=
+            tokens.end()) {
+          num_erased[j] += 1;
+        } else {
+          num_out += 1;
+        }
+      }
+      out_lod0.push_back(out_lod0.back() + num_out);
+    }
+
+    auto out_len = in_len - num_erased[in_len];
+    out->Resize({static_cast<int64_t>(out_len), 1});
+    auto out_dat = out->mutable_data<T>(ctx.GetPlace());
+
+    for (int64_t i = 0; i < in_len; ++i) {
+      if (num_erased[i] == num_erased[i + 1]) {
+        out_dat[i - num_erased[i]] = in_dat[i];
+      }
+    }
+    framework::LoD out_lod;
+    out_lod.push_back(out_lod0);
+    out->set_lod(out_lod);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/operators/shrink_rnn_memory_op.cc b/paddle/operators/shrink_rnn_memory_op.cc
index 3f5b2a9b84350c7dee5cb461ba6207e20e95c11b..ade94b40bed91c64d3074036c067de34323bdaa7 100644
--- a/paddle/operators/shrink_rnn_memory_op.cc
+++ b/paddle/operators/shrink_rnn_memory_op.cc
@@ -45,7 +45,7 @@ class ShrinkRNNMemoryOp : public ArrayOp {
         rank_items.begin();
 
     auto *out_var = scope.FindVar(Output("Out"));
-    PADDLE_ENFORCE(out_var != nullptr, "Output Out must be set");
+    PADDLE_ENFORCE(out_var != nullptr, "Output(Out) must be set.");
     auto &out_tensor = *out_var->GetMutable<framework::LoDTensor>();
 
     size_t height = dst_num_rows;
@@ -76,15 +76,17 @@ class ShrinkRNNMemoryOpProtoMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) The step index. The RNN step memory 'X' will be "
              "shrinked to match the size of the input of the index'th step.");
     AddOutput("Out", "(LoDTensor) The shrinked RNN step memory.");
-    AddComment(
-        R"DOC(
-        In dynamic RNN, we are able to handle sequences of different lengths.
-        Because of the multiple lengths, the size of each step input can be
-        different, which may lead to a mismatching between the input of
-        the current step and the memory generated by the previous one. This
-        operator shrinks memory according to the size of the next step input,
-        to make sure that they can match each other.
-        )DOC");
+    AddComment(R"DOC(
+This operator is used to shrink output batch of memory defined in dynamic RNN.
+
+Dynamic RNN is able to handle variable-length sequences, in which, sequences in
+a mini-batch are sorted by their lengths first. After that, the longest sequence
+becomes the first one in the sorted batch, followed by the second longest, the
+third longest, and so on. Dynamic RNN then slices a batch input timestep by
+timestep from the sorted input. Once any sequence in the input batch reaches its
+end, memory defined in dynamicRNN has to shrink its outputs to adapt to the input
+batch size for the next time step.
+)DOC");
   }
 };
 
diff --git a/paddle/scripts/submit_local.sh.in b/paddle/scripts/submit_local.sh.in
index bb47ad614ed85923ce5d9704760ec6c5b5ae59ee..80fa0c72af65cbdc21ba955389318a233e02657c 100755
--- a/paddle/scripts/submit_local.sh.in
+++ b/paddle/scripts/submit_local.sh.in
@@ -49,7 +49,18 @@ function cpu_config() {
   if [ "@WITH_MKL@" == "OFF" ]; then
     return 0
   fi
-  ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+  platform="`uname -s`"
+  ht=0
+  if [ "$platform" == "Linux" ]; then
+    ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
+  elif [ "$platform" == "Darwin" ]; then
+    if [ "`sysctl -n hw.physicalcpu`" -eq "`sysctl -n hw.logicalcpu`" ]; then
+      # HT is OFF: every physical core exposes exactly one logical core
+      ht=1
+    fi
+  else
+    return 0
+  fi
   if [ $ht -eq 1 ]; then # HT is OFF
     if [ -z "$KMP_AFFINITY" ]; then
       export KMP_AFFINITY="granularity=fine,compact,0,0"
@@ -72,7 +83,15 @@ function threads_config() {
   # according to trainer_count and total processors
   # only when MKL enabled
   # auto set OPENBLAS_NUM_THREADS when do not use MKL
-  processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
+  platform="`uname -s`"
+  processors=0
+  if [ $platform == "Linux" ]; then
+    processors=`grep "processor" /proc/cpuinfo|sort -u|wc -l`
+  elif [ $platform == "Darwin" ]; then
+    processors=`sysctl -n hw.logicalcpu`
+  else
+    return 0
+  fi
   trainers=`grep -Eo 'trainer_count.[0-9]+' <<< "$@" |grep -Eo '[0-9]+'|xargs`
   if [ -z $trainers ]; then
     trainers=1
@@ -148,11 +167,7 @@ else:
   sys.exit(0)
 EOF
 
-if [ "`uname -s`" == "Linux" ]; then
-  # only support on linux yet, with mac can use v2
-  cpu_config
-fi
-
+cpu_config
 # echo $KMP_AFFINITY $OMP_DYNAMIC
 
 case "$1" in
diff --git a/python/paddle/utils/dump_v2_config.py b/python/paddle/utils/dump_v2_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dc2111e379fd39b40e1e9bcf2e577b57b101a68
--- /dev/null
+++ b/python/paddle/utils/dump_v2_config.py
@@ -0,0 +1,62 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import collections
+
+from paddle.trainer_config_helpers.layers import LayerOutput
+from paddle.v2.layer import parse_network
+from paddle.proto import TrainerConfig_pb2
+
+__all__ = ["dump_v2_config"]
+
+
+def dump_v2_config(topology, save_path, binary=False):
+    """ Dump the network topology to a specified file.
+
+    This function only dumps networks defined with the PaddlePaddle V2 APIs;
+    optimizer-related configurations are NOT dumped.
+
+    :param topology: The output layers (can be more than one layer given in a
+                     Python List or Tuple) of the entire network. Using the
+                     specified layers (if more than one layer is given) as root,
+                     traversing back to the data layer(s), all the layers
+                     connected to the specified output layers will be dumped.
+                     Layers not connected to the specified will not be dumped.
+    :type topology: LayerOutput|List|Tuple
+    :param save_path: The path to save the dumped network topology.
+    :type save_path: str
+    :param binary: Whether to dump the serialized network topology or not.
+                   The default value is false. NOTE that, if you call this
+                   function to generate network topology for PaddlePaddle C-API,
+                   a serialized version of network topology is required. When
+                   using PaddlePaddle C-API, this flag MUST be set to True.
+    :type binary: bool
+    :raises RuntimeError: If topology is neither a LayerOutput nor a sequence.
+    """
+    if isinstance(topology, LayerOutput):
+        topology = [topology]
+    elif isinstance(topology, collections.Sequence):
+        for out_layer in topology:
+            assert isinstance(out_layer, LayerOutput), (
+                "The type of each element in the parameter topology "
+                "should be LayerOutput.")
+    else:
+        raise RuntimeError("Error input type for parameter topology.")
+
+    model_str = parse_network(topology)
+    # Serialized protobuf is raw bytes, so binary dumps need a binary-mode file.
+    with open(save_path, "wb" if binary else "w") as fout:
+        if binary:
+            fout.write(model_str.SerializeToString())
+        else:
+            fout.write(str(model_str))
diff --git a/python/paddle/utils/merge_model.py b/python/paddle/utils/merge_model.py
index 421e953d2775f145800cf7179ec644697a265060..2b100207728a8532e900992f7db4d3910e893dea 100644
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
@@ -30,7 +30,8 @@ def merge_v2_model(net, param_file, output_file):
     which ends with .tar.gz.
 
     @param  net            The output layer of the network for inference.
-    @param  param_file     Path of the parameters (.tar.gz) which is stored by v2 api.
+    @param  param_file     Path of the parameters (.tar.gz) which is stored by
+                           v2 api.
     @param  output_file    Path of the merged file which will be generated.
 
     Usage:
diff --git a/python/paddle/v2/fluid/__init__.py b/python/paddle/v2/fluid/__init__.py
index ccd5998e3592a1f5dc795ee24875c1aed230587e..ec5159fca161ed1912bc4145e732b7927833cc0b 100644
--- a/python/paddle/v2/fluid/__init__.py
+++ b/python/paddle/v2/fluid/__init__.py
@@ -23,9 +23,22 @@ from memory_optimization_transpiler import memory_optimize
 
 Tensor = LoDTensor
 __all__ = framework.__all__ + executor.__all__ + [
-    'io', 'initializer', 'layers', 'nets', 'optimizer', 'backward',
-    'regularizer', 'LoDTensor', 'CPUPlace', 'CUDAPlace', 'Tensor', 'ParamAttr'
-    'DataFeeder', 'clip', 'DistributeTranspiler', 'memory_optimize'
+    'io',
+    'initializer',
+    'layers',
+    'nets',
+    'optimizer',
+    'backward',
+    'regularizer',
+    'LoDTensor',
+    'CPUPlace',
+    'CUDAPlace',
+    'Tensor',
+    'ParamAttr',
+    'DataFeeder',
+    'clip',
+    'DistributeTranspiler',
+    'memory_optimize',
 ]
 
 
@@ -58,7 +71,7 @@ def __bootstrap__():
 
     read_env_flags = ['use_pinned_memory', 'check_nan_inf']
     if core.is_compile_gpu():
-        read_env_flags.append('fraction_of_gpu_memory_to_use')
+        read_env_flags += ['fraction_of_gpu_memory_to_use', 'op_sync']
     core.init_gflags([sys.argv[0]] +
                      ["--tryfromenv=" + ",".join(read_env_flags)])
     core.init_glog(sys.argv[0])
diff --git a/python/paddle/v2/fluid/backward.py b/python/paddle/v2/fluid/backward.py
index cea2d1e09068da20f4d2fdbfbd9a3e3a511ba267..43f6133a6534efb676dacea2e8b8d25846d91247 100644
--- a/python/paddle/v2/fluid/backward.py
+++ b/python/paddle/v2/fluid/backward.py
@@ -3,7 +3,10 @@ from . import core
 import collections
 import copy
 
-__all__ = ['append_backward', 'calc_gradient']
+__all__ = [
+    'append_backward',
+    'calc_gradient',
+]
 
 
 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
diff --git a/python/paddle/v2/fluid/clip.py b/python/paddle/v2/fluid/clip.py
index b1fd1c2b65f10010fa959dbb47b3fbab114db2f2..776c0f3f0276cd228db9846e473c65d44e10bbb7 100644
--- a/python/paddle/v2/fluid/clip.py
+++ b/python/paddle/v2/fluid/clip.py
@@ -3,7 +3,9 @@ import layers
 from . import core
 
 __all__ = [
-    'GradientClipByValue', 'append_gradient_clip_ops', 'error_clip_callback'
+    'GradientClipByValue',
+    'append_gradient_clip_ops',
+    'error_clip_callback',
 ]
 
 
diff --git a/python/paddle/v2/fluid/default_scope_funcs.py b/python/paddle/v2/fluid/default_scope_funcs.py
index 60c6165b6bd959f7bb3d92afed667f00f73f144f..9aebc07f8e8aac2d6bfbe7a7817b4bd261859415 100644
--- a/python/paddle/v2/fluid/default_scope_funcs.py
+++ b/python/paddle/v2/fluid/default_scope_funcs.py
@@ -1,16 +1,16 @@
 """
 Default scope function.
 
-`Paddle` manages Scope as programming language's scope.  It just a 
-thread-local stack of Scope. Top of that stack is current scope, the bottom 
-of that stack is all scopes' parent. 
+`Paddle` manages Scope as programming language's scope.  It is just a
+thread-local stack of Scope. Top of that stack is current scope, the bottom
+of that stack is all scopes' parent.
 
-Invoking `var/find_var`  can `new/find` variable in current scope. 
-Invoking `enter_local_scope/leave_local_scope` can create or destroy local 
-scope. 
+Invoking `var/find_var`  can `new/find` variable in current scope.
+Invoking `enter_local_scope/leave_local_scope` can create or destroy local
+scope.
 
-A `scoped_function` will take a `function` as input. That function will be 
-invoked in a new local scope. 
+A `scoped_function` will take a `function` as input. That function will be
+invoked in a new local scope.
 """
 
 import paddle.v2.fluid.core
@@ -19,8 +19,12 @@ import threading
 __tl_scope__ = threading.local()
 
 __all__ = [
-    'get_cur_scope', 'enter_local_scope', 'leave_local_scope', 'var',
-    'find_var', 'scoped_function'
+    'get_cur_scope',
+    'enter_local_scope',
+    'leave_local_scope',
+    'var',
+    'find_var',
+    'scoped_function',
 ]
 
 
@@ -71,7 +75,7 @@ def find_var(name):
 def scoped_function(func):
     """
     invoke `func` in new scope.
-    
+
     :param func: a callable function that will be run in new scope.
     :type func: callable
     """
diff --git a/python/paddle/v2/fluid/evaluator.py b/python/paddle/v2/fluid/evaluator.py
index e186ee96c387acf24471d4e26ce020c4ecac8d19..dc083f37b5f357e835fc1a45c25a420b2c3d9798 100644
--- a/python/paddle/v2/fluid/evaluator.py
+++ b/python/paddle/v2/fluid/evaluator.py
@@ -4,7 +4,10 @@ import layers
 from framework import Program, unique_name, Variable, program_guard
 from layer_helper import LayerHelper
 
-__all__ = ['Accuracy', 'ChunkEvaluator']
+__all__ = [
+    'Accuracy',
+    'ChunkEvaluator',
+]
 
 
 def _clone_var_(block, var):
@@ -21,19 +24,19 @@ def _clone_var_(block, var):
 class Evaluator(object):
     """
     Base Class for all evaluators
-    
+
     Args:
-        name(str): The name of evaluator. such as, "accuracy". Used for generate 
+        name(str): The name of evaluator. such as, "accuracy". Used for generate
             temporary variable name.
-        main_program(Program, optional): The evaluator should be added to this 
+        main_program(Program, optional): The evaluator should be added to this
             main_program. Default default_main_program()
-        startup_program(Program, optional):The parameter should be added to this 
+        startup_program(Program, optional):The parameter should be added to this
             startup_program. Default default_startup_program()
-            
+
     Attributes:
-        states(list): The list of state variables. states will be reset to zero 
+        states(list): The list of state variables. states will be reset to zero
             when `reset` is invoked.
-        metrics(list): The list of metrics variables. They will be calculate 
+        metrics(list): The list of metrics variables. They will be calculate
             every mini-batch
     """
 
@@ -66,14 +69,14 @@ class Evaluator(object):
 
     def create_state(self, suffix, dtype, shape):
         """
-        Create state variable. 
-        
+        Create state variable.
+
         NOTE: It is not a public API.
-        
+
         Args:
-            suffix(str): the state suffix. 
-            dtype(str|core.DataType): the state data type 
-            shape(tuple|list): the shape of state 
+            suffix(str): the state suffix.
+            dtype(str|core.DataType): the state data type
+            shape(tuple|list): the shape of state
 
         Returns: State variable
 
@@ -127,8 +130,8 @@ class Accuracy(Evaluator):
 
 class ChunkEvaluator(Evaluator):
     """
-    Accumulate counter numbers output by chunk_eval from mini-batches and 
-    compute the precision recall and F1-score using the accumulated counter 
+    Accumulate counter numbers output by chunk_eval from mini-batches and
+    compute the precision recall and F1-score using the accumulated counter
     numbers.
     """
 
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 2fb388acfc0a9f19b26c92de95de6a0dc0d9c018..bdbfe9da0772fdbd00dfc8ed00413ece56f48407 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -7,9 +7,15 @@ import proto.framework_pb2 as framework_pb2
 from . import core
 
 __all__ = [
-    'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
-    'default_main_program', 'program_guard', 'switch_startup_program',
-    'switch_main_program'
+    'Block',
+    'Variable',
+    'Program',
+    'Operator',
+    'default_startup_program',
+    'default_main_program',
+    'program_guard',
+    'switch_startup_program',
+    'switch_main_program',
 ]
 
 EMPTY_VAR_NAME = core.kEmptyVarName()
@@ -236,6 +242,9 @@ class Variable(object):
 
     __repr__ = __str__
 
+    def set_desc(self, input):
+        self.desc = input
+
     @property
     def persistable(self):
         return self.desc.persistable()
diff --git a/python/paddle/v2/fluid/initializer.py b/python/paddle/v2/fluid/initializer.py
index c0839caaf2bb5bc43a76a13b5782cc519a4afe63..c3ed1a9089603abe86d815f6826d084d23e01d99 100644
--- a/python/paddle/v2/fluid/initializer.py
+++ b/python/paddle/v2/fluid/initializer.py
@@ -1,7 +1,12 @@
 import framework
 import numpy as np
 
-__all__ = ['Constant', 'Uniform', 'Normal', 'Xavier']
+__all__ = [
+    'Constant',
+    'Uniform',
+    'Normal',
+    'Xavier',
+]
 
 
 class Initializer(object):
diff --git a/python/paddle/v2/fluid/io.py b/python/paddle/v2/fluid/io.py
index c63567601accd8c072368351f2838857bb61c818..54b6978ebaa02e1a070a666f60cd61b66d3ac1f8 100644
--- a/python/paddle/v2/fluid/io.py
+++ b/python/paddle/v2/fluid/io.py
@@ -4,13 +4,29 @@ import cPickle as pickle
 from paddle.v2.fluid.framework import Program, Parameter, default_main_program, Variable
 
 __all__ = [
-    'save_vars', 'save_params', 'save_persistables', 'load_vars', 'load_params',
-    'load_persistables', "save_inference_model", "load_inference_model",
-    "get_inference_program"
+    'save_vars',
+    'save_params',
+    'save_persistables',
+    'load_vars',
+    'load_params',
+    'load_persistables',
+    'save_inference_model',
+    'load_inference_model',
+    'get_inference_program',
 ]
 
 
 def is_parameter(var):
+    """Check whether the variable is a Parameter.
+
+    This function checks whether the input variable is a Parameter.
+
+    Args:
+        var : The input variable.
+
+    Returns:
+        boolean result whether the variable is a Parameter.
+    """
     return isinstance(var, Parameter)
 
 
diff --git a/python/paddle/v2/fluid/layers/control_flow.py b/python/paddle/v2/fluid/layers/control_flow.py
index 9ad021fa992e5e8dbfebe96cf40ae602b0ed99b5..4b363ecbe78af82733fe1f80e44118a0dfda1f11 100644
--- a/python/paddle/v2/fluid/layers/control_flow.py
+++ b/python/paddle/v2/fluid/layers/control_flow.py
@@ -12,7 +12,7 @@ __all__ = [
     'array_to_lod_tensor', 'increment', 'array_write', 'create_array',
     'less_than', 'array_read', 'shrink_memory', 'array_length', 'IfElse',
     'DynamicRNN', 'ConditionalBlock', 'StaticRNN', 'reorder_lod_tensor_by_rank',
-    'ParallelDo'
+    'ParallelDo', 'Print'
 ]
 
 
@@ -110,6 +110,61 @@ def merge_lod_tensor(in_true, in_false, x, mask, level=0):
     return out
 
 
+def Print(input,
+          first_n=-1,
+          message=None,
+          summarize=-1,
+          print_tensor_name=True,
+          print_tensor_type=True,
+          print_tensor_shape=True,
+          print_tensor_lod=True):
+    '''
+    **Print operator**
+
+    This creates a print op that will print when a tensor is accessed.
+
+    Wraps the tensor passed in so that whenever the tensor is accessed,
+    the message `message` is printed, along with the current value of the
+    tensor `input`.
+
+    Args:
+      input(Variable): A Tensor to print.
+      summarize(int): Print this number of elements in the tensor, will print all
+                 if left negative.
+      message(str): A string message to print as a prefix.
+      first_n(int): Only log `first_n` number of times.
+      print_tensor_name(bool): Print the tensor name.
+      print_tensor_type(bool): Print the tensor type.
+      print_tensor_shape(bool): Print the tensor shape.
+      print_tensor_lod(bool): Print the tensor lod.
+
+    Returns:
+      Variable: An int32 temporary variable; the print op is given no outputs.
+
+    Examples:
+        .. code-block:: python
+
+            value = some_layer(...)
+            Print(value, summarize=10,
+                  message="The content of some_layer: ")
+    '''
+    helper = LayerHelper('print', **locals())
+    out = helper.create_tmp_variable(dtype='int32')
+    helper.append_op(
+        type='print',
+        inputs={'input': input},
+        attrs={
+            'first_n': first_n,
+            'summarize': summarize,
+            'message': message or "",
+            'print_tensor_name': print_tensor_name,
+            'print_tensor_type': print_tensor_type,
+            'print_tensor_shape': print_tensor_shape,
+            'print_tensor_lod': print_tensor_lod,
+        })
+    return out
+
+
 class BlockGuard(object):
     """
     BlockGuard class.
@@ -687,11 +742,10 @@ def topk(input, k):
 
 
 def lod_tensor_to_array(x, table):
-    """This function performs the operation that converts an LOD_Tensor to
-       an array.
+    """Convert a LOD_TENSOR to a LOD_TENSOR_ARRAY.
 
     Args:
-        x (Variable|list): The tensor that needs to be converted to an array.
+        x (Variable|list): The LOD tensor to be converted to a LOD tensor array.
         table (ParamAttr|list): The variable that stores the level of lod
                                 which is ordered by sequence length in
                                 descending order.
@@ -721,11 +775,10 @@ def lod_tensor_to_array(x, table):
 
 
 def array_to_lod_tensor(x, table):
-    """This function performs the operations that converts an array to
-       an LOD_Tensor.
+    """Convert a LoD_Tensor_Array to an LoDTensor.
 
     Args:
-        x (Variable|list): The array that needs to be converted to a tensor.
+        x (Variable|list): The lod tensor array to be converted to a tensor.
         table (ParamAttr|list): The variable that stores the level of lod
                                 which is ordered by sequence length in
                                 descending order.
@@ -753,7 +806,8 @@ def array_to_lod_tensor(x, table):
 
 
 def increment(x, value=1.0, in_place=True):
-    """This function performs an operation that increments each value in the
+    """
+    This function performs an operation that increments each value in the
     input :math:`x` by an amount: :math:`value` as mentioned in the input
     parameter. This operation is performed in-place by default.
 
@@ -786,17 +840,24 @@ def increment(x, value=1.0, in_place=True):
 
 
 def array_write(x, i, array=None):
-    """This function performs the operation to write the data out as an
-    LOD_TENSOR_ARRAY.
+    """
+    This function writes the given input variable to the position specified
+    by the array index in an output LOD_TENSOR_ARRAY. If the output
+    LOD_TENSOR_ARRAY is not given (None), a new one will be created and
+    returned.
 
     Args:
         x (Variable|list): The input tensor from which the data will be read.
-        i (Variable|list): The subscript index in tensor array, that points the
-                           place from which data will be read.
-        array (Variable|list): The data can be read into this variable if
-                               this is assigned.
+        i (Variable|list): The index of the output LOD_TENSOR_ARRAY, pointing to
+                           the position to which the input tensor will be
+                           written.
+        array (Variable|list): The output LOD_TENSOR_ARRAY to which the input
+                               tensor will be written. If this parameter is
+                               NONE, a new LOD_TENSOR_ARRAY will be created and
+                               returned.
+
     Returns:
-        Variable: The tensor type variable that has the data written to it.
+        Variable: The output LOD_TENSOR_ARRAY where the input tensor is written.
 
     Examples:
         .. code-block::python
@@ -1173,7 +1234,7 @@ class DynamicRNN(object):
         self._assert_in_rnn_block_("step_input")
         if not isinstance(x, Variable):
             raise TypeError(
-                "step_input() can only take a Variable as its input")
+                "step_input() can only take a Variable as its input.")
         parent_block = self._parent_block_()
         if self.lod_rank_table is None:
             self.lod_rank_table = parent_block.create_var(
@@ -1234,8 +1295,8 @@ class DynamicRNN(object):
 
     def __call__(self, *args, **kwargs):
         if self.status != DynamicRNN.AFTER_RNN:
-            raise ValueError(
-                "Dynamic RNN outputs can only be retrieved after rnn block")
+            raise ValueError(("Output of the dynamic RNN can only be visited "
+                              "outside the rnn block."))
         if len(self.outputs) == 1:
             return self.outputs[0]
         else:
diff --git a/python/paddle/v2/fluid/layers/nn.py b/python/paddle/v2/fluid/layers/nn.py
index b1534c5a886db3c9694637e4a4195427c3538bb7..94184d59f6f3efe1a081e5adc169b2ece72baf67 100644
--- a/python/paddle/v2/fluid/layers/nn.py
+++ b/python/paddle/v2/fluid/layers/nn.py
@@ -9,12 +9,33 @@ from ..param_attr import ParamAttr
 from tensor import concat
 
 __all__ = [
-    'fc', 'embedding', 'dynamic_lstm', 'gru_unit', 'linear_chain_crf',
-    'crf_decoding', 'cos_sim', 'cross_entropy', 'square_error_cost', 'accuracy',
-    'chunk_eval', 'sequence_conv', 'conv2d', 'sequence_pool', 'pool2d',
-    'batch_norm', 'beam_search_decode', 'conv2d_transpose', 'sequence_expand',
-    'lstm_unit', 'reduce_sum', 'reduce_mean', 'reduce_max', 'reduce_min',
-    'sequence_first_step', 'sequence_last_step'
+    'fc',
+    'embedding',
+    'dynamic_lstm',
+    'gru_unit',
+    'linear_chain_crf',
+    'crf_decoding',
+    'cos_sim',
+    'cross_entropy',
+    'square_error_cost',
+    'accuracy',
+    'chunk_eval',
+    'sequence_conv',
+    'conv2d',
+    'sequence_pool',
+    'pool2d',
+    'batch_norm',
+    'beam_search_decode',
+    'conv2d_transpose',
+    'sequence_expand',
+    'lstm_unit',
+    'reduce_sum',
+    'reduce_mean',
+    'reduce_max',
+    'reduce_min',
+    'sequence_first_step',
+    'sequence_last_step',
+    'dropout',
 ]
 
 
@@ -248,13 +269,13 @@ def gru_unit(input,
             h_t & = dot((1-u_t), m_t) + dot(u_t, h_{t-1})
 
     The inputs of gru unit includes :math:`z_t`, :math:`h_{t-1}`. In terms
-    of the equation above, the :math:`z_t` is split into 3 parts - 
-    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to 
-    implement a full GRU unit operator for an input, a fully 
+    of the equation above, the :math:`z_t` is split into 3 parts -
+    :math:`xu_t`, :math:`xr_t` and :math:`xm_t`. This means that in order to
+    implement a full GRU unit operator for an input, a fully
     connected layer has to be applied, such that :math:`z_t = W_{fc}x_t`.
 
-    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates 
-    of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is 
+    The terms :math:`u_t` and :math:`r_t` represent the update and reset gates
+    of the GRU cell. Unlike LSTM, GRU has one lesser gate. However, there is
     an intermediate candidate hidden output, which is denoted by :math:`m_t`.
     This layer has three outputs :math:`h_t`, :math:`dot(r_t, h_{t-1})`
     and concatenation of :math:`u_t`, :math:`r_t` and :math:`m_t`.
@@ -276,7 +297,7 @@ def gru_unit(input,
         .. code-block:: python
 
              # assuming we have x_t_data and prev_hidden of size=10
-             x_t = fluid.layers.fc(input=x_t_data, size=30) 
+             x_t = fluid.layers.fc(input=x_t_data, size=30)
              hidden_val, r_h_val, gate_val = fluid.layers.gru_unit(input=x_t,
                                                     hidden = prev_hidden)
 
@@ -386,6 +407,21 @@ def cos_sim(X, Y, **kwargs):
     return out
 
 
+def dropout(x, dropout_prob, is_test=False, seed=0, **kwargs):
+    helper = LayerHelper('dropout', **kwargs)
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    mask = helper.create_tmp_variable(dtype=x.dtype, stop_gradient=True)
+    helper.append_op(
+        type='dropout',
+        inputs={'X': [x]},
+        outputs={'Out': [out],
+                 'Mask': [mask]},
+        attrs={'dropout_prob': dropout_prob,
+               'is_test': is_test,
+               'seed': seed})
+    return out
+
+
 def cross_entropy(input, label, **kwargs):
     """
     **Cross Entropy Layer**
@@ -968,7 +1004,7 @@ def batch_norm(input,
         default_initializer=Constant(1.0))
 
     bias = helper.create_parameter(
-        attr=helper.param_attr, shape=param_shape, dtype=dtype, is_bias=True)
+        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)
 
     mean = helper.create_global_variable(
         dtype=input.dtype,
diff --git a/python/paddle/v2/fluid/layers/ops.py b/python/paddle/v2/fluid/layers/ops.py
index 544623c4bce0cb75ea727906c4879e986c8d1ce8..51a85dbbd3357fabc62fb5b43269fdf79da21bfb 100644
--- a/python/paddle/v2/fluid/layers/ops.py
+++ b/python/paddle/v2/fluid/layers/ops.py
@@ -1,23 +1,39 @@
 from ..registry import register_layer
 
 __activations__ = [
+    'sigmoid',
+    'logsigmoid',
+    'exp',
+    'relu',
+    'tanh',
+    'tanh_shrink',
+    'softshrink',
+    'sqrt',
     'abs',
     'ceil',
-    'exp',
     'floor',
-    'log',
-    'relu',
     'round',
-    'sigmoid',
-    'sqrt',
+    'reciprocal',
+    'log',
     'square',
-    'tanh',
+    'softplus',
+    'softsign',
+    'brelu',
+    'leaky_relu',
+    'soft_relu',
+    'elu',
+    'relu6',
+    'pow',
+    'stanh',
+    'hard_shrink',
+    'thresholded_relu',
+    'hard_sigmoid',
+    'swish',
 ]
 
 __all__ = [
     'mean',
     'mul',
-    'dropout',
     'reshape',
     'scale',
     'transpose',
diff --git a/python/paddle/v2/fluid/layers/tensor.py b/python/paddle/v2/fluid/layers/tensor.py
index 5f12ecfc14f7521948acdf27f1d6249e8052abc5..2608a8d1151fafa2da0cf7b605c4fa1210068057 100644
--- a/python/paddle/v2/fluid/layers/tensor.py
+++ b/python/paddle/v2/fluid/layers/tensor.py
@@ -1,9 +1,21 @@
 from ..layer_helper import LayerHelper
 from ..param_attr import ParamAttr
+from ..framework import convert_np_dtype_to_dtype_
+from ..framework import Variable
+from ..core import DataType
+import numpy
 
 __all__ = [
-    'create_tensor', 'create_parameter', 'cast', 'concat', 'sums', 'assign',
-    'fill_constant_batch_size_like', 'fill_constant', 'ones', 'zeros'
+    'create_tensor',
+    'create_parameter',
+    'cast',
+    'concat',
+    'sums',
+    'assign',
+    'fill_constant_batch_size_like',
+    'fill_constant',
+    'ones',
+    'zeros',
 ]
 
 
@@ -121,7 +133,7 @@ def assign(input, output):
     This function copies the *input* Variable to the *output* Variable.
 
     Args:
-        input(Variable): The source variable
+        input(Variable|numpy.ndarray): The source variable
         output(Variable): The destination variable
 
     Returns:
@@ -134,11 +146,37 @@ def assign(input, output):
           fluid.layers.assign(hidden, out)
     """
     helper = LayerHelper('assign', **locals())
-    helper.append_op(
-        type='scale',
-        inputs={'X': [input]},
-        outputs={'Out': [output]},
-        attrs={'scale': 1.0})
+    if isinstance(input, Variable):
+        helper.append_op(
+            type='scale',
+            inputs={'X': [input]},
+            outputs={'Out': [output]},
+            attrs={'scale': 1.0})
+    elif isinstance(input, numpy.ndarray):
+        dtype = convert_np_dtype_to_dtype_(input.dtype)
+        if dtype == DataType.FP32:
+            value_name = "fp32_values"
+            values = [float(v) for v in input.flat]
+        elif dtype == DataType.INT32:
+            value_name = "int32_values"
+            values = [int(v) for v in input.flat]
+        else:
+            raise ValueError("Unsupported dtype %s" % input.dtype)
+        if input.size > 1024 * 1024:
+            raise ValueError("The size of input is too big. Please consider "
+                             "saving it to file and 'load_op' to load it")
+        # The flattened values are embedded in the op as an attribute.
+        helper.append_op(
+            type='assign_value',
+            outputs={'Out': [output]},
+            attrs={
+                'dtype': dtype,
+                'shape': list(input.shape),
+                value_name: values
+            })
+    else:
+        raise ValueError("Wrong type for assign input: %s" % type(input))
+
     return output
 
 
@@ -146,25 +184,26 @@ def fill_constant(shape, dtype, value, out=None):
     """
     **fill_constant**
 
-    This function creates a tensor of specified *shape* and
-    *dtype*, and initializes this with a constant supplied in *value*.
+    This function creates a tensor with specified `shape` and `dtype`, and
+    initializes it with a constant specified by `value`.
 
-    It also sets *stop_gradient* to True.
+    The attribute `stop_gradient` of the created tensor is set to True.
 
     Args:
-        shape(tuple|list|None): Shape of output tensor
-        dtype(np.dtype|core.DataType|str): Data type of output tensor
-        value(float): Constant value to initialize the output tensor
-        out(Variable): Output Variable to initialize
+        shape(tuple|list|None): Shape of the output tensor.
+        dtype(np.dtype|core.DataType|str): Data type of the output tensor.
+        value(float): The constant value used to initialize the output tensor.
+        out(Variable): The output tensor.
 
     Returns:
-        Variable: The tensor variable storing the output
+        Variable: The tensor variable storing the output.
 
     Examples:
         .. code-block:: python
 
           data = fluid.layers.fill_constant(shape=[1], value=0, dtype='int64')
     """
+
     helper = LayerHelper("fill_constant", **locals())
     if out is None:
         out = helper.create_tmp_variable(dtype=dtype)
diff --git a/python/paddle/v2/fluid/memory_optimization_transpiler.py b/python/paddle/v2/fluid/memory_optimization_transpiler.py
index 571fce7fac616356ae0368b407e90537caa42977..293b116957ff9a7c02417bc268b4c0b4b2fc0a15 100644
--- a/python/paddle/v2/fluid/memory_optimization_transpiler.py
+++ b/python/paddle/v2/fluid/memory_optimization_transpiler.py
@@ -3,6 +3,17 @@ import framework
 from framework import Program, default_main_program, Parameter, Variable
 import backward
 from backward import _rename_arg_
+from . import core
+
+dtype_to_size = {
+    core.DataType.FP16: 2,
+    core.DataType.FP32: 4,
+    core.DataType.FP64: 8,
+    core.DataType.INT16: 2,
+    core.DataType.INT32: 4,
+    core.DataType.INT64: 8,
+    core.DataType.BOOL: 1
+}
 
 
 class ControlFlowGraph(object):
@@ -28,18 +39,33 @@ class ControlFlowGraph(object):
         block_size = program_desc.num_blocks()
 
         # TODO(qijun) handle Program with if/while operators
-        self.global_block = program_desc.block(0)
-        self.op_size = self.global_block.op_size()
+        self.global_block_desc = program_desc.block(0)
+        self.op_size = self.global_block_desc.op_size()
 
         op_node_connections = [(i, i + 1) for i in range(self.op_size - 1)]
         self._add_connections(op_node_connections)
 
-        self.ops = [self.global_block.op(i) for i in range(self.op_size)]
+        self.ops = [self.global_block_desc.op(i) for i in range(self.op_size)]
 
         for i in range(self.op_size):
             self._uses[i].update(self.ops[i].input_arg_names())
             self._defs[i].update(self.ops[i].output_arg_names())
 
+    def _update_graph(self, old_name, new_name, begin_idx=0):
+        """Rename `old_name` to `new_name` in the per-op name sets.
+
+        Ops with index in [begin_idx, op_size) have the name replaced in
+        their use, def, live-in and live-out sets; earlier ops are untouched.
+        """
+        for i in range(begin_idx, self.op_size):
+            # Each set is renamed in place, so a name leaving live-in is
+            # re-added to live-in (not to live-out).
+            for names in (self._uses[i], self._defs[i], self._live_in[i],
+                          self._live_out[i]):
+                if old_name in names:
+                    names.remove(old_name)
+                    names.add(new_name)
+
     def _reach_fixed_point(self, live_in, live_out):
         if len(live_in) != len(self._live_in):
             return False
@@ -79,30 +105,47 @@ class ControlFlowGraph(object):
         self.pool = []
         for i in range(self.op_size):
             if self.pool:
-                out_pair = [(x, self.global_block.var(str(x)).shape())
+                out_pair = [(x, self.global_block_desc.var(str(x)).shape())
                             for x in self._defs[i]]
                 for x, x_shape in out_pair:
-                    for index, cache_pair in enumerate(self.pool):
-                        cache_var = cache_pair[0]
-                        cache_shape = cache_pair[1]
-                        if x_shape == cache_shape:
-                            print(
-                                "Hit Cache !!!! cache pool index is %d, var name is %s, cached var name is %s, var shape is %s "
-                                % (index, x, cache_var, str(cache_shape)))
-                            self.pool.pop(index)
-                            _rename_arg_(self.ops, x, cache_var, begin_idx=i)
-                            self._dataflow_analyze()
-                            break
+                    if not self.global_block_desc.var(str(x)).persistable():
+                        for index, cache_pair in enumerate(self.pool):
+                            cache_var = cache_pair[0]
+                            cache_shape = cache_pair[1]
+                            if x_shape == cache_shape:
+                                x_dtype = self.global_block_desc.var(str(
+                                    x)).dtype()
+                                cache_dtype = self.global_block_desc.var(
+                                    str(cache_var)).dtype()
+                                # TODO(qijun): actually, we should compare dtype_to_size[x_dtype]
+                                # and dtype_to_size[cache_dtype]
+                                if x_dtype == cache_dtype:
+                                    print(
+                                        ("Hit Cache !!!! cache pool index "
+                                         "is %d, var name is %s, "
+                                         "cached var name is %s, "
+                                         "var shape is %s ") %
+                                        (index, x, cache_var, str(cache_shape)))
+                                    self.pool.pop(index)
+                                    _rename_arg_(
+                                        self.ops, x, cache_var, begin_idx=i)
+                                    self._program.current_block().var(str(
+                                        x)).desc = self.global_block_desc.var(
+                                            str(cache_var))
+                                    self._update_graph(
+                                        x, cache_var, begin_idx=i)
+                                    break
 
             in_diff, out_diff = self._get_diff(self._live_in[i],
                                                self._live_out[i])
             can_optimize = filter(
-                lambda x: not self.global_block.var(str(x)).persistable(),
+                lambda x: not self.global_block_desc.var(str(x)).persistable(),
                 in_diff)
             if can_optimize:
                 for var_name in can_optimize:
-                    self.pool.append((
-                        var_name, self.global_block.var(str(var_name)).shape()))
+                    self.pool.append(
+                        (var_name,
+                         self.global_block_desc.var(str(var_name)).shape()))
 
     def get_program(self):
         return self._program
diff --git a/python/paddle/v2/fluid/nets.py b/python/paddle/v2/fluid/nets.py
index 54886a8f2cc63474fe82290c0a12771b4cbdba72..47b550bf4d851a6c19fa88cc5fff2a7a0afc9bda 100644
--- a/python/paddle/v2/fluid/nets.py
+++ b/python/paddle/v2/fluid/nets.py
@@ -1,6 +1,9 @@
 import layers
 
-__all__ = ["simple_img_conv_pool", "sequence_conv_pool"]
+__all__ = [
+    "simple_img_conv_pool",
+    "sequence_conv_pool",
+]
 
 
 def simple_img_conv_pool(input,
diff --git a/python/paddle/v2/fluid/registry.py b/python/paddle/v2/fluid/registry.py
index 7aa82906114b355277185211134bb791e5dc43f9..94b16bca8c95e7d76377b1cd6e60532069fb452f 100644
--- a/python/paddle/v2/fluid/registry.py
+++ b/python/paddle/v2/fluid/registry.py
@@ -8,7 +8,11 @@ import proto.framework_pb2 as framework_pb2
 from framework import OpProtoHolder, Variable, Program, Operator
 from paddle.v2.fluid.layer_helper import LayerHelper, unique_name
 
-__all__ = ['deprecated', 'register_layer', 'autodoc']
+__all__ = [
+    'deprecated',
+    'register_layer',
+    'autodoc',
+]
 
 
 def _convert_(name):
@@ -80,11 +84,10 @@ def _generate_doc_string_(op_proto):
 
 
 def register_layer(op_type):
-    """
-    Register an Python layer for an Operator
+    """Register the Python layer for an Operator.
 
     Args:
-       op_type: The name of the operator to be created
+       op_type: The name of the operator to be created.
 
     This function takes in the operator type (sigmoid, mean , average etc) and
     creates the operator functionality.
@@ -98,16 +101,16 @@ def register_layer(op_type):
 
     if len(not_intermediate_outputs) != 1:
         raise ValueError("Only one non intermediate output operator can be",
-                         "automatically generated")
+                         "automatically generated.")
 
     if not_intermediate_outputs[0].duplicable:
         raise ValueError(
-            "Only non duplicable op can be automatically generated")
+            "Only non duplicable op can be automatically generated.")
 
     for output in intermediate_outputs:
         if output.duplicable:
             raise ValueError("The op can be automatically generated only when ",
-                             "all intermediate ops are not duplicable")
+                             "all intermediate ops are not duplicable.")
 
     o_name = not_intermediate_outputs[0].name
     intermediate_output_names = [output.name for output in intermediate_outputs]
diff --git a/python/paddle/v2/fluid/regularizer.py b/python/paddle/v2/fluid/regularizer.py
index d1955b00479676448d99603a31249aa7ac6a0d3f..117c45c49f14ab53db5a3a7b8360ba173cc87bf1 100644
--- a/python/paddle/v2/fluid/regularizer.py
+++ b/python/paddle/v2/fluid/regularizer.py
@@ -1,6 +1,10 @@
 import framework
 
-__all__ = ['append_regularization_ops', 'L1Decay', 'L2Decay']
+__all__ = [
+    'append_regularization_ops',
+    'L1Decay',
+    'L2Decay',
+]
 
 
 def append_regularization_ops(parameters_and_grads, regularization=None):
diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_dist_fit_a_line.py b/python/paddle/v2/fluid/tests/book_distribute/test_dist_fit_a_line.py
new file mode 100644
index 0000000000000000000000000000000000000000..bb339c440bd0d229d2ae348cf5a7745b16d156d5
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/test_dist_fit_a_line.py
@@ -0,0 +1,62 @@
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+import os
+
+x = fluid.layers.data(name='x', shape=[13], dtype='float32')
+
+y_predict = fluid.layers.fc(input=x, size=1, act=None)
+
+y = fluid.layers.data(name='y', shape=[1], dtype='float32')
+
+cost = fluid.layers.square_error_cost(input=y_predict, label=y)
+avg_cost = fluid.layers.mean(x=cost)
+
+sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
+optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+
+BATCH_SIZE = 20
+
+train_reader = paddle.batch(
+    paddle.reader.shuffle(
+        paddle.dataset.uci_housing.train(), buf_size=500),
+    batch_size=BATCH_SIZE)
+
+place = fluid.CPUPlace()
+feeder = fluid.DataFeeder(place=place, feed_list=[x, y])
+exe = fluid.Executor(place)
+
+t = fluid.DistributeTranspiler()
+# list of all parameter server endpoints, used for splitting parameters
+pserver_endpoints = os.getenv("PSERVERS")
+# server endpoint for current node
+current_endpoint = os.getenv("SERVER_ENDPOINT")
+# run as trainer or parameter server
+training_role = os.getenv("TRAINING_ROLE",
+                          "TRAINER")  # get the training role: trainer/pserver
+t.transpile(optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+if training_role == "PSERVER":
+    if not current_endpoint:
+        print("need env SERVER_ENDPOINT")
+        exit(1)
+    pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+    exe.run(fluid.default_startup_program())
+    exe.run(pserver_prog)
+else:
+    trainer_prog = t.get_trainer_program()
+
+    exe.run(fluid.default_startup_program())
+
+    PASS_NUM = 100
+    for pass_id in range(PASS_NUM):
+        fluid.io.save_persistables(exe, "./fit_a_line.model/")
+        fluid.io.load_persistables(exe, "./fit_a_line.model/")
+        for data in train_reader():
+            avg_loss_value, = exe.run(trainer_prog,
+                                      feed=feeder.feed(data),
+                                      fetch_list=[avg_cost])
+
+            if avg_loss_value[0] < 10.0:
+                exit(0)
+exit(1)
diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_dist_label_semantic_roles.py b/python/paddle/v2/fluid/tests/book_distribute/test_dist_label_semantic_roles.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fa5e0e5f34e6904e0e66d3ab4149cdfcffeb244
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/test_dist_label_semantic_roles.py
@@ -0,0 +1,225 @@
+import math
+
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.dataset.conll05 as conll05
+import paddle.v2.fluid as fluid
+import time
+import os
+
+word_dict, verb_dict, label_dict = conll05.get_dict()
+word_dict_len = len(word_dict)
+label_dict_len = len(label_dict)
+pred_len = len(verb_dict)
+
+mark_dict_len = 2
+word_dim = 32
+mark_dim = 5
+hidden_dim = 512
+depth = 8
+mix_hidden_lr = 1e-3
+
+IS_SPARSE = True
+PASS_NUM = 10
+BATCH_SIZE = 20
+
+embedding_name = 'emb'
+
+
+def load_parameter(file_name, h, w):
+    with open(file_name, 'rb') as f:
+        f.read(16)  # skip the 16-byte header; the rest is parsed as float32.
+        return np.fromfile(f, dtype=np.float32).reshape(h, w)
+
+
+def db_lstm(word, predicate, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, mark,
+            **ignored):
+    """Stack `depth` LSTM layers over the 8 embedded input features."""
+    predicate_embedding = fluid.layers.embedding(
+        input=predicate,
+        size=[pred_len, word_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE,
+        param_attr='vemb')
+
+    mark_embedding = fluid.layers.embedding(
+        input=mark,
+        size=[mark_dict_len, mark_dim],
+        dtype='float32',
+        is_sparse=IS_SPARSE)
+
+    word_input = [word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2]
+    emb_layers = [  # all use the non-trainable `embedding_name` parameter
+        fluid.layers.embedding(
+            size=[word_dict_len, word_dim],
+            input=x,
+            param_attr=fluid.ParamAttr(
+                name=embedding_name, trainable=False)) for x in word_input
+    ]
+    emb_layers.append(predicate_embedding)  # 6 words + predicate + mark = 8
+    emb_layers.append(mark_embedding)
+
+    hidden_0_layers = [
+        fluid.layers.fc(input=emb, size=hidden_dim) for emb in emb_layers
+    ]
+
+    hidden_0 = fluid.layers.sums(input=hidden_0_layers)
+
+    lstm_0 = fluid.layers.dynamic_lstm(
+        input=hidden_0,
+        size=hidden_dim,
+        candidate_activation='relu',
+        gate_activation='sigmoid',
+        cell_activation='sigmoid')
+
+    # stack more LSTMs; each layer mixes the previous layer's two outputs
+    input_tmp = [hidden_0, lstm_0]
+
+    for i in range(1, depth):
+        mix_hidden = fluid.layers.sums(input=[
+            fluid.layers.fc(input=input_tmp[0], size=hidden_dim),
+            fluid.layers.fc(input=input_tmp[1], size=hidden_dim)
+        ])
+
+        lstm = fluid.layers.dynamic_lstm(
+            input=mix_hidden,
+            size=hidden_dim,
+            candidate_activation='relu',
+            gate_activation='sigmoid',
+            cell_activation='sigmoid',
+            is_reverse=((i % 2) == 1))  # odd layers run reversed
+
+        input_tmp = [mix_hidden, lstm]
+
+    feature_out = fluid.layers.sums(input=[  # both sized to label_dict_len
+        fluid.layers.fc(input=input_tmp[0], size=label_dict_len),
+        fluid.layers.fc(input=input_tmp[1], size=label_dict_len)
+    ])
+
+    return feature_out
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    # define network topology
+    word = fluid.layers.data(
+        name='word_data', shape=[1], dtype='int64', lod_level=1)
+    predicate = fluid.layers.data(
+        name='verb_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n2 = fluid.layers.data(
+        name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_n1 = fluid.layers.data(
+        name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_0 = fluid.layers.data(
+        name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p1 = fluid.layers.data(
+        name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
+    ctx_p2 = fluid.layers.data(
+        name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
+    mark = fluid.layers.data(
+        name='mark_data', shape=[1], dtype='int64', lod_level=1)
+    feature_out = db_lstm(**locals())
+    target = fluid.layers.data(
+        name='target', shape=[1], dtype='int64', lod_level=1)
+    crf_cost = fluid.layers.linear_chain_crf(
+        input=feature_out,
+        label=target,
+        param_attr=fluid.ParamAttr(
+            name='crfw', learning_rate=mix_hidden_lr))
+    avg_cost = fluid.layers.mean(x=crf_cost)
+
+    # TODO(qiao)
+    # check other optimizers and check why out will be NAN
+    sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.0001)
+    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)
+
+    # TODO(qiao)
+    # add dependency track and move this config before optimizer
+    crf_decode = fluid.layers.crf_decoding(
+        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))
+
+    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
+        input=crf_decode,
+        label=target,
+        chunk_scheme="IOB",
+        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.conll05.test(), buf_size=8192),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    feeder = fluid.DataFeeder(
+        feed_list=[
+            word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate, mark, target
+        ],
+        place=place)
+    exe = fluid.Executor(place)
+
+    t = fluid.DistributeTranspiler()
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv(
+        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+        exe.run(fluid.default_startup_program())
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        trainer_prog = t.get_trainer_program()
+        start_time = time.time()
+        batch_id = 0
+        exe.run(fluid.default_startup_program())
+        embedding_param = fluid.global_scope().find_var(
+            embedding_name).get_tensor()
+        embedding_param.set(
+            load_parameter(conll05.get_embedding(), word_dict_len, word_dim),
+            place)
+        for pass_id in xrange(PASS_NUM):
+            chunk_evaluator.reset(exe)
+            for data in train_data():
+                cost, precision, recall, f1_score = exe.run(
+                    trainer_prog,
+                    feed=feeder.feed(data),
+                    fetch_list=[avg_cost] + chunk_evaluator.metrics)
+                pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
+                    exe)
+
+                if batch_id % 10 == 0:
+                    print("avg_cost:" + str(cost) + " precision:" + str(
+                        precision) + " recall:" + str(recall) + " f1_score:" +
+                          str(f1_score) + " pass_precision:" + str(
+                              pass_precision) + " pass_recall:" + str(
+                                  pass_recall) + " pass_f1_score:" + str(
+                                      pass_f1_score))
+                    if batch_id != 0:
+                        print("second per batch: " + str((time.time(
+                        ) - start_time) / batch_id))
+
+                batch_id = batch_id + 1
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/book_distribute/test_understand_sentiment_conv_dist.py b/python/paddle/v2/fluid/tests/book_distribute/test_understand_sentiment_conv_dist.py
new file mode 100644
index 0000000000000000000000000000000000000000..db419e23abcd06ca39011b1bef078b0cafb5100e
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/book_distribute/test_understand_sentiment_conv_dist.py
@@ -0,0 +1,110 @@
+from __future__ import print_function
+import os
+import numpy as np
+import paddle.v2 as paddle
+import paddle.v2.fluid as fluid
+
+
+def convolution_net(data, label, input_dim, class_dim=2, emb_dim=32,
+                    hid_dim=32):
+    emb = fluid.layers.embedding(input=data, size=[input_dim, emb_dim])
+    conv_3 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=3,
+        act="tanh",
+        pool_type="sqrt")
+    conv_4 = fluid.nets.sequence_conv_pool(
+        input=emb,
+        num_filters=hid_dim,
+        filter_size=4,
+        act="tanh",
+        pool_type="sqrt")
+    prediction = fluid.layers.fc(input=[conv_3, conv_4],
+                                 size=class_dim,
+                                 act="softmax")
+    cost = fluid.layers.cross_entropy(input=prediction, label=label)
+    avg_cost = fluid.layers.mean(x=cost)
+    adam_optimizer = fluid.optimizer.Adam(learning_rate=0.002)
+    optimize_ops, params_grads = adam_optimizer.minimize(avg_cost)
+    accuracy = fluid.evaluator.Accuracy(input=prediction, label=label)
+    return avg_cost, accuracy, accuracy.metrics[0], optimize_ops, params_grads
+
+
+def to_lodtensor(data, place):
+    seq_lens = [len(seq) for seq in data]
+    cur_len = 0
+    lod = [cur_len]
+    for l in seq_lens:
+        cur_len += l
+        lod.append(cur_len)
+    flattened_data = np.concatenate(data, axis=0).astype("int64")
+    flattened_data = flattened_data.reshape([len(flattened_data), 1])
+    res = fluid.LoDTensor()
+    res.set(flattened_data, place)
+    res.set_lod([lod])
+    return res
+
+
+def main():
+    BATCH_SIZE = 100
+    PASS_NUM = 5
+
+    word_dict = paddle.dataset.imdb.word_dict()
+    dict_dim = len(word_dict)
+    class_dim = 2
+
+    data = fluid.layers.data(
+        name="words", shape=[1], dtype="int64", lod_level=1)
+    label = fluid.layers.data(name="label", shape=[1], dtype="int64")
+    cost, accuracy, acc_out, optimize_ops, params_grads = convolution_net(
+        data, label, input_dim=dict_dim, class_dim=class_dim)
+
+    train_data = paddle.batch(
+        paddle.reader.shuffle(
+            paddle.dataset.imdb.train(word_dict), buf_size=1000),
+        batch_size=BATCH_SIZE)
+    place = fluid.CPUPlace()
+    exe = fluid.Executor(place)
+
+    t = fluid.DistributeTranspiler()
+
+    # all parameter server endpoints list for spliting parameters
+    pserver_endpoints = os.getenv("PSERVERS")
+    # server endpoint for current node
+    current_endpoint = os.getenv("SERVER_ENDPOINT")
+    # run as trainer or parameter server
+    training_role = os.getenv(
+        "TRAINING_ROLE", "TRAINER")  # get the training role: trainer/pserver
+    t.transpile(
+        optimize_ops, params_grads, pservers=pserver_endpoints, trainers=2)
+
+    exe.run(fluid.default_startup_program())
+
+    if training_role == "PSERVER":
+        if not current_endpoint:
+            print("need env SERVER_ENDPOINT")
+            exit(1)
+        pserver_prog = t.get_pserver_program(current_endpoint, optimize_ops)
+        exe.run(pserver_prog)
+    elif training_role == "TRAINER":
+        trainer_prog = t.get_trainer_program()
+        feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
+
+        for pass_id in xrange(PASS_NUM):
+            accuracy.reset(exe)
+            for data in train_data():
+                cost_val, acc_val = exe.run(trainer_prog,
+                                            feed=feeder.feed(data),
+                                            fetch_list=[cost, acc_out])
+                pass_acc = accuracy.eval(exe)
+                print("cost=" + str(cost_val) + " acc=" + str(acc_val) +
+                      " pass_acc=" + str(pass_acc))
+                if cost_val < 1.0 and pass_acc > 0.8:
+                    exit(0)
+    else:
+        print("environment var TRAINER_ROLE should be TRAINER os PSERVER")
+
+
+if __name__ == '__main__':
+    main()
diff --git a/python/paddle/v2/fluid/tests/test_assign_value_op.py b/python/paddle/v2/fluid/tests/test_assign_value_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..51b99d091825ab3edc2175202ae5d8a364a54378
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_assign_value_op.py
@@ -0,0 +1,40 @@
+import paddle.v2.fluid as fluid
+import paddle.v2.fluid.layers as layers
+import op_test
+import numpy
+import unittest
+import paddle.v2.fluid.framework as framework
+
+
+class TestAssignValueOp(op_test.OpTest):
+    def setUp(self):
+        self.op_type = "assign_value"
+        x = numpy.random.random(size=(2, 5)).astype(numpy.float32)
+        self.inputs = {}
+        self.outputs = {'Out': x}
+        self.attrs = {
+            'shape': x.shape,
+            'dtype': framework.convert_np_dtype_to_dtype_(x.dtype),
+            'fp32_values': [float(v) for v in x.flat]
+        }
+
+    def test_forward(self):
+        self.check_output()
+
+    def test_assign(self):
+        val = (
+            -100 + 200 * numpy.random.random(size=(2, 5))).astype(numpy.int32)
+        x = layers.create_tensor(dtype="float32")
+        layers.assign(input=val, output=x)
+        exe = fluid.Executor(fluid.CPUPlace())
+        fetched_x = exe.run(fluid.default_main_program(),
+                            feed={},
+                            fetch_list=[x])[0]
+        self.assertTrue(
+            numpy.array_equal(fetched_x, val),
+            "fetch_x=%s val=%s" % (fetched_x, val))
+        self.assertEqual(fetched_x.dtype, val.dtype)
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_parallel_op.py b/python/paddle/v2/fluid/tests/test_parallel_op.py
index 59ed041e7fa1dd68c0f8d610f2575886442d1b4d..2b51a1f50473d0728b8180772f42584797143b4e 100644
--- a/python/paddle/v2/fluid/tests/test_parallel_op.py
+++ b/python/paddle/v2/fluid/tests/test_parallel_op.py
@@ -1,45 +1,156 @@
 import unittest
-
-import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid as fluid
-from paddle.v2.fluid.framework import Program
-from paddle.v2.fluid.executor import Executor
-from paddle.v2.fluid.backward import append_backward
-import numpy as np
-import paddle.v2.fluid.core as core
-
-
-class ParallelOpTest(unittest.TestCase):
-    def setUp(self):
-        x = layers.data(
-            shape=[-1, 30, 40],
-            dtype='float32',
-            name='x',
-            append_batch_size=False,
-            stop_gradient=False)
-
-        places = layers.get_places(device_count=4)
-        pd = layers.ParallelDo(places=places)
-
-        with pd.do():
-            data = pd.read_input(x)
-            hidden = layers.fc(input=data, size=7)
-            pd.write_output(hidden)
-        data = pd()
-        loss = layers.mean(x=data)
-        sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.001)
-        sgd_optimizer.minimize(loss)
-
-        exe = fluid.Executor(fluid.CPUPlace())
-        exe.run(fluid.default_startup_program())
-        exe.run(fluid.default_main_program(),
-                feed={
-                    x.name: np.random.uniform(0.1, 0.6,
-                                              (20, 30, 40)).astype("float32")
-                })
-
-    def test_forward(self):
-        pass
+import numpy
+
+
+class BaseParallelForTest(unittest.TestCase):
+    def run_test(self, callback, feed, fetch):
+        """
+        Run the unittest for parallel.for
+        Args:
+            callback(callable): A callable function returns a generator. There 
+                are two yields in the generator function. The first yield 
+                returns the data layers, and the second yield returns the loss. 
+                The modified data variables will be sent back during the first 
+                yield.
+            
+            feed(dict): The executor feeding dictionary.
+            fetch(list|basestr): The fetch name lists. 
+
+        Returns:
+            None
+            
+        Raises:
+            AssertionError when the computation of cpu, parallel.for in cpu, 
+                gpu, parallel.for in gpu are different.
+
+        """
+        cpu = fluid.CPUPlace()
+        result_cpu = self._run_test_impl_(
+            callback=callback,
+            feed=feed,
+            fetch=fetch,
+            place=cpu,
+            use_parallel=False)
+        result_cpu_parallel = self._run_test_impl_(
+            callback=callback,
+            feed=feed,
+            fetch=fetch,
+            place=cpu,
+            use_parallel=True)
+        if fluid.core.is_compile_gpu():
+            gpu = fluid.CUDAPlace(0)
+            result_gpu = self._run_test_impl_(
+                callback=callback,
+                feed=feed,
+                fetch=fetch,
+                place=gpu,
+                use_parallel=False)
+            result_gpu_parallel = self._run_test_impl_(
+                callback=callback,
+                feed=feed,
+                fetch=fetch,
+                place=gpu,
+                use_parallel=True)
+            self._assert_same_(fetch, result_cpu, result_cpu_parallel,
+                               result_gpu, result_gpu_parallel)
+        else:
+            self._assert_same_(fetch, result_cpu, result_cpu_parallel)
+
+    def _run_test_impl_(self, callback, feed, fetch, place, use_parallel=False):
+        """
+        Run a single test, returns the fetch values
+        Args:
+            place(Place): the computation place. 
+            use_parallel(bool): Whether use parallel.for or not. 
+
+        Returns:
+            Fetched numpy arrays.
+
+        """
+        if isinstance(fetch, basestring):
+            fetch = [fetch]
+        main = fluid.Program()
+        startup = fluid.Program()
+        # Fix seed
+        main.random_seed = 10
+        startup.random_seed = 10
+
+        with fluid.program_guard(main, startup):
+            generator = callback()
+            # Automatically insert parallel do if use_parallel = True
+            if use_parallel:
+                places = fluid.layers.get_places()
+                pd = fluid.layers.ParallelDo(places)
+                data = next(generator)
+
+                if isinstance(data, fluid.Variable):
+                    data = [data]
+
+                with pd.do():
+                    ins = map(pd.read_input, data)
+                    if len(ins) == 1:
+                        ins = ins[0]
+                    loss = generator.send(ins)  # patch input
+                    pd.write_output(loss)
+
+                loss = pd()
+            else:
+                data = next(generator)
+                loss = generator.send(data)
+            self.assertIsNotNone(loss)
+            avg_loss = fluid.layers.mean(x=loss)
+            fluid.backward.append_backward(loss=avg_loss)
+
+        exe = fluid.Executor(place)
+        exe.run(startup)
+        return exe.run(main, feed=feed, fetch_list=fetch)
+
+    def _assert_same_(self, fetch, *args):
+        """
+        Assert the return values of `run_test` are same.
+        Args:
+            fetch: Fetch list. Used for print error message
+            *args: The fetch result lists of each situations.
+
+        Returns:
+            None
+            
+        Raises:
+            AssertionError
+
+        """
+
+        def _impl_(a, b, fetch_id, item_id):
+            item_str = ['CPU', 'ParallelCPU', 'GPU', 'ParallelGPU']
+            flag = numpy.allclose(a, b, rtol=0.1)
+            self.assertTrue(flag, "The {0} are different in {1}".format(
+                fetch[fetch_id], item_str[item_id]))
+
+        for i, items in enumerate(zip(*args)):
+            self.assertGreater(len(items), 0)
+            for j in range(1, len(items)):
+                _impl_(items[0], items[j], fetch_id=i, item_id=j)
+
+
+class ParallelOpTest(BaseParallelForTest):
+    def test_simple_fc(self):
+        def __network__():
+            x = fluid.layers.data(shape=[784], dtype='float32', name='img')
+            # FIXME: clear stop_gradient to work around a parallel.do bug
+            x.stop_gradient = False
+            x = yield x  # hand the input out; receive the one to build on
+            hidden = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
+            loss = fluid.layers.mean(x=hidden)
+            yield loss
+
+        self.run_test(
+            callback=__network__,
+            feed={
+                'img':
+                numpy.random.random(size=(128 * 3, 784)).astype('float32')
+            },
+            fetch='fc1.w@GRAD')
 
 
 if __name__ == '__main__':
diff --git a/python/paddle/v2/fluid/tests/test_print_op.py b/python/paddle/v2/fluid/tests/test_print_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..86a701a020fc197d69d113f82a4e5ac58f377179
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_print_op.py
@@ -0,0 +1,21 @@
+import unittest
+import numpy as np
+from paddle.v2.fluid.executor import Executor
+import paddle.v2.fluid.core as core
+import paddle.v2.fluid.layers as pd
+
+
+class TestSumOp(unittest.TestCase):
+    def test_tensor(self):
+        """Smoke test: attach Print to a zeros tensor and run the executor."""
+        i = pd.zeros(shape=[2, 10], dtype='float32')
+        pd.Print(i, message="I am a message", summarize=10)
+
+        cpu = core.CPUPlace()
+        exe = Executor(cpu)
+
+        exe.run()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/v2/fluid/tests/test_sequence_erase_op.py b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf257fefea0d98c6f4d9860dbac4ccedf59bcdd9
--- /dev/null
+++ b/python/paddle/v2/fluid/tests/test_sequence_erase_op.py
@@ -0,0 +1,35 @@
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+def sequence_erase(in_seq, lod0, tokens):
+    new_lod0 = [0]
+    out_seq = []
+    for i in range(0, len(lod0) - 1):
+        num_out = 0
+        for dat in in_seq[lod0[i]:lod0[i + 1]]:
+            if dat not in tokens:
+                out_seq.append(dat)
+                num_out += 1
+        new_lod0.append(new_lod0[-1] + num_out)
+    return np.array(out_seq).astype("int32"), new_lod0
+
+
+class TestSequenceEraseOp(OpTest):
+    def setUp(self):
+        self.op_type = "sequence_erase"
+        in_seq = np.random.randint(0, 10, (30, 1)).astype("int32")
+        lod = [[0, 9, 13, 24, 30]]  # four sequences spanning all 30 rows
+        tokens = [2, 3, 5]  # values to erase
+        out_seq, new_lod0 = sequence_erase(in_seq, lod[0], tokens)  # reference
+        self.attrs = {'tokens': tokens}
+        self.inputs = {'X': (in_seq, lod)}
+        self.outputs = {'Out': (out_seq, [new_lod0])}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/tools/manylinux1/Dockerfile.android b/tools/manylinux1/Dockerfile.android
new file mode 100644
index 0000000000000000000000000000000000000000..b6cae228a0c45ab70ba8ecc80ae4df7e0fa5bdbc
--- /dev/null
+++ b/tools/manylinux1/Dockerfile.android
@@ -0,0 +1,55 @@
+FROM ubuntu:16.04
+MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
+
+ARG UBUNTU_MIRROR
+RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi'
+
+# ENV variables
+ARG ANDROID_ABI
+ARG ANDROID_API
+
+ENV ANDROID_ABI=${ANDROID_ABI:-"armeabi-v7a"}
+ENV ANDROID_API=${ANDROID_API:-21}
+
+ENV HOME=/root \
+    ANDROID_NDK_HOME=/opt/android-ndk-linux \
+    ANDROID_TOOLCHAINS_DIR=/opt/toolchains
+
+RUN apt-get update && \
+    apt-get install -y \
+    git python-dev python-pip python-numpy \
+    wget curl tar unzip gcc g++ locales clang-format-3.8 swig cmake && \
+    apt-get clean -y
+
+# Install Go and create the GOPATH directory layout
+RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+RUN pip install --upgrade pip && \
+    pip install -U 'protobuf==3.1.0' && \
+    pip install -U wheel sphinx && \
+    pip install pre-commit
+
+# Android NDK
+RUN mkdir -p ${ANDROID_TOOLCHAINS_DIR} && \
+    mkdir -p /opt/android-ndk-tmp && \
+    cd /opt/android-ndk-tmp && \
+    wget -q https://dl.google.com/android/repository/android-ndk-r14b-linux-x86_64.zip && \
+    unzip -q android-ndk-r14b-linux-x86_64.zip && \
+    mv android-ndk-r14b ${ANDROID_NDK_HOME} && \
+    rm -rf /opt/android-ndk-tmp
+
+CMD ["bash", "/paddle/paddle/scripts/docker/build_android.sh"]
+
diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64
new file mode 100644
index 0000000000000000000000000000000000000000..2c6ba650a5d7996bef212e88a16f2a159ca377e7
--- /dev/null
+++ b/tools/manylinux1/Dockerfile.x64
@@ -0,0 +1,54 @@
+# NOTE The manylinux1 policy mandates CentOS-5. We replace it with CentOS-6 in
+# order to satisfy the build of capnproto library (a nupic.core dependency),
+# which requires some headers and symbols not present on CentOS-5 (e.g.,
+# signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See
+# https://github.com/sandstorm-io/capnproto/issues/350.
+FROM nvidia/cuda:<baseimg>
+MAINTAINER Numenta, based on the ManyLinux project
+
+ENV LC_ALL en_US.UTF-8
+ENV LANG en_US.UTF-8
+ENV LANGUAGE en_US.UTF-8
+ENV PATH /opt/rh/devtoolset-2/root/usr/bin:$PATH
+ENV LD_LIBRARY_PATH /opt/rh/devtoolset-2/root/usr/lib64:/opt/rh/devtoolset-2/root/usr/lib:/usr/local/lib64:/usr/local/lib:${LD_LIBRARY_PATH}
+ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig
+
+COPY build_scripts /build_scripts
+RUN bash build_scripts/build.sh && rm -r build_scripts
+
+ENV SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# for paddle
+RUN wget --no-check-certificate -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+
+
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+ENV PATH=${GOROOT}/bin:${GOPATH}/bin:${PATH}
+
+# protobuf 3.1.0
+RUN cd /opt && wget -q --no-check-certificate https://github.com/google/protobuf/releases/download/v3.1.0/protobuf-cpp-3.1.0.tar.gz && \
+    tar xzf protobuf-cpp-3.1.0.tar.gz && \
+    cd protobuf-3.1.0 && ./configure && make -j4 && make install && cd .. && rm -f protobuf-cpp-3.1.0.tar.gz
+
+
+RUN yum install -y sqlite-devel zlib-devel openssl-devel boost boost-devel pcre-devel vim tk-devel tkinter libtool
+
+RUN wget -O /root/requirements.txt https://raw.githubusercontent.com/PaddlePaddle/Paddle/develop/python/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install -r /root/requirements.txt && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install -r /root/requirements.txt && \
+    go get github.com/Masterminds/glide && \
+    rm -rf /root/requirements.txt
+
+RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \
+    LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python
+
+RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \
+    cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz
+
+RUN mkdir -p /src && cd /src && git clone https://github.com/NVIDIA/nccl.git nccl && cd nccl &&\
+    make -j `nproc` install <NCCL_MAKE_OPTS>  && cd .. && rm -rf nccl
diff --git a/tools/manylinux1/README.md b/tools/manylinux1/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..cb0a9ac22cda6fb6f585ab8fd95179573c760f28
--- /dev/null
+++ b/tools/manylinux1/README.md
@@ -0,0 +1,30 @@
+# buildtools
+
+We release PaddlePaddle and PaddlePaddle Fluid as shared libraries,
+which we hope can be released as wheel packages on PyPI, so we need
+to make sure that the build follows the
+[manylinux1](https://www.python.org/dev/peps/pep-0513/) standard.
+
+The manylinux standard suggests building Python modules on an old
+system, because a module would depend on some shared libraries
+anyway, and Linux's shared library standard states that libraries
+built with newer compiler versions cannot work with those built with
+older versions.  The suggested build environment is as old as CentOS 5.
+However, PaddlePaddle relies on CUDA, and the earliest version of
+[CentOS that works with CUDA is 6](https://hub.docker.com/r/nvidia/cuda/).
+So, here we provide a Docker image based on CentOS 6 and CUDA for
+building PaddlePaddle and making the release "as-manylinux as
+possible" or "sufficiently many Linux" according to [this
+discussion](https://mail.python.org/pipermail/wheel-builders/2016-July/000175.html).
+
+The build output of our Docker image includes multiple wheel files --
+some contain the CPU-only binary, some others support CUDA; some are
+compatible with the cp27m Python ABI, some others with cp27mu.
+
+To build these wheels, please run the following commands:
+
+```bash
+git clone https://github.com/paddlepaddle/paddle
+cd paddle/tools/manylinux1
+REPO=[yourrepo] ./build_all.sh
+```
diff --git a/tools/manylinux1/build_all.sh b/tools/manylinux1/build_all.sh
new file mode 100755
index 0000000000000000000000000000000000000000..097bedb5265d00f8aa362bb0272af633c97192ba
--- /dev/null
+++ b/tools/manylinux1/build_all.sh
@@ -0,0 +1,26 @@
+#!/bin/bash
+set -xe
+
+REPO="${REPO:-typhoon1986}"
+
+# NOTE: the CUDA/cuDNN version pairs (and their gencode flags) are hard-coded.
+sed 's/<baseimg>/7.5-cudnn5-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda7.5_cudnn5
+
+sed 's/<baseimg>/8.0-cudnn5-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn5
+
+sed 's/<baseimg>/8.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62"/g'> Dockerfile.tmp
+
+docker build -t ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda8.0_cudnn7
+
+sed 's/<baseimg>/9.0-cudnn7-devel-centos6/g' Dockerfile.x64 | \
+sed 's/<NCCL_MAKE_OPTS>/NVCC_GENCODE="-gencode=arch=compute_35,code=sm_35 -gencode=arch=compute_50,code=sm_50 -gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_60,code=sm_60 -gencode=arch=compute_60,code=compute_60 -gencode=arch=compute_61,code=sm_61 -gencode=arch=compute_62,code=sm_62 -gencode=arch=compute_70,code=sm_70"/g'> Dockerfile.tmp
+docker build -t ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7 -f Dockerfile.tmp .
+docker push ${REPO}/paddle_manylinux_devel:cuda9.0_cudnn7
diff --git a/tools/manylinux1/build_scripts/build.sh b/tools/manylinux1/build_scripts/build.sh
new file mode 100644
index 0000000000000000000000000000000000000000..93591fa9ddad8a78df344e1e912a5f1c7e93dfa4
--- /dev/null
+++ b/tools/manylinux1/build_scripts/build.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# Top-level build script called from Dockerfile
+
+# Stop at any error, show all commands
+set -ex
+
+# CPython versions to build (passed to build_cpythons from build_utils.sh).
+# NOTE Only need python 2.7.11 for nupic.core/nupic.bindings at this time, so
+# the others were removed to expedite build and reduce docker image size. The
+# original manylinux docker image project builds many python versions.
+# NOTE We added back 3.5.1, since auditwheel requires python 3.3+
+CPYTHON_VERSIONS="2.7.11 3.5.1"
+
+# Versions of the tools built from source below, together with the expected
+# sha256 hash of each downloaded file
+OPENSSL_ROOT=openssl-1.0.2l
+OPENSSL_HASH=ce07195b659e75f4e1db43552860070061f156a98bb37b672b101ba6e3ddf30c
+EPEL_RPM_HASH=e5ed9ecf22d0c4279e92075a64c757ad2b38049bcf5c16c4f2b75d5f6860dc0d
+DEVTOOLS_HASH=a8ebeb4bed624700f727179e6ef771dafe47651131a00a78b342251415646acc
+PATCHELF_HASH=d9afdff4baeacfbc64861454f368b7f2c15c44d245293f7587bbf726bfe722fb
+CURL_ROOT=curl-7.49.1
+CURL_HASH=eb63cec4bef692eab9db459033f409533e6d10e20942f4b060b32819e81885f1
+AUTOCONF_ROOT=autoconf-2.69
+AUTOCONF_HASH=954bd69b391edc12d6a4a51a2dd1476543da5c6bbf05a95b59dc0dd6fd4c2969
+
+# Dependencies for compiling Python that we want to remove from
+# the final image after compiling Python
+PYTHON_COMPILE_DEPS="zlib-devel bzip2-devel ncurses-devel sqlite-devel readline-devel tk-devel gdbm-devel db4-devel libpcap-devel xz-devel"
+
+# Libraries that are allowed as part of the manylinux1 profile
+MANYLINUX1_DEPS="glibc-devel libstdc++-devel glib2-devel libX11-devel libXext-devel libXrender-devel  mesa-libGL-devel libICE-devel libSM-devel ncurses-devel"
+
+# Load the helper functions (build_*, check_sha256sum) from this script's dir
+MY_DIR=$(dirname "${BASH_SOURCE[0]}")
+source $MY_DIR/build_utils.sh
+
+# EPEL support: download and verify the release RPM (it is installed below)
+yum -y install wget curl
+curl -sLO https://dl.fedoraproject.org/pub/epel/6/x86_64/epel-release-6-8.noarch.rpm
+check_sha256sum epel-release-6-8.noarch.rpm $EPEL_RPM_HASH
+
+# Dev toolset (for LLVM and other projects requiring C++11 support)
+curl -sLO http://people.centos.org/tru/devtools-2/devtools-2.repo
+check_sha256sum devtools-2.repo $DEVTOOLS_HASH
+mv devtools-2.repo /etc/yum.repos.d/devtools-2.repo
+rpm -Uvh --replacepkgs epel-release-6*.rpm
+rm -f epel-release-6*.rpm
+
+# Development tools and libraries, plus the Python build dependencies
+yum -y install bzip2 make git patch unzip bison yasm diffutils \
+    automake which file \
+    kernel-devel-`uname -r` \
+    devtoolset-2-binutils devtoolset-2-gcc \
+    devtoolset-2-gcc-c++ devtoolset-2-gcc-gfortran \
+    ${PYTHON_COMPILE_DEPS}
+
+# Install a more recent version of cmake, built from source.
+# NOTE Every step is its own statement instead of one long `&&` chain:
+# `set -e` ignores a failure of any command in an `&&` list except the last.
+wget -q https://cmake.org/files/v3.5/cmake-3.5.2.tar.gz
+tar xzf cmake-3.5.2.tar.gz
+(cd cmake-3.5.2 && ./bootstrap && make -j4 && make install)
+# Remove the tarball and the build tree so neither ends up in the image
+rm -rf cmake-3.5.2 cmake-3.5.2.tar.gz
+
+
+# Build and install autoconf from source (version set in AUTOCONF_ROOT)
+build_autoconf $AUTOCONF_ROOT $AUTOCONF_HASH
+autoconf --version
+
+# Compile the Python releases listed in CPYTHON_VERSIONS.
+# (In order to have a proper SSL module, Python is compiled
+# against a recent openssl [see env vars above], which is linked
+# statically. We delete openssl afterwards.)
+build_openssl $OPENSSL_ROOT $OPENSSL_HASH
+mkdir -p /opt/python
+build_cpythons $CPYTHON_VERSIONS
+
+PY35_BIN=/opt/python/cp35-cp35m/bin
+# NOTE Since our custom manylinux image builds pythons with shared
+# libpython, libpython's dir must be on LD_LIBRARY_PATH before running python;
+# the pip invocations below also pass it explicitly as a per-command prefix.
+ORIGINAL_LD_LIBRARY_PATH="${LD_LIBRARY_PATH}"
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib"
+
+# Our openssl doesn't know how to find the system CA trust store
+#   (https://github.com/pypa/manylinux/issues/53)
+# And it's not clear how up-to-date that is anyway
+# So let's just use the same one pip and everyone uses
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install certifi
+ln -s $($PY35_BIN/python -c 'import certifi; print(certifi.where())') \
+      /opt/_internal/certs.pem
+# If you modify this line you also have to modify the versions in the
+# Dockerfiles:
+export SSL_CERT_FILE=/opt/_internal/certs.pem
+
+# Build curl from source, then delete its headers and libraries again
+build_curl $CURL_ROOT $CURL_HASH
+rm -rf /usr/local/include/curl /usr/local/lib/libcurl* /usr/local/lib/pkgconfig/libcurl.pc
+hash -r
+curl --version
+curl-config --features
+
+# Now we can delete our built SSL
+rm -rf /usr/local/ssl
+
+# Install patchelf (latest with unreleased bug fixes)
+curl -sLO https://nipy.bic.berkeley.edu/manylinux/patchelf-0.9njs2.tar.gz
+check_sha256sum patchelf-0.9njs2.tar.gz $PATCHELF_HASH
+tar -xzf patchelf-0.9njs2.tar.gz
+(cd patchelf-0.9njs2 && ./configure && make && make install)
+rm -rf patchelf-0.9njs2.tar.gz patchelf-0.9njs2
+
+# Install latest pypi release of auditwheel and expose it on the default PATH
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname ${PY35_BIN})/lib" $PY35_BIN/pip install auditwheel
+ln -s $PY35_BIN/auditwheel /usr/local/bin/auditwheel
+
+# Clean up development headers and other unnecessary stuff for
+# final image
+yum -y erase wireless-tools gtk2 libX11 hicolor-icon-theme \
+    avahi freetype bitstream-vera-fonts \
+    ${PYTHON_COMPILE_DEPS}  > /dev/null 2>&1
+yum -y install ${MANYLINUX1_DEPS}
+yum -y clean all > /dev/null 2>&1
+yum list installed
+# we don't need libpython*.a, and they're many megabytes
+find /opt/_internal -name '*.a' -print0 | xargs -0 rm -f
+# Strip what we can -- and ignore errors, because this just attempts to strip
+# *everything*, including non-ELF files:
+find /opt/_internal -type f -print0 \
+    | xargs -0 -n1 strip --strip-unneeded 2>/dev/null || true
+# We do not need the Python test suites, or indeed the precompiled .pyc and
+# .pyo files. Without the outer parentheses -print0 would bind to the last
+# alternative only; directories additionally need -depth and `rm -rf`.
+find /opt/_internal -depth \
+     \( \( -type d -a \( -name test -o -name tests \) \) \
+  -o \( -type f -a \( -name '*.pyc' -o -name '*.pyo' \) \) \) \
+  -print0 | xargs -0 rm -rf
+
+for PYTHON in /opt/python/*/bin/python; do
+    # Point the library lookup path at this interpreter's own libpython dir
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib"
+
+    # Smoke test to make sure that our Pythons work, and do indeed detect as
+    # being manylinux compatible:
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/manylinux1-check.py
+    # Make sure that SSL cert checking works
+    LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}:$(dirname $(dirname ${PYTHON}))/lib" $PYTHON $MY_DIR/ssl-check.py
+done
+
+# Restore LD_LIBRARY_PATH to the value saved before the Python steps
+LD_LIBRARY_PATH="${ORIGINAL_LD_LIBRARY_PATH}"
diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh
new file mode 100755
index 0000000000000000000000000000000000000000..10422ae3bd00f4e0dd059af0384f8cc17e4b7855
--- /dev/null
+++ b/tools/manylinux1/build_scripts/build_utils.sh
@@ -0,0 +1,173 @@
+#!/bin/bash
+# Helper utilities for build
+
+PYTHON_DOWNLOAD_URL=https://www.python.org/ftp/python
+# XXX: the official https server at www.openssl.org could not be reached
+# with the old versions of openssl and curl in Centos 5.11, hence the ftp
+# mirror kept (disabled) on the next line; the https URL is the one in use:
+# OPENSSL_DOWNLOAD_URL=ftp://ftp.openssl.org/source
+OPENSSL_DOWNLOAD_URL=https://www.openssl.org/source
+# Mirror for the curl sources (build_curl verifies the archive's sha256)
+CURL_DOWNLOAD_URL=http://curl.askapache.com/download
+
+GET_PIP_URL=https://bootstrap.pypa.io/get-pip.py
+
+AUTOCONF_DOWNLOAD_URL=http://ftp.gnu.org/gnu/autoconf
+
+
+function check_var {
+    # Abort the script with status 1 when the value passed as $1 is empty.
+    [ -n "$1" ] && return 0
+    echo "required variable not defined"
+    exit 1
+}
+
+
+function lex_pyver {
+    # Print a Python version as three zero-padded, 3-digit fields so that
+    # two versions can be compared as plain integers, e.g.
+    # 3.2.1 -> 003002001
+    # 3     -> 003000000
+    printf '%s\n' "$1" | awk -F "." '{printf "%03d%03d%03d", $1, $2, $3}'
+}
+
+
+function do_cpython_build {
+    # Build CPython $1 from ./Python-$1.tgz into /opt/_internal and link it as
+    # /opt/python/<tag>. $2 is the unicode width (ucs2/ucs4) or "none".
+    local py_ver=$1
+    check_var $py_ver
+    local ucs_setting=$2
+    check_var $ucs_setting
+    tar -xzf Python-$py_ver.tgz
+    pushd Python-$py_ver
+    # Declare both up front so that neither branch leaks a global variable.
+    local unicode_flags="" dir_suffix=""
+    if [ "$ucs_setting" != "none" ]; then
+        unicode_flags="--enable-unicode=$ucs_setting"
+        dir_suffix="-$ucs_setting"
+    fi
+    local prefix="/opt/_internal/cpython-${py_ver}${dir_suffix}"
+    mkdir -p ${prefix}/lib
+    # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6
+    # NOTE --enable-shared for generating libpython shared library needed for
+    # linking of some of the nupic.core test executables.
+    CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null
+    make -j2 > /dev/null
+    make install > /dev/null
+    popd
+    echo "ZZZ looking for libpython"
+    # Look only under ${prefix}: scanning all of / is slow and noisy.
+    find ${prefix}/lib -name 'libpython*.so*'
+    rm -rf Python-$py_ver
+    # Some pythons install as bin/python3. Make them available as bin/python.
+    if [ -e ${prefix}/bin/python3 ]; then
+        ln -s python3 ${prefix}/bin/python
+    fi
+    # NOTE Make libpython shared library visible to python calls below
+    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python get-pip.py
+    LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/pip install wheel
+    local abi_tag  # assigned separately so `set -e` sees a failing tag script
+    abi_tag=$(LD_LIBRARY_PATH="${prefix}/lib" ${prefix}/bin/python ${MY_DIR}/python-tag-abi-tag.py)
+    ln -s ${prefix} /opt/python/${abi_tag}
+}
+
+function build_cpython {
+    # Fetch and build one CPython release; pre-3.3 gets ucs2 and ucs4 builds.
+    local py_ver=$1
+    check_var $py_ver
+    check_var $PYTHON_DOWNLOAD_URL
+    wget -q $PYTHON_DOWNLOAD_URL/$py_ver/Python-$py_ver.tgz
+    if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.3) ]; then
+        do_cpython_build $py_ver none
+    else
+        do_cpython_build $py_ver ucs2
+        do_cpython_build $py_ver ucs4
+    fi
+    rm -f Python-$py_ver.tgz
+}
+
+
+function build_cpythons {
+    # Build each CPython version passed as an argument. get-pip.py is
+    # downloaded once up front and deleted after the last build.
+    check_var $GET_PIP_URL
+    curl -sLO $GET_PIP_URL
+    for py_ver in $@; do build_cpython $py_ver; done
+    rm get-pip.py
+}
+
+
+function do_openssl_build {
+    # Run from inside the openssl source tree; installs to /usr/local/ssl.
+    ./config no-ssl2 no-shared -fPIC --prefix=/usr/local/ssl > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+function check_sha256sum {
+    # Verify that file $1 has the sha256 digest $2.
+    # The function's status is that of `sha256sum -c`, so a mismatch reaches
+    # the caller even without `set -e`, and no temporary file is left behind.
+    local fname=$1
+    check_var ${fname}
+    local sha256=$2
+    check_var ${sha256}
+
+    echo "${sha256}  ${fname}" | sha256sum -c -
+}
+
+function build_openssl {
+    # Fetch $1.tar.gz, check it against sha256 $2, then build and install it.
+    local openssl_fname=$1 openssl_sha256=$2
+    check_var ${openssl_fname}
+    check_var ${openssl_sha256}
+    check_var ${OPENSSL_DOWNLOAD_URL}
+    local tarball=${openssl_fname}.tar.gz
+    curl -sLO ${OPENSSL_DOWNLOAD_URL}/${tarball}
+    check_sha256sum ${tarball} ${openssl_sha256}
+    tar -xzf ${tarball}
+    (cd ${openssl_fname} && do_openssl_build)
+    rm -rf ${openssl_fname} ${tarball}
+}
+
+function do_curl_build {
+    # Run from inside the curl source tree; SSL enabled, no shared library.
+    LIBS=-ldl ./configure --with-ssl --disable-shared > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+function build_curl {
+    # Fetch $1.tar.bz2, check it against sha256 $2, then build and install it.
+    local curl_fname=$1 curl_sha256=$2
+    check_var ${curl_fname}
+    check_var ${curl_sha256}
+    check_var ${CURL_DOWNLOAD_URL}
+    local tarball=${curl_fname}.tar.bz2
+    curl -sLO ${CURL_DOWNLOAD_URL}/${tarball}
+    check_sha256sum ${tarball} ${curl_sha256}
+    tar -jxf ${tarball}
+    (cd ${curl_fname} && do_curl_build)
+    rm -rf ${curl_fname} ${tarball}
+}
+
+function do_standard_install {
+    # Plain configure / make / make install in the current directory.
+    ./configure > /dev/null
+    make > /dev/null
+    make install > /dev/null
+}
+
+function build_autoconf {
+    # Fetch $1.tar.gz, check it against sha256 $2, then build and install it.
+    local autoconf_fname=$1 autoconf_sha256=$2
+    check_var ${autoconf_fname}
+    check_var ${autoconf_sha256}
+    check_var ${AUTOCONF_DOWNLOAD_URL}
+    curl -sLO ${AUTOCONF_DOWNLOAD_URL}/${autoconf_fname}.tar.gz
+    check_sha256sum ${autoconf_fname}.tar.gz ${autoconf_sha256}
+    tar -zxf ${autoconf_fname}.tar.gz
+    (cd ${autoconf_fname} && do_standard_install)
+    rm -rf ${autoconf_fname} ${autoconf_fname}.tar.gz
+}
diff --git a/tools/manylinux1/build_scripts/manylinux1-check.py b/tools/manylinux1/build_scripts/manylinux1-check.py
new file mode 100644
index 0000000000000000000000000000000000000000..47fd3d673be662d2229480ee650dc3799301c31e
--- /dev/null
+++ b/tools/manylinux1/build_scripts/manylinux1-check.py
@@ -0,0 +1,56 @@
+# Logic copied from PEP 513
+
+
+def is_manylinux1_compatible():
+    """Return True if this platform is manylinux1 compatible (per PEP 513)."""
+    from distutils.util import get_platform
+    # Only Linux, and only x86-64 / i686
+    if get_platform() not in ("linux-x86_64", "linux-i686"):
+        return False
+
+    # A _manylinux module, when it provides the flag, has the final say.
+    try:
+        import _manylinux
+        override = bool(_manylinux.manylinux1_compatible)
+    except (ImportError, AttributeError):
+        override = None
+
+    # Otherwise check the glibc version. CentOS 5 uses glibc 2.5.
+    return have_compatible_glibc(2, 5) if override is None else override
+
+
+def have_compatible_glibc(major, minimum_minor):
+    """Return True if the running glibc is `major`.x with x >= `minimum_minor`;
+    False when not linked to glibc or when the version string is unparsable."""
+    import ctypes
+    import re
+
+    process_namespace = ctypes.CDLL(None)
+    try:
+        gnu_get_libc_version = process_namespace.gnu_get_libc_version
+    except AttributeError:
+        # Symbol doesn't exist -> therefore, we are not linked to glibc.
+        return False
+
+    # Call gnu_get_libc_version, which returns a string like "2.5".
+    gnu_get_libc_version.restype = ctypes.c_char_p
+    version_str = gnu_get_libc_version()
+    # py2 / py3 compatibility:
+    if not isinstance(version_str, str):
+        version_str = version_str.decode("ascii")
+
+    # Only the leading "MAJOR.MINOR" is used, so a version string carrying
+    # extra components or a vendor suffix no longer raises.
+    match = re.match(r"(\d+)\.(\d+)", version_str)
+    if match is None:
+        return False
+    return int(match.group(1)) == major and int(match.group(2)) >= minimum_minor
+
+# Report the verdict on stdout and mirror it in the exit status (0 or 1).
+import sys
+if is_manylinux1_compatible():
+    print("%s is manylinux1 compatible" % (sys.executable, ))
+    sys.exit(0)
+else:
+    print("%s is NOT manylinux1 compatible" % (sys.executable, ))
+    sys.exit(1)
diff --git a/tools/manylinux1/build_scripts/python-tag-abi-tag.py b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
new file mode 100644
index 0000000000000000000000000000000000000000..301fbf07a47fef03c91d9dd5f49c2894a5971319
--- /dev/null
+++ b/tools/manylinux1/build_scripts/python-tag-abi-tag.py
@@ -0,0 +1,7 @@
+# Utility script to print the python tag + the abi tag of the running Python.
+# See PEP 425 for exactly what these are, but an example would be:
+#   cp27-cp27mu
+# build_utils.sh uses the printed value to name the /opt/python/<tag> symlink.
+from wheel.pep425tags import get_abbr_impl, get_impl_ver, get_abi_tag
+
+print("{0}{1}-{2}".format(get_abbr_impl(), get_impl_ver(), get_abi_tag()))
diff --git a/tools/manylinux1/build_scripts/ssl-check.py b/tools/manylinux1/build_scripts/ssl-check.py
new file mode 100644
index 0000000000000000000000000000000000000000..a85d91978c510cccd366c174c317e6a3bdb589bd
--- /dev/null
+++ b/tools/manylinux1/build_scripts/ssl-check.py
@@ -0,0 +1,32 @@
+# cf. https://github.com/pypa/manylinux/issues/53
+# Smoke test: a valid certificate must be accepted, a self-signed one refused.
+GOOD_SSL = "https://google.com"
+BAD_SSL = "https://self-signed.badssl.com"
+
+import sys
+
+print("Testing SSL certificate checking for Python: %s" % (sys.version, ))
+# Certificates are only verified by default from 2.7.9 / 3.4.3 on (PEP 476).
+if sys.version_info < (2, 7, 9) or (3, 0) <= sys.version_info < (3, 4, 3):
+    print("This version never checks SSL certs; skipping tests")
+    sys.exit(0)
+
+if sys.version_info[0] >= 3:
+    from urllib.request import urlopen
+    EXC = OSError
+else:
+    from urllib import urlopen
+    EXC = IOError
+
+print("Connecting to %s should work" % (GOOD_SSL, ))
+urlopen(GOOD_SSL)
+print("...it did, yay.")
+
+print("Connecting to %s should fail" % (BAD_SSL, ))
+try:
+    urlopen(BAD_SSL)
+    # If we get here then we failed:
+    print("...it DIDN'T!!!!!11!!1one!")
+    sys.exit(1)
+except EXC:
+    print("...it did, yay.")